1 files changed, 691 insertions, 0 deletions
diff --git a/librdfa/curie.c b/librdfa/curie.c
new file mode 100644
index 0000000..98ee7e0
--- /dev/null
+++ b/librdfa/curie.c
@@ -0,0 +1,691 @@
+/**
+ * Copyright 2008 Digital Bazaar, Inc.
+ *
+ * This file is part of librdfa.
+ * 
+ * librdfa is Free Software, and can be licensed under any of the
+ * following three licenses:
+ * 
+ *   1. GNU Lesser General Public License (LGPL) V2.1 or any 
+ *      newer version
+ *   2. GNU General Public License (GPL) V2 or any newer version
+ *   3. Apache License, V2.0 or any newer version
+ * 
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ * 
+ * See LICENSE-* at the top of this software distribution for more
+ * information regarding the details of each license.
+ *
+ * The CURIE module is used to resolve all forms of CURIEs that
+ * XHTML+RDFa accepts.
+ *
+ * @author Manu Sporny
+ */
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_STRINGS_H
+#  include <strings.h>
+#endif
+#include <stdio.h>
+#include <ctype.h>
+#include "rdfa_utils.h"
+#include "rdfa.h"
+#include "strtok_r.h"
+
+/* The base XHTML vocab URL is used to resolve URIs that are reserved
+ * words. Any reserved listed above is appended to the URL below to
+ * form a complete IRI. */
+#define XHTML_VOCAB_URI "http://www.w3.org/1999/xhtml/vocab#"
+#define XHTML_VOCAB_URI_SIZE 35
+
+/**
+ * Gets the type of CURIE that is passed to it.
+ *
+ * @param uri the uri to check.
+ *
+ * @return either CURIE_TYPE_SAFE, CURIE_TYPE_URI or CURIE_TYPE_INVALID.
+ */
+static curie_t rdfa_get_curie_type(const char* uri)
+{
+   curie_t rval = CURIE_TYPE_INVALID;
+   
+   if(uri != NULL)
+   {
+      size_t uri_length = strlen(uri);
+
+      if((uri[0] == '[') && (uri[uri_length - 1] == ']'))
+      {
+         /* a safe curie starts with [ and ends with ] */
+         rval = CURIE_TYPE_SAFE;
+      }
+      else if(strstr(uri, ":") != NULL)
+      {
+         /* at this point, it is unknown whether or not the CURIE is
+          * an IRI or an unsafe CURIE */
+         rval = CURIE_TYPE_IRI_OR_UNSAFE;
+      }
+      else
+      {
+         /* if none of the above match, then the CURIE is probably a
+          * relative IRI */
+         rval = CURIE_TYPE_IRI_OR_UNSAFE;
+      }
+   }
+
+   return rval;
+}
+
+char* rdfa_resolve_uri(rdfacontext* context, const char* uri)
+{
+   char* rval = NULL;
+   char* path_start = NULL;
+   size_t base_length = strlen(context->base);
+   
+   if(strlen(uri) < 1)
+   {
+      /* if a blank URI is given, use the base context */
+      rval = rdfa_replace_string(rval, context->base);
+   }
+   else if(strstr(uri, ":") != NULL)
+   {
+      /* if a IRI is given, don't concatenate */
+      rval = rdfa_replace_string(rval, uri);
+   }
+   else if(uri[0] == '#' || uri[0] == '?')
+   {
+      /* if a fragment ID or start of a query parameter is given,
+       * concatenate it with the base URI */
+      rval = rdfa_join_string(context->base, uri);
+   }
+   else if(uri[0] == '/')
+   {
+      /* if a relative URI is given, but it starts with a '/', use the
+       * host part concatenated to the given URI */
+      char* tmp = NULL;
+      char* end_index = NULL;
+
+      /* initialize the working-set data */
+      tmp = rdfa_replace_string(tmp, context->base);
+      end_index = strchr(tmp, '/');
+
+
+      /* find the final '/' character after the host part of the context base. */
+      if(end_index != NULL)
+      {
+	 end_index = strchr(end_index + 1, '/');
+	
+	 if(end_index != NULL)
+	 {
+	    end_index = strchr(end_index + 1, '/');
+         }
+      }
+
+      /* if the '/' character after the host part was found, copy the host
+       * part and append the given URI to the URI, otherwise, append the
+       * host part and the URI part as-is, ensuring that a '/' exists at the
+       * end of the host part. */
+      if(end_index != NULL)
+      {
+         char* rval_copy;
+
+	 *end_index = '\0';
+	 
+	 /* if the '/' character after the host part was found, copy the host
+	  * part and append the given URI to the URI. */
+	 rval_copy = rdfa_replace_string(rval, tmp);
+	 rval = rdfa_join_string(rval_copy, uri);
+         free(rval_copy);
+      }
+      else
+      {
+	 /* append the host part and the URI part as-is, ensuring that a
+	  * '/' exists at the end of the host part. */
+ 	 size_t tlen = strlen(tmp) - 1;
+         char* rval_copy;
+
+	 rval_copy = rdfa_replace_string(rval, tmp);
+
+	 if(rval_copy[tlen] == '/')
+	 {
+	    rval_copy[tlen] = '\0';
+	 }
+	 rval = rdfa_join_string(rval_copy, uri);
+         free(rval_copy);
+      }
+
+      free(tmp);
+   }
+   else
+   {
+      if((char)context->base[base_length - 1] == '/')
+      {
+         /* if the base URI already ends in /, concatenate */
+         rval = rdfa_join_string(context->base, uri);
+      }
+      else
+      {
+         /* if we have a relative URI, chop off the name of the file
+          * and replace it with the relative pathname */
+         char* end_index = strrchr(context->base, '/');
+
+         if(end_index != NULL)
+         {
+            char* tmpstr = NULL;
+            char* end_index2;
+
+            tmpstr = rdfa_replace_string(tmpstr, context->base);
+            end_index2 = strrchr(tmpstr, '/');
+            if(end_index2 != NULL) {
+              end_index2++;
+              *end_index2 = '\0';
+            }
+
+            rval = rdfa_join_string(tmpstr, uri);
+            free(tmpstr);
+         }
+      }
+   }
+
+   /* It is possible that rval may be NULL here in OOM scenarios */
+   if(!rval)
+     return NULL;
+
+   /* Find the start of a scheme-based URL path */
+   path_start = (char*)strstr(rval, "://");
+   if(path_start != NULL)
+   {
+      if(strstr(path_start, "/.") != NULL)
+      {
+         path_start += 3;
+         path_start = strstr(path_start, "/");
+      }
+      else
+      {
+         path_start = NULL;
+      }
+   }
+
+   /* remove any dot-segments that remain in the URL for URLs w/ schemes */
+   if(path_start != NULL)
+   {
+      size_t rlen = strlen(rval) + 1;
+      size_t hlen = path_start - rval;
+      char* src = (char*)malloc(rlen + 4);
+      char* sptr = src + hlen;
+      char* dest = (char*)malloc(rlen + 1);
+      char* dptr = dest + hlen;
+      char* dfence = dptr;
+
+      memset(src, 0, rlen + 4);
+      memcpy(src, rval, rlen);
+      strncpy(dest, rval, hlen);
+
+      /* Process the path portion of the IRI */
+      while(sptr[0] != '?' && sptr[0] != '\0')
+      {
+         if(sptr[0] == '.' && sptr[1] == '.' && sptr[2] == '/')
+         {
+            /* A.  If the input buffer begins with a prefix of "../",
+             * then remove that prefix from the input buffer; otherwise,
+             */
+            sptr += 3;
+         }
+         else if(sptr[0] == '.' && sptr[1] == '/')
+         {
+            /* A.  If the input buffer begins with a prefix of "./",
+             * then remove that prefix from the input buffer; otherwise,
+             */
+            sptr += 2;
+         }
+         else if(sptr[0] == '/' && sptr[1] == '.' && sptr[2] == '/')
+         {
+            /* B.  if the input buffer begins with a prefix of "/./",
+             * then replace that prefix with "/" in the input buffer;
+             * otherwise,
+             */
+            sptr += 2;
+         }
+         else if(sptr[0] == '/' && sptr[1] == '.' && sptr[2] == '\0')
+         {
+            /* B.  if the input buffer begins with a prefix of "/.",
+             * where "." is a complete path segment, then replace that
+             * prefix with "/" in the input buffer; otherwise,
+             */
+            sptr += 1;
+            *sptr = '/';
+         }
+         else if(sptr[0] == '/' && sptr[1] == '.' && sptr[2] == '.' &&
+            ((sptr[3] == '/') || (sptr[3] == '\0')))
+         {
+            /* C.  if the input buffer begins with a prefix of "/../",
+             * then replace that prefix with "/" in the input buffer and
+             * remove the last segment and its preceding "/" (if any) from
+             * the output buffer; otherwise,
+             */
+            if(sptr[3] == '/')
+            {
+               sptr += 3;
+            }
+            else if(sptr[3] == '\0')
+            {
+               sptr += 2;
+               *sptr = '/';
+            }
+
+            /* remove the last segment and the preceding '/' */
+            if(dptr > dfence)
+            {
+               dptr--;
+               if(dptr[0] == '/')
+               {
+                  dptr--;
+               }
+            }
+            while(dptr >= dfence && dptr[0] != '/')
+            {
+               dptr--;
+            }
+            if(dptr >= dfence)
+            {
+               dptr[0] = '\0';
+            }
+            else
+            {
+               dptr = dfence;
+               dptr[0] = '\0';
+            }
+         }
+         else if(sptr[0] == '.' && sptr[1] == '\0')
+         {
+            /* D. if the input buffer consists only of ".", then remove
+             * that from the input buffer; otherwise,
+             */
+            sptr++;
+
+         }
+         else if(sptr[0] == '.' && sptr[1] == '.' && sptr[2] == '\0')
+         {
+            /* D. if the input buffer consists only of "..", then remove
+             * that from the input buffer; otherwise,
+             */
+            sptr += 2;
+         }
+         else
+         {
+            /* Copy the path segment */
+            do
+            {
+               *dptr++ = *sptr++;
+               *dptr = '\0';
+            } while(sptr[0] != '/' && sptr[0] != '?' && sptr[0] != '\0');
+         }
+      }
+
+      /* Copy the remaining query parameters */
+      if(sptr[0] == '?')
+      {
+         size_t rest_len = strlen(sptr);
+         memcpy(dptr, sptr, rest_len + 1);
+      }
+      else
+      {
+         dptr[0] = '\0';
+      }
+
+      free(rval);
+      free(src);
+      rval = dest;
+   }
+
+   return rval;
+}
+
+char* rdfa_resolve_curie(
+   rdfacontext* context, const char* uri, curieparse_t mode)
+{
+   char* rval = NULL;
+   curie_t ctype = rdfa_get_curie_type(uri);
+
+   if(!uri)
+      return NULL;
+
+   if(ctype == CURIE_TYPE_INVALID)
+   {
+      rval = NULL;
+   }
+   else if((ctype == CURIE_TYPE_IRI_OR_UNSAFE) &&
+           ((mode == CURIE_PARSE_HREF_SRC) ||
+            (context->rdfa_version == RDFA_VERSION_1_0 &&
+               mode == CURIE_PARSE_ABOUT_RESOURCE)))
+   {
+      /* If we are parsing something that can take either a CURIE or a
+       * URI, and the type is either IRI or UNSAFE, assume that it is
+       * an IRI */
+      rval = rdfa_resolve_uri(context, uri);
+   }
+
+   /*
+    * Check to see if the value is a term.
+    */
+   if(ctype == CURIE_TYPE_IRI_OR_UNSAFE && mode == CURIE_PARSE_PROPERTY)
+   {
+      const char* term_iri;
+      term_iri = (const char*)rdfa_get_mapping(context->term_mappings, uri);
+      if(term_iri != NULL)
+      {
+         rval = rdfa_strdup(term_iri);
+      }
+      else if(context->default_vocabulary == NULL && strstr(uri, ":") == NULL)
+      {
+         /* Generate the processor warning if this is a missing term */
+#define FORMAT_1 "The use of the '%s' term was unrecognized by the RDFa processor because it is not a valid term for the current Host Language."
+
+#ifdef LIBRDFA_IN_RAPTOR
+         raptor_parser_warning((raptor_parser*)context->callback_data, 
+                               FORMAT_1, uri);
+#else
+         char msg[1024];
+         snprintf(msg, 1024, FORMAT_1, uri);
+
+         rdfa_processor_triples(context, RDFA_PROCESSOR_WARNING, msg);
+#endif
+      }
+   }
+
+   /* if we are processing a safe CURIE OR
+    * if we are parsing an unsafe CURIE that is an @type_of,
+    * @datatype, @property, @rel, or @rev attribute, treat the curie
+    * as not an IRI, but an unsafe CURIE */
+   if(rval == NULL && ((ctype == CURIE_TYPE_SAFE) ||
+         ((ctype == CURIE_TYPE_IRI_OR_UNSAFE) &&
+          ((mode == CURIE_PARSE_INSTANCEOF_DATATYPE) ||
+           (mode == CURIE_PARSE_PROPERTY) ||
+           (mode == CURIE_PARSE_RELREV) ||
+           (context->rdfa_version == RDFA_VERSION_1_1 &&
+              mode == CURIE_PARSE_ABOUT_RESOURCE)))))
+   {
+      char* working_copy = NULL;
+      char* wcptr = NULL;
+      char* prefix = NULL;
+      char* curie_reference = NULL;
+      const char* expanded_prefix = NULL;
+      size_t uri_len = strlen(uri);
+      working_copy = (char*)malloc(uri_len + 1);
+      memcpy(working_copy, uri, uri_len + 1);/*rdfa_replace_string(working_copy, uri);*/
+
+      /* if this is a safe CURIE, chop off the beginning and the end */
+      if(ctype == CURIE_TYPE_SAFE)
+      {
+         prefix = strtok_r(working_copy, "[:]", &wcptr);
+         if(wcptr)
+            curie_reference = strtok_r(NULL, "[]", &wcptr);
+      }
+      else if(ctype == CURIE_TYPE_IRI_OR_UNSAFE)
+      {
+         prefix = strtok_r(working_copy, ":", &wcptr);
+         if(wcptr)
+            curie_reference = strtok_r(NULL, "", &wcptr);
+      }
+
+      /* fully resolve the prefix and get its length */
+
+      /* if a colon was found, but no prefix, use the XHTML vocabulary URI
+       * as the expanded prefix */
+      if((uri[0] == ':') || (strcmp(uri, "[:]") == 0))
+      {
+         expanded_prefix = XHTML_VOCAB_URI;
+         curie_reference = prefix;
+         prefix = NULL;
+      }
+      else if(uri[0] == ':')
+      {
+         /* FIXME: This looks like a bug - don't know why this code is
+          * in here. I think it's for the case where ":next" is
+          * specified, but the code's not checking that -- manu */
+         expanded_prefix = context->base;
+         curie_reference = prefix;
+         prefix = NULL;
+      }
+      else if(prefix != NULL)
+      {
+         if((mode != CURIE_PARSE_PROPERTY) &&
+            (mode != CURIE_PARSE_RELREV) &&
+            strcmp(prefix, "_") == 0)
+         {
+            /* if the prefix specifies this as a blank node, then we
+             * use the blank node prefix */
+            expanded_prefix = "_";
+         }
+         else
+         {
+            /* if the prefix was defined, get it from the set of URI mappings. */
+#ifdef LIBRDFA_IN_RAPTOR
+            if(!strcmp(prefix, "xml"))
+            {
+               expanded_prefix = RAPTOR_GOOD_CAST(const char*, raptor_xml_namespace_uri);
+            }
+            else
+            {
+               raptor_namespace *nspace;
+               raptor_uri* ns_uri;
+               nspace = raptor_namespaces_find_namespace(&context->sax2->namespaces,
+                                                         (const unsigned char*)prefix,
+                                                         (int)strlen(prefix));
+               if(nspace) {
+                  ns_uri = raptor_namespace_get_uri(nspace);
+                  if(ns_uri)
+                     expanded_prefix = (const char*)raptor_uri_as_string(ns_uri);
+               }
+            }
+#else
+            expanded_prefix =
+               rdfa_get_mapping(context->uri_mappings, prefix);
+
+            /* Generate the processor warning if the prefix was not found */
+            if(expanded_prefix == NULL && strstr(uri, ":") != NULL &&
+               strstr(uri, "://") == NULL)
+            {
+#define FORMAT_2 "The '%s' prefix was not found. You may want to check that it is declared before it is used, or that it is a valid prefix string."
+#ifdef LIBRDFA_IN_RAPTOR
+              raptor_parser_warning((raptor_parser*)context->callback_data, 
+                                    FORMAT_2, prefix);
+#else
+              char msg[1024];
+              snprintf(msg, 1024, FORMAT_2, prefix);
+
+               rdfa_processor_triples(context, RDFA_PROCESSOR_WARNING, msg);
+#endif
+            }
+#endif
+         }
+      }
+
+      if((expanded_prefix != NULL) && (curie_reference != NULL))
+      {
+         /* if the expanded prefix and the reference exist, generate the
+          * full IRI. */
+         if(strcmp(expanded_prefix, "_") == 0)
+         {
+            rval = rdfa_join_string("_:", curie_reference);
+         }
+         else
+         {
+            rval = rdfa_join_string(expanded_prefix, curie_reference);
+         }
+      }
+      else if((expanded_prefix != NULL) && (expanded_prefix[0] != '_') && 
+         (curie_reference == NULL))
+      {
+         /* if the expanded prefix exists, but the reference is null,
+          * generate the CURIE because a reference-less CURIE is still
+          * valid */
+ 	 rval = rdfa_join_string(expanded_prefix, "");
+      }
+
+      free(working_copy);
+   }
+
+   if(rval == NULL)
+   {
+      /* if we're NULL at this point, the CURIE might be the special
+       * unnamed bnode specified by _: */
+      if((strcmp(uri, "[_:]") == 0) || (strcmp(uri, "_:") == 0))
+      {
+         if(context->underscore_colon_bnode_name == NULL)
+         {
+            context->underscore_colon_bnode_name = rdfa_create_bnode(context);
+         }
+         rval = rdfa_replace_string(rval, context->underscore_colon_bnode_name);
+      }
+      /* if we're NULL at this point and the IRI isn't [], then this might be
+       * an IRI */
+      else if(context->rdfa_version == RDFA_VERSION_1_1 &&
+         (strcmp(uri, "[]") != 0))
+      {
+         if((context->default_vocabulary != NULL) &&
+            ((mode == CURIE_PARSE_PROPERTY) || (mode == CURIE_PARSE_RELREV) ||
+               (mode == CURIE_PARSE_INSTANCEOF_DATATYPE)) &&
+            (strstr(uri, ":") == NULL))
+         {
+            rval = rdfa_join_string(context->default_vocabulary, uri);
+         }
+         else if(((mode == CURIE_PARSE_PROPERTY) ||
+            (mode == CURIE_PARSE_ABOUT_RESOURCE) ||
+            (mode == CURIE_PARSE_INSTANCEOF_DATATYPE)) &&
+            (strstr(uri, "_:") == NULL) && (strstr(uri, "[_:") == NULL))
+         {
+            rval = rdfa_resolve_uri(context, uri);
+         }
+      }
+   }
+   
+   /* even though a reference-only CURIE is valid, it does not
+    * generate a triple in XHTML+RDFa. If we're NULL at this point,
+    * the given value wasn't valid in XHTML+RDFa. */
+   
+   return rval;
+}
+
+/**
+ * Resolves a given uri depending on whether or not it is a fully
+ * qualified IRI, a CURIE, or a short-form XHTML reserved word for
+ * @rel or @rev as defined in the XHTML+RDFa Syntax Document.
+ *
+ * @param context the current processing context.
+ * @param uri the URI part to process.
+ *
+ * @return the fully qualified IRI, or NULL if the conversion failed
+ *         due to the given URI not being a short-form XHTML reserved
+ *         word. The memory returned from this function MUST be freed.
+ */
+char* rdfa_resolve_relrev_curie(rdfacontext* context, const char* uri)
+{
+   char* rval = NULL;
+   const char* resource = uri;
+
+   /* check to make sure the URI doesn't have an empty prefix */
+   if(uri[0] == ':')
+   {
+      resource++;
+   }
+
+   /* override reserved words if there is a default vocab defined
+    * NOTE: Don't have to check for RDFa 1.1 mode because vocab is only defined
+    * in RDFa 1.1 */
+   if(context->default_vocabulary != NULL)
+   {
+      rval = rdfa_resolve_curie(context, uri, CURIE_PARSE_RELREV);
+   }
+   else if(context->host_language == HOST_LANGUAGE_XHTML1)
+   {
+      /* search all of the XHTML @rel/@rev reserved words for a
+       * case-insensitive match against the given URI */
+      char* term = rdfa_strdup(resource);
+      char* ptr = NULL;
+
+      for(ptr = term; *ptr; ptr++)
+      {
+        *ptr = RAPTOR_GOOD_CAST(char, tolower(*ptr));
+      }
+
+      rval = (char*)rdfa_get_mapping(context->term_mappings, term);
+      if(rval != NULL)
+      {
+         rval = rdfa_strdup(rval);
+      }
+      free(term);
+   }
+   else
+   {
+      /* Search the term mappings for a match */
+      rval = (char*)rdfa_get_mapping(context->term_mappings, resource);
+      if(rval != NULL)
+      {
+         rval = rdfa_strdup(rval);
+      }
+   }
+
+   /* if a search against the registered terms failed,
+    * attempt to resolve the value as a standard CURIE */
+   if(rval == NULL)
+   {
+      rval = rdfa_resolve_curie(context, uri, CURIE_PARSE_RELREV);
+   }
+
+   /* if a CURIE wasn't found, attempt to resolve the value as an IRI */
+   if(rval == NULL && (context->rdfa_version == RDFA_VERSION_1_1))
+   {
+      rval = rdfa_resolve_uri(context, uri);
+   }
+   
+   return rval;
+}
+
+rdfalist* rdfa_resolve_curie_list(
+   rdfacontext* rdfa_context, const char* uris, curieparse_t mode)
+{
+   rdfalist* rval = rdfa_create_list(3);
+   char* working_uris = NULL;
+   char* uptr = NULL;
+   char* ctoken = NULL;
+   working_uris = rdfa_replace_string(working_uris, uris);
+
+   /* go through each item in the list of CURIEs and resolve each */
+   ctoken = strtok_r(working_uris, RDFA_WHITESPACE, &uptr);
+   
+   while(ctoken != NULL)
+   {
+      char* resolved_curie = NULL;
+
+      if((mode == CURIE_PARSE_INSTANCEOF_DATATYPE) ||
+         (mode == CURIE_PARSE_ABOUT_RESOURCE) ||
+         (mode == CURIE_PARSE_PROPERTY))
+      {
+         resolved_curie =
+            rdfa_resolve_curie(rdfa_context, ctoken, mode);
+      }
+      else if(mode == CURIE_PARSE_RELREV)
+      {
+         resolved_curie =
+            rdfa_resolve_relrev_curie(rdfa_context, ctoken);
+      }
+
+      /* add the CURIE if it was a valid one */
+      if(resolved_curie != NULL)
+      {
+         rdfa_add_item(rval, resolved_curie, RDFALIST_FLAG_TEXT);
+         free(resolved_curie);
+      }
+      
+      ctoken = strtok_r(NULL, RDFA_WHITESPACE, &uptr);
+   }
+   
+   free(working_uris);
+
+   return rval;
+}