/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_rfc2396.c - Raptor URI resolving from RFC2396 and RFC3986 * * Copyright (C) 2004-2009, David Beckett http://www.dajobe.org/ * Copyright (C) 2004-2004, University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #ifdef HAVE_STDLIB_H #include #endif /* Raptor includes */ #include "raptor2.h" #include "raptor_internal.h" #ifndef STANDALONE /** * raptor_new_uri_detail: * @uri_string: The URI string to split * * Create a URI detailed structure from a URI string. * **/ raptor_uri_detail* raptor_new_uri_detail(const unsigned char *uri_string) { const unsigned char *s = NULL; unsigned char *b = NULL; raptor_uri_detail *ud; size_t uri_len; if(!uri_string) return NULL; uri_len = strlen((const char*)uri_string); /* The extra +5 is for the 5 \0s that may be added for each component * even if the entire URI is empty */ ud = RAPTOR_CALLOC(raptor_uri_detail*, 1, sizeof(*ud) + uri_len + 5 + 1); if(!ud) return NULL; ud->uri_len = uri_len; ud->buffer = (unsigned char*)((unsigned char*)ud + sizeof(raptor_uri_detail)); s = uri_string; b = ud->buffer; /* Split the URI into it's syntactic components */ /* * scheme is checked in more detail since it is important * to recognise absolute URIs for resolving, and it is easy to do. * * scheme = alpha *( alpha | digit | "+" | "-" | "." ) * RFC 2396 section 3.1 Scheme Component */ if(*s && isalpha((int)*s)) { s++; while(*s && (isalnum((int)*s) || (*s == '+') || (*s == '-') || (*s == '.'))) s++; if(*s == ':') { /* it matches the URI scheme grammar, so store this as a scheme */ ud->scheme = b; ud->scheme_len = s-uri_string; while(*uri_string != ':') *b++ = *uri_string++; *b++ = '\0'; /* and move past the : */ s++; } else s = uri_string; } /* authority */ if(*s && s[1] && *s == '/' && s[1] == '/') { ud->authority = b; s += 2; /* skip "//" */ while(*s && *s != '/' && *s != '?' && *s != '#') *b++ = *s++; ud->authority_len = b-ud->authority; *b++ = '\0'; } /* path */ if(*s && *s != '?' && *s != '#') { ud->path = b; while(*s && *s != '?' && *s != '#') *b++ = *s++; ud->path_len = b-ud->path; *b++ = '\0'; } /* query */ if(*s && *s == '?') { ud->query = b; s++; while(*s && *s != '#') *b++ = *s++; ud->query_len = b-ud->query; *b++ = '\0'; } /* fragment identifier - RFC2396 Section 4.1 */ if(*s && *s == '#') { ud->fragment = b; s++; while(*s) *b++ = *s++; ud->fragment_len = b-ud->fragment; *b='\0'; } ud->is_hierarchical = (ud->path && *ud->path == '/'); return ud; } void raptor_free_uri_detail(raptor_uri_detail* uri_detail) { /* Also frees the uri_detail->buffer allocated in raptor_uri_parse() */ RAPTOR_FREE(raptor_uri_detail, uri_detail); } unsigned char* raptor_uri_detail_to_string(raptor_uri_detail *ud, size_t* len_p) { size_t len = 0; unsigned char *buffer, *p; if(ud->scheme) len+= ud->scheme_len+1; /* : */ if(ud->authority) len+= 2 + ud->authority_len; /* // */ if(ud->path) len+= ud->path_len; if(ud->fragment) len+= 1 + ud->fragment_len; /* # */ if(ud->query) len+= 1 + ud->query_len; /* ? */ if(len_p) *len_p=len; buffer = RAPTOR_MALLOC(unsigned char*, len + 1); if(!buffer) return NULL; p = buffer; if(ud->scheme) { unsigned char *src = ud->scheme; while(*src) *p++ = *src++; *p++ = ':'; } if(ud->authority) { unsigned char *src = ud->authority; *p++ = '/'; *p++ = '/'; while(*src) *p++ = *src++; } if(ud->path) { unsigned char *src = ud->path; while(*src) *p++ = *src++; } if(ud->fragment) { unsigned char *src = ud->fragment; *p++ = '#'; while(*src) *p++ = *src++; } if(ud->query) { unsigned char *src = ud->query; *p++ = '?'; while(*src) *p++ = *src++; } *p='\0'; return buffer; } /* * raptor_uri_normalize_path: * @path_buffer: URI/file path * @path_len: length of above * * INTERNAL - normalize a URI path (POSIX path too) * * Return value: new path length or 0 on failure */ size_t raptor_uri_normalize_path(unsigned char* path_buffer, size_t path_len) { unsigned char *p, *cur, *prev, *s; unsigned char last_char; #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 2 RAPTOR_DEBUG3("Input path \"%s\" (%zu)\n", (const char*)path_buffer, path_len); #endif /* remove all "./" path components */ for(p = (prev = path_buffer); *p; p++) { if(*p != '/') continue; if(p == (prev+1) && *prev == '.') { unsigned char *dest = prev; p++; while(*p) *dest++ = *p++; *dest= '\0'; p = prev; path_len -= 2; if(!*p) break; } else { prev = p+1; } } if(p == (prev+1) && *prev == '.') { /* Remove "." at the end of a path */ *prev = '\0'; path_len--; } #if defined(RAPTOR_DEBUG) if(path_len != strlen((const char*)path_buffer)) RAPTOR_FATAL4("Path '%s' length %ld does not match calculated %ld.", (const char*)path_buffer, (long)strlen((const char*)path_buffer), (long)path_len); #endif /* Remove all "/../" path components */ /* * The pointers: * /../ * prev-^ cur-^ * and p points to the previous prev (can be NULL) */ prev = NULL; cur = NULL; p = NULL; last_char='\0'; for(s = path_buffer; *s; last_char=*s++) { /* find the path components */ if(*s != '/') { /* If it is the start or following a /, record a new path component */ if(!last_char || last_char == '/') { /* Store 2 path components */ if(!prev) prev = s; else if(!cur) cur = s; } continue; } /* Wait till there are two path components */ if(!prev || !cur) continue; #if defined(RAPTOR_DEBUG) if(path_len != strlen((const char*)path_buffer)) RAPTOR_FATAL3("Path length %ld does not match calculated %ld.", (long)strlen((const char*)path_buffer), (long)path_len); #endif /* If the current one is '..' */ if(s == (cur+2) && cur[0] == '.' && cur[1] == '.') { /* and if the previous one isn't '..' * (which means it is beyond the root such as a path "/foo/../..") */ if(cur != (prev+3) || prev[0] != '.' || prev[1] != '.') { unsigned char *dest = prev; /* remove the /../ * prev-^ cur-^ ^-s */ size_t len = s-prev+1; /* length of path component we are removing */ s++; while(*s) *dest++ = *s++; *dest = '\0'; path_len -= len; if(p && p < prev) { /* We know the previous prev path component and we didn't do * two adjustments in a row, so can adjust the * pointers to continue the newly shortened path: * s to the / before (autoincremented by the loop) * prev to the previous prev path component * cur to NULL. Will be set by the next loop iteration since s * points to a '/', last_char will be set to *s. */ s = prev-1; prev = p; cur = NULL; p = NULL; } else { /* Otherwise must start from the beginning again */ prev = NULL; cur = NULL; p = NULL; s = path_buffer; } } } else { /* otherwise this is not a special path component so * shift the path components stack */ p = prev; prev = cur; cur = NULL; } } if(prev && s == (cur+2) && cur[0] == '.' && cur[1] == '.') { /* Remove /.. at the end of the path */ *prev = '\0'; path_len -= (s-prev); } #if defined(RAPTOR_DEBUG) if(path_len != strlen((const char*)path_buffer)) RAPTOR_FATAL3("Path length %ld does not match calculated %ld.", (long)strlen((const char*)path_buffer), (long)path_len); #endif /* RFC3986 Appendix C.2 / 5.4.2 Abnormal Examples * Remove leading /../ and /./ */ for(p = path_buffer; p; ) { if(!strncmp((const char *)p, "/../", 4)) { path_len -= 3; memmove(p, p+3, path_len+1); } else if(!strncmp((const char *)p, "/./", 3)) { path_len -= 2; memmove(p, p+2, path_len+1); } else break; } #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 2 fprintf(stderr, " Normalized path \"%s\" (%zu)\n", path_buffer, path_len); #endif return path_len; } /** * raptor_uri_resolve_uri_reference: * @base_uri: Base URI string * @reference_uri: Reference URI string * @buffer: Destination URI output buffer * @length: Length of destination output buffer * * Resolve a URI against a base URI to create a new absolute URI. * * Return value: length of resolved string or 0 on failure (such as @buffer too small) **/ size_t raptor_uri_resolve_uri_reference(const unsigned char *base_uri, const unsigned char *reference_uri, unsigned char *buffer, size_t length) { raptor_uri_detail *ref = NULL; raptor_uri_detail *base = NULL; raptor_uri_detail result; /* static - pointers go to inside ref or base */ unsigned char *path_buffer = NULL; unsigned char *p; size_t result_len = 0; size_t l; #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 2 RAPTOR_DEBUG4("base uri='%s', reference_uri='%s, buffer size %d\n", (base_uri ? (const char*)base_uri : "NULL"), (reference_uri ? (const char*)reference_uri : "NULL"), (int)length); #endif *buffer = '\0'; memset(&result, 0, sizeof(result)); ref = raptor_new_uri_detail(reference_uri); if(!ref) goto resolve_tidy; /* is reference URI "" or "#frag"? */ if(!ref->scheme && !ref->authority && !ref->path && !ref->query) { unsigned char c; /* Copy base URI to result up to '\0' or '#' */ for(p = buffer, l = length; (c = *base_uri) && c != '#' && l; p++, base_uri++, l--) *p = c; if(!l) { result_len = 0; goto resolve_tidy; } *p = '\0'; if(ref->fragment) { unsigned char *src = ref->fragment; /* Append any fragment */ *p++ = '#'; while(*src && l) { *p++ = *src++; l--; } if(!l) { result_len = 0; goto resolve_tidy; } *p = '\0'; } result_len = p - buffer; goto resolve_tidy; } /* reference has a scheme - is an absolute URI */ if(ref->scheme) { /* Copy over schema and authority */ result.scheme = ref->scheme; result.scheme_len = ref->scheme_len; result.authority = ref->authority; result.authority_len = ref->authority_len; /* Allocate path so it can be normalized below */ result.path_len = ref->path_len; path_buffer = RAPTOR_MALLOC(unsigned char*, result.path_len + 1); if(!path_buffer) { result_len = 0; goto resolve_tidy; } if(ref->path_len) memcpy(path_buffer, ref->path, ref->path_len); path_buffer[result.path_len] = '\0'; result.path = path_buffer; goto normalize; } /* now the reference URI must be schemeless, i.e. relative */ base = raptor_new_uri_detail(base_uri); if(!base) goto resolve_tidy; /* result URI must be of the base URI scheme */ result.scheme = base->scheme; result.scheme_len = base->scheme_len; /* an authority is given ( [user:pass@]hostname[:port] for http) * so the reference URI is like //authority */ if(ref->authority) { result.authority = ref->authority; result.authority_len = ref->authority_len; result.path = ref->path; result.path_len = ref->path_len; goto resolve_end; } /* no - so now we have path (maybe with query, fragment) relative to base */ result.authority = base->authority; result.authority_len = base->authority_len; if(ref->is_hierarchical || !base->is_hierarchical) { /* if the reference path is absolute OR the base URI * is a non-hierarchical URI then just copy the reference path * to the result and normalize. */ path_buffer = RAPTOR_MALLOC(unsigned char*, ref->path_len + 1); if(!path_buffer) { result_len = 0; goto resolve_tidy; } result.path = path_buffer; result.path_len = ref->path_len; if(ref->path) memcpy(path_buffer, ref->path, result.path_len); path_buffer[result.path_len] = '\0'; goto normalize; } /* need to resolve relative path */ /* Build the result path in path_buffer */ result.path_len = 0; if(base->path) result.path_len += base->path_len; else { /* Add a missing path - makes the base URI 1 character longer */ base->path = (unsigned char*)"/"; /* static, but copied and not free()d */ base->path_len = 1; base->uri_len++; result.path_len++; } if(ref->path) result.path_len += ref->path_len; /* the resulting path can be no longer than result.path_len */ path_buffer = RAPTOR_MALLOC(unsigned char*, result.path_len + 1); if(!path_buffer) { result_len = 0; goto resolve_tidy; } result.path = path_buffer; *path_buffer = '\0'; if(!ref->path) { /* If there is no reference path, copy the full base over */ result.path_len = base->path_len; memcpy(path_buffer, base->path, result.path_len); } else { /** Otherwise copy base path up to previous / and append ref path */ for(p = base->path + base->path_len - 1; p > base->path && *p != '/'; p--) ; if(p >= base->path) { result.path_len = p-base->path + 1; /* Found a /, copy everything before that to path_buffer */ memcpy(path_buffer, base->path, result.path_len); path_buffer[result.path_len] = '\0'; } memcpy(path_buffer + result.path_len, ref->path, ref->path_len + 1); result.path_len += ref->path_len; } path_buffer[result.path_len] = '\0'; normalize: result.path_len = raptor_uri_normalize_path(path_buffer, result.path_len); resolve_end: if(ref->query) { result.query = ref->query; result.query_len = ref->query_len; } if(ref->fragment) { result.fragment = ref->fragment; result.fragment_len = ref->fragment_len; } l = 0; if(result.scheme) l = result.scheme_len + 1; if(result.authority) l += 2 + result.authority_len; if(result.path) l += result.path_len; if(result.query) l += 1 + result.query_len; if(result.fragment) l += 1 + result.fragment_len; if(l > length) { /* Output buffer is too small */ result_len = 0; goto resolve_tidy; } p = buffer; if(result.scheme) { memcpy(p, result.scheme, result.scheme_len); p += result.scheme_len; *p++ = ':'; } if(result.authority) { *p++ = '/'; *p++ = '/'; memcpy(p, result.authority, result.authority_len); p+= result.authority_len; } if(result.path) { memcpy(p, result.path, result.path_len); p+= result.path_len; } if(result.query) { *p++ = '?'; memcpy(p, result.query, result.query_len); p+= result.query_len; } if(result.fragment) { *p++ = '#'; memcpy(p, result.fragment, result.fragment_len); p+= result.fragment_len; } *p = '\0'; result_len = p - buffer; resolve_tidy: if(path_buffer) RAPTOR_FREE(char*, path_buffer); if(base) raptor_free_uri_detail(base); if(ref) raptor_free_uri_detail(ref); #ifdef RAPTOR_DEBUG RAPTOR_ASSERT(result_len && strlen((const char*)buffer) != result_len, "URI string is not declared length"); #endif return result_len; } #endif #ifdef STANDALONE #include /* one more prototype */ int main(int argc, char *argv[]); static const char *program; static int check_resolve(const char *base_uri, const char *reference_uri, const char *result_uri) { unsigned char buffer[1024]; raptor_uri_resolve_uri_reference((const unsigned char*)base_uri, (const unsigned char*)reference_uri, buffer, sizeof(buffer)); if(strcmp((const char*)buffer, result_uri)) { fprintf(stderr, "%s: raptor_uri_resolve_uri_reference(%s, %s) FAILED giving '%s' != '%s'\n", program, base_uri, reference_uri, buffer, result_uri); return 1; } #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 2 fprintf(stderr, "%s: raptor_uri_resolve_uri_reference(%s, %s) OK giving '%s'\n", program, base_uri, reference_uri, buffer); #endif return 0; } static int check_parses(const char *uri_string) { raptor_uri_detail* ud; ud = raptor_new_uri_detail((unsigned const char*)uri_string); if(!ud) { fprintf(stderr, "%s: raptor_new_uri_detail(%s) FAILED to parse\n", program, uri_string); return 1; } #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 2 fprintf(stderr, "%s: raptor_new_uri_detail(%s) OK\n", program, uri_string); #endif raptor_free_uri_detail(ud); return 0; } int main(int argc, char *argv[]) { const char *base_uri="http://example.org/bpath/cpath/d;p?querystr#frag"; int failures = 0; program = raptor_basename(argv[0]); #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 fprintf(stderr, "%s: Using base URI '%s'\n", program, base_uri); #endif /* Tests from RFC2396 Appendix C * and RFC3986 Section 5 * * Modifications: * - add 'path' when items are path components to make easier to read * - use example.org instead of 'a' for the authority * - results are against the base_uri above */ /* Appendix C.1 / 5.4.1 Normal Examples */ failures += check_resolve(base_uri, "g:h", "g:h"); failures += check_resolve(base_uri, "gpath", "http://example.org/bpath/cpath/gpath"); failures += check_resolve(base_uri, "./gpath", "http://example.org/bpath/cpath/gpath"); failures += check_resolve(base_uri, "gpath/", "http://example.org/bpath/cpath/gpath/"); failures += check_resolve(base_uri, "/gpath", "http://example.org/gpath"); failures += check_resolve(base_uri, "//gpath", "http://gpath"); failures += check_resolve(base_uri, "?y", "http://example.org/bpath/cpath/d;p?y"); failures += check_resolve(base_uri, "gpath?y", "http://example.org/bpath/cpath/gpath?y"); failures += check_resolve(base_uri, "#s", "http://example.org/bpath/cpath/d;p?querystr#s"); failures += check_resolve(base_uri, "gpath#s", "http://example.org/bpath/cpath/gpath#s"); failures += check_resolve(base_uri, "gpath?y#s", "http://example.org/bpath/cpath/gpath?y#s"); failures += check_resolve(base_uri, ";x", "http://example.org/bpath/cpath/;x"); failures += check_resolve(base_uri, "gpath;x", "http://example.org/bpath/cpath/gpath;x"); failures += check_resolve(base_uri, "gpath;x?y#s", "http://example.org/bpath/cpath/gpath;x?y#s"); failures += check_resolve(base_uri, ".", "http://example.org/bpath/cpath/"); failures += check_resolve(base_uri, "./", "http://example.org/bpath/cpath/"); failures += check_resolve(base_uri, "..", "http://example.org/bpath/"); failures += check_resolve(base_uri, "../", "http://example.org/bpath/"); failures += check_resolve(base_uri, "../gpath", "http://example.org/bpath/gpath"); failures += check_resolve(base_uri, "../..", "http://example.org/"); failures += check_resolve(base_uri, "../../", "http://example.org/"); failures += check_resolve(base_uri, "../../gpath", "http://example.org/gpath"); /* Appendix C.2 / 5.4.2 Abnormal Examples */ failures += check_resolve(base_uri, "", "http://example.org/bpath/cpath/d;p?querystr"); /* This is a Normal Example in RFC 3986 */ failures += check_resolve(base_uri, "../../../gpath", "http://example.org/gpath"); /* RFC 3986 changed the answer here */ failures += check_resolve(base_uri, "../../../../gpath", "http://example.org/gpath"); /* RFC 3986 changed the answer here */ failures += check_resolve(base_uri, "/./gpath", "http://example.org/gpath"); /* RFC 3986 changed the answer here */ failures += check_resolve(base_uri, "/../gpath", "http://example.org/gpath"); /* RFC 3986 changed the answer here */ failures += check_resolve(base_uri, "gpath.", "http://example.org/bpath/cpath/gpath."); failures += check_resolve(base_uri, ".gpath", "http://example.org/bpath/cpath/.gpath"); failures += check_resolve(base_uri, "gpath..", "http://example.org/bpath/cpath/gpath.."); failures += check_resolve(base_uri, "..gpath", "http://example.org/bpath/cpath/..gpath"); failures += check_resolve(base_uri, "./../gpath", "http://example.org/bpath/gpath"); failures += check_resolve(base_uri, "./gpath/.", "http://example.org/bpath/cpath/gpath/"); failures += check_resolve(base_uri, "gpath/./hpath", "http://example.org/bpath/cpath/gpath/hpath"); failures += check_resolve(base_uri, "gpath/../hpath", "http://example.org/bpath/cpath/hpath"); failures += check_resolve(base_uri, "gpath;x = 1/./y", "http://example.org/bpath/cpath/gpath;x = 1/y"); failures += check_resolve(base_uri, "gpath;x = 1/../y", "http://example.org/bpath/cpath/y"); failures += check_resolve(base_uri, "gpath?y/./x", "http://example.org/bpath/cpath/gpath?y/./x"); failures += check_resolve(base_uri, "gpath?y/../x", "http://example.org/bpath/cpath/gpath?y/../x"); failures += check_resolve(base_uri, "gpath#s/./x", "http://example.org/bpath/cpath/gpath#s/./x"); failures += check_resolve(base_uri, "gpath#s/../x", "http://example.org/bpath/cpath/gpath#s/../x"); /* RFC 3986 makes this the strict answer but also allows * http://example.org/bpath/cpath/gauthority * for backward compatibility */ failures += check_resolve(base_uri, "http:gauthority", "http:gauthority"); /* Examples from 1.3 */ failures += check_parses("ftp://ftp.is.co.za/rfc/rfc1808.txt"); failures += check_parses("gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles"); failures += check_parses("http://www.math.uio.no/faq/compression-faq/part1.html"); failures += check_parses("mailto:mduerst@ifi.unizh.ch"); failures += check_parses("news:comp.infosystems.www.servers.unix"); failures += check_parses("telnet://melvyl.ucop.edu/"); failures += check_parses(""); /* This is a not-crashing test */ raptor_new_uri_detail(NULL); /* Extra checks not in RFC2396 */ /* RDF xml:base check that fragments and query strings are removed */ failures += check_resolve(base_uri, "gpath/../../../hpath", "http://example.org/hpath"); /* RFC3986 changed the answer to this test * Was "RDF xml:base check that extra ../ are not lost" * with answer "http://example.org/../../../absfile" */ failures += check_resolve("http://example.org/dir/file", "../../../absfile", "http://example.org/absfile"); /* RDF xml:base check that an absolute URI replaces */ failures += check_resolve("http://example.org/dir/file", "http://another.example.org/dir2/file2", "http://another.example.org/dir2/file2"); /* base URI and relative URI with no absolute path works */ failures += check_resolve("foo:", "not_scheme:blah", "foo:not_scheme:blah"); /* Issue#000177 http://bugs.librdf.org/mantis/view.php?id=177 */ failures += check_resolve("foo:1234", "9999", "foo:9999"); /* RDFa 1.1 test 0114 */ failures += check_resolve("http://example.org/file", "?foo=bar../baz", "http://example.org/file?foo=bar../baz"); /* BUG 556 - http://bugs.librdf.org/mantis/view.php?id=556 */ failures += check_resolve("http://example.com/folder1/folder2/", "http://example.com/folder1/folder2/../folder1/../entity1", "http://example.com/folder1/entity1"); return failures; } #endif