diff options
Diffstat (limited to 'src/raptor_ntriples.c')
-rw-r--r-- | src/raptor_ntriples.c | 723 |
1 files changed, 723 insertions, 0 deletions
diff --git a/src/raptor_ntriples.c b/src/raptor_ntriples.c new file mode 100644 index 0000000..3276e79 --- /dev/null +++ b/src/raptor_ntriples.c @@ -0,0 +1,723 @@ +/* -*- Mode: c; c-basic-offset: 2 -*- + * + * raptor_ntriples.c - Raptor N-Triples parsing utilities + * + * Copyright (C) 2013, David Beckett http://www.dajobe.org/ + * + * This package is Free Software and part of Redland http://librdf.org/ + * + * It is licensed under the following three licenses as alternatives: + * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version + * 2. GNU General Public License (GPL) V2 or any newer version + * 3. Apache License, V2.0 or any newer version + * + * You may not use this file except in compliance with at least one of + * the above three licenses. + * + * See LICENSE.html or LICENSE.txt at the top of this package for the + * complete terms and further detail along with the license texts for + * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. + * + */ + + +#ifdef HAVE_CONFIG_H +#include <raptor_config.h> +#endif + +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <stdarg.h> +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + + +/* Raptor includes */ +#include "raptor2.h" +#include "raptor_internal.h" + + +/* These are for 7-bit ASCII and not locale-specific */ +#define IS_ASCII_ALPHA(c) (((c) > 0x40 && (c) < 0x5B) || ((c) > 0x60 && (c) < 0x7B)) +#define IS_ASCII_UPPER(c) ((c) > 0x40 && (c) < 0x5B) +#define IS_ASCII_DIGIT(c) ((c) > 0x2F && (c) < 0x3A) +#define IS_ASCII_PRINT(c) ((c) > 0x1F && (c) < 0x7F) +#define TO_ASCII_LOWER(c) ((c)+0x20) + +typedef enum { + RAPTOR_TERM_CLASS_URI, /* ends on > */ + RAPTOR_TERM_CLASS_BNODEID, /* ends on first non [A-Za-z][A-Za-z0-9]* */ + RAPTOR_TERM_CLASS_STRING, /* ends on non-escaped " */ + RAPTOR_TERM_CLASS_LANGUAGE /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */ +} raptor_ntriples_term_class; + + +static int +raptor_ntriples_term_valid(unsigned char c, int position, + raptor_ntriples_term_class term_class) +{ + int result = 0; + + switch(term_class) { + case RAPTOR_TERM_CLASS_URI: + /* ends on > */ + result = (c != '>'); + break; + + case RAPTOR_TERM_CLASS_BNODEID: + /* ends on first non [A-Za-z0-9_:][-.A-Za-z0-9]* */ + result = IS_ASCII_ALPHA(c) || IS_ASCII_DIGIT(c) || c == '_' || c == ':'; + if(position) + /* FIXME + * This isn't correct; '.' is allowed in positions 1..N-1 but + * this calling convention of character-by-character cannot + * check this. + */ + result = (result || c == '-' || c == '.'); + break; + + case RAPTOR_TERM_CLASS_STRING: + /* ends on " */ + result = (c != '"'); + break; + + case RAPTOR_TERM_CLASS_LANGUAGE: + /* ends on first non [a-zA-Z]+ ('-' [a-zA-Z0-9]+ )? + * Accept _ as synonym / typo for -. + */ + result = IS_ASCII_ALPHA(c); + if(position) + result = (result || IS_ASCII_DIGIT(c) || c == '-' || c == '_'); + break; + + default: + RAPTOR_DEBUG2("Unknown N-Triples term class %u", term_class); + } + + return result; +} + + +/* + * raptor_ntriples_parse_term_internal: + * @world: raptor world + * @locator: locator object (in/out) (or NULL) + * @start: pointer to starting character of string (in) + * @dest: destination of string (in) + * @lenp: pointer to length of string (in/out) + * @dest_lenp: pointer to length of destination string (out) + * @end_char: string ending character + * @class: string class + * + * INTERNAL - Parse an N-Triples term with escapes. + * + * Relies that @dest is long enough; it need only be as large as the + * input string @start since when UTF-8 encoding, the escapes are + * removed and the result is always less than or equal to length of + * input. + * + * N-Triples strings / URIs are written in ASCII at present; + * characters outside the printable ASCII range are discarded with a + * warning. See the grammar for full details of the allowed ranges. + * + * UTF-8 and the \u and \U esapes are both allowed. + * + * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E + * + * Return value: Non 0 on failure + **/ +static int +raptor_ntriples_parse_term_internal(raptor_world* world, + raptor_locator* locator, + const unsigned char **start, + unsigned char *dest, + size_t *lenp, size_t *dest_lenp, + char end_char, + raptor_ntriples_term_class term_class) +{ + const unsigned char *p = *start; + unsigned char c = '\0'; + size_t ulen = 0; + unsigned long unichar = 0; + unsigned int position = 0; + int end_char_seen = 0; + + /* find end of string, fixing backslashed characters on the way */ + while(*lenp > 0) { + int unichar_width; + + c = *p; + + p++; + (*lenp)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(term_class == RAPTOR_TERM_CLASS_URI && c == ' ') { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, + "URI error - illegal character %d (0x%02X) found.", + c, RAPTOR_GOOD_CAST(unsigned int, c)); + return 1; + } + + if(c > 0x7f) { + /* just copy the UTF-8 bytes through */ + int unichar_len; + unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL); + if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, + "UTF-8 encoding error at character %d (0x%02X) found.", + c, RAPTOR_GOOD_CAST(unsigned int, c)); + /* UTF-8 encoding had an error or ended in the middle of a string */ + return 1; + } + memmove(dest, p-1, unichar_len); + dest += unichar_len; + + unichar_len--; /* p, *lenp were moved on by 1 earlier */ + + p += unichar_len; + (*lenp) -= unichar_len; + if(locator) { + locator->column += unichar_len; + locator->byte += unichar_len; + } + continue; + } + + if(c != '\\') { + /* finish at non-backslashed end_char */ + if(end_char && c == end_char) { + end_char_seen = 1; + break; + } + + if(!raptor_ntriples_term_valid(c, position, term_class)) { + if(end_char) { + /* end char was expected, so finding an invalid thing is an error */ + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c); + return 0; + } else { + /* it's the end - so rewind 1 to save next char */ + p--; + (*lenp)++; + if(locator) { + locator->column--; + locator->byte--; + } + if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') { + /* If bnode id ended on '.' move back one */ + dest--; + + p--; + (*lenp)++; + if(locator) { + locator->column--; + locator->byte--; + } + } + break; + } + } + + /* otherwise store and move on */ + *dest++ = c; + position++; + continue; + } + + if(!*lenp) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input."); + return 0; + } + + c = *p; + + p++; + (*lenp)--; + if(locator) { + locator->column++; + locator->byte++; + } + + switch(c) { + case '"': + case '\\': + *dest++ = c; + break; + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + if(term_class == RAPTOR_TERM_CLASS_URI) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal URI escape '\\%c'.", c); + return 1; + } + + if(c == 'b') + *dest++ = '\b'; + else if(c == 'f') + *dest++ = '\f'; + else if(c == 'n') + *dest++ = '\n'; + else if(c == 'r') + *dest++ = '\r'; + else /* 't' */ + *dest++ = '\t'; + break; + case '<': + case '>': + case '{': + case '}': + case '|': + case '^': + case '`': + /* Turtle 2013 allows these in URIs (as well as \" and \\) */ + *dest++ = c; + break; + + case 'u': + case 'U': + ulen = (c == 'u') ? 4 : 8; + + if(*lenp < ulen) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c); + return 0; + } + + if(1) { + unsigned int ii; + int n = 0; + + for(ii = 0; ii < ulen; ii++) { + char cc = p[ii]; + if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'", + cc, c, p); + n = 1; + break; + } + } + + if(n) + break; + + n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); + if(n != 1) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p); + break; + } + } + + p += ulen; + (*lenp) -= ulen; + if(locator) { + locator->column += RAPTOR_GOOD_CAST(int, ulen); + locator->byte += RAPTOR_GOOD_CAST(int, ulen); + } + + if(term_class == RAPTOR_TERM_CLASS_URI && + (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal Unicode escape \\u%04lX in URI.", unichar); + break; + } + + if(unichar > raptor_unicode_max_codepoint) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint); + break; + } + + unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4); + if(unichar_width < 0) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar); + break; + } + + /* The destination length is set here to 4 since we know that in + * all cases, the UTF-8 encoded output sequence is always shorter + * than the input sequence, and the buffer is edited in place. + * \uXXXX: 6 bytes input - UTF-8 max 3 bytes output + * \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output + */ + dest += (int)unichar_width; + break; + + default: + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start); + return 0; + } + + position++; + } /* end while */ + + + if(end_char && !end_char_seen) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' before end of input.", end_char); + return 1; + } + + /* terminate dest, can be shorter than source */ + *dest = '\0'; + + if(dest_lenp) + *dest_lenp = p - *start; + + *start = p; + + return 0; +} + + +static int +raptor_parse_turtle_term_internal(raptor_world* world, + raptor_locator* locator, + const unsigned char **start, + unsigned char *dest, + size_t *len_p, size_t *dest_lenp, + raptor_uri** datatype_uri_p) +{ + const unsigned char *p = *start; + unsigned int position = 0; + /* 0 = xsd:integer; 1= xsd:decimal; 2= xsd:double */ + short dtype = 0; + int after_e = 0; + + while(*len_p > 0) { + unsigned char c = *p; + + if(after_e) { + if(!((c >= '0' && c <'9') || c == '+' || c == '-')) + break; + after_e = 0; + } else if((position > 0 && (c == '+' || c == '-')) || + !((c >= '0' && c <'9') || c == '.' || c == 'e' || c == 'E')) + break; + + if(c == '.') + dtype = 1; + else if(c == 'e' || c == 'E') { + dtype = 2; + after_e = 1; + } + + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + *dest++ = c; + + position++; + } + + *dest = '\0'; + + if(dest_lenp) + *dest_lenp = p - *start; + + *start = p; + + if(dtype == 0) + *datatype_uri_p = raptor_uri_copy(world->xsd_integer_uri); + else if (dtype == 1) + *datatype_uri_p = raptor_uri_copy(world->xsd_decimal_uri); + else + *datatype_uri_p = raptor_uri_copy(world->xsd_double_uri); + + return 0; +} + + +/* + * raptor_ntriples_parse_term: + * @world: raptor world + * @locator: raptor locator (in/out) (or NULL) + * @string: string input (in) + * @len_p: pointer to length of @string (in/out) + * @term_p: pointer to store term (out) + * @allow_turtle: non-0 to allow Turtle forms such as integers, boolean + * + * INTERNAL - Parse an N-Triples string into a #raptor_term + * + * The @len_p destination and @locator fields are modified as parsing + * proceeds to be used in error messages. The final value is written + * into the #raptor_term pointed at by @term_p + * + * Return value: number of bytes processed or 0 on failure + */ +size_t +raptor_ntriples_parse_term(raptor_world* world, raptor_locator* locator, + unsigned char *string, size_t *len_p, + raptor_term** term_p, int allow_turtle) +{ + unsigned char *p = string; + unsigned char *dest; + size_t term_length = 0; + + switch(*p) { + case '<': + dest = p; + + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(raptor_ntriples_parse_term_internal(world, locator, + (const unsigned char**)&p, + dest, len_p, &term_length, + '>', RAPTOR_TERM_CLASS_URI)) { + goto fail; + } + + if(1) { + raptor_uri *uri; + + /* Check for bad ordinal predicate */ + if(!strncmp((const char*)dest, + "http://www.w3.org/1999/02/22-rdf-syntax-ns#_", 44)) { + int ordinal = raptor_check_ordinal(dest + 44); + if(ordinal <= 0) + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal ordinal value %d in property '%s'.", ordinal, dest); + } + if(raptor_uri_uri_string_is_absolute(dest) <= 0) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI '%s' is not absolute.", dest); + goto fail; + } + + uri = raptor_new_uri(world, dest); + if(!uri) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Could not create URI for '%s'", (const char *)dest); + goto fail; + } + + *term_p = raptor_new_term_from_uri(world, uri); + raptor_free_uri(uri); + } + break; + + case '-': + case '+': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if(allow_turtle) { + raptor_uri* datatype_uri = NULL; + + dest = p; + + if(raptor_parse_turtle_term_internal(world, locator, + (const unsigned char**)&p, + dest, len_p, &term_length, + &datatype_uri)) { + goto fail; + } + + *term_p = raptor_new_term_from_literal(world, + dest, + datatype_uri, + NULL /* language */); + } else + goto fail; + break; + + case '"': + dest = p; + + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(raptor_ntriples_parse_term_internal(world, locator, + (const unsigned char**)&p, + dest, len_p, &term_length, + '"', RAPTOR_TERM_CLASS_STRING)) { + goto fail; + } + + if(1) { + unsigned char *object_literal_language = NULL; + unsigned char *object_literal_datatype = NULL; + raptor_uri* datatype_uri = NULL; + + if(*len_p && *p == '@') { + unsigned char *q; + size_t lang_len; + + object_literal_language = p; + + /* Skip - */ + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(!*len_p) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing language after \"string\"-"); + goto fail; + } + + if(raptor_ntriples_parse_term_internal(world, locator, + (const unsigned char**)&p, + object_literal_language, len_p, &lang_len, + '\0', RAPTOR_TERM_CLASS_LANGUAGE)) { + goto fail; + } + + if(!lang_len) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Invalid language tag at @%s", p); + goto fail; + } + + /* Normalize language to lowercase + * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier + * Also convert _ to - as synonym / typo. + */ + for(q = object_literal_language; *q; q++) { + if(IS_ASCII_UPPER(*q)) + *q = RAPTOR_GOOD_CAST(unsigned char, TO_ASCII_LOWER(*q)); + if(*q == '_') + *q = '-'; + } + + } + + if(*len_p > 1 && *p == '^' && p[1] == '^') { + + object_literal_datatype = p; + + /* Skip ^^ */ + p += 2; + *len_p -= 2; + if(locator) { + locator->column += 2; + locator->byte += 2; + } + + if(!*len_p || (*len_p && *p != '<')) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing datatype URI-ref in\"string\"^^<URI-ref> after ^^"); + goto fail; + } + + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(raptor_ntriples_parse_term_internal(world, locator, + (const unsigned char**)&p, + object_literal_datatype, len_p, NULL, + '>', RAPTOR_TERM_CLASS_URI)) { + goto fail; + } + + if(raptor_uri_uri_string_is_absolute(object_literal_datatype) <= 0) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Datatype URI '%s' is not absolute.", object_literal_datatype); + goto fail; + } + + } + + if(object_literal_datatype && object_literal_language) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Typed literal used with a language - ignoring the language"); + object_literal_language = NULL; + } + + if(object_literal_datatype) { + datatype_uri = raptor_new_uri(world, + object_literal_datatype); + if(!datatype_uri) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Could not create literal datatype uri '%s'", object_literal_datatype); + goto fail; + } + object_literal_language = NULL; + } + + *term_p = raptor_new_term_from_literal(world, + dest, + datatype_uri, + object_literal_language); + if(datatype_uri) + raptor_free_uri(datatype_uri); + } + + break; + + + case '_': + /* store where _ was */ + dest = p; + + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(!*len_p || (*len_p > 0 && *p != ':')) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal bNodeID - _ not followed by :"); + goto fail; + } + + /* Found ':' - move on */ + + p++; + (*len_p)--; + if(locator) { + locator->column++; + locator->byte++; + } + + if(raptor_ntriples_parse_term_internal(world, locator, + (const unsigned char**)&p, + dest, len_p, &term_length, + '\0', + RAPTOR_TERM_CLASS_BNODEID)) { + goto fail; + } + + if(!term_length) { + raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Bad or missing bNodeID after _:"); + goto fail; + } + + *term_p = raptor_new_term_from_blank(world, dest); + + break; + + default: + RAPTOR_DEBUG2("Unknown term type '%c'", *p); + goto fail; + } + + fail: + + return p - string; +} |