summaryrefslogtreecommitdiffstats
path: root/src/raptor_ntriples.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/raptor_ntriples.c723
1 files changed, 723 insertions, 0 deletions
diff --git a/src/raptor_ntriples.c b/src/raptor_ntriples.c
new file mode 100644
index 0000000..3276e79
--- /dev/null
+++ b/src/raptor_ntriples.c
@@ -0,0 +1,723 @@
+/* -*- Mode: c; c-basic-offset: 2 -*-
+ *
+ * raptor_ntriples.c - Raptor N-Triples parsing utilities
+ *
+ * Copyright (C) 2013, David Beckett http://www.dajobe.org/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ *
+ * See LICENSE.html or LICENSE.txt at the top of this package for the
+ * complete terms and further detail along with the license texts for
+ * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
+ *
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <raptor_config.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+
+/* Raptor includes */
+#include "raptor2.h"
+#include "raptor_internal.h"
+
+
+/* These are for 7-bit ASCII and not locale-specific */
+#define IS_ASCII_ALPHA(c) (((c) > 0x40 && (c) < 0x5B) || ((c) > 0x60 && (c) < 0x7B))
+#define IS_ASCII_UPPER(c) ((c) > 0x40 && (c) < 0x5B)
+#define IS_ASCII_DIGIT(c) ((c) > 0x2F && (c) < 0x3A)
+#define IS_ASCII_PRINT(c) ((c) > 0x1F && (c) < 0x7F)
+#define TO_ASCII_LOWER(c) ((c)+0x20)
+
+typedef enum {
+ RAPTOR_TERM_CLASS_URI, /* ends on > */
+ RAPTOR_TERM_CLASS_BNODEID, /* ends on first non [A-Za-z][A-Za-z0-9]* */
+ RAPTOR_TERM_CLASS_STRING, /* ends on non-escaped " */
+ RAPTOR_TERM_CLASS_LANGUAGE /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */
+} raptor_ntriples_term_class;
+
+
+static int
+raptor_ntriples_term_valid(unsigned char c, int position,
+ raptor_ntriples_term_class term_class)
+{
+ int result = 0;
+
+ switch(term_class) {
+ case RAPTOR_TERM_CLASS_URI:
+ /* ends on > */
+ result = (c != '>');
+ break;
+
+ case RAPTOR_TERM_CLASS_BNODEID:
+ /* ends on first non [A-Za-z0-9_:][-.A-Za-z0-9]* */
+ result = IS_ASCII_ALPHA(c) || IS_ASCII_DIGIT(c) || c == '_' || c == ':';
+ if(position)
+ /* FIXME
+ * This isn't correct; '.' is allowed in positions 1..N-1 but
+ * this calling convention of character-by-character cannot
+ * check this.
+ */
+ result = (result || c == '-' || c == '.');
+ break;
+
+ case RAPTOR_TERM_CLASS_STRING:
+ /* ends on " */
+ result = (c != '"');
+ break;
+
+ case RAPTOR_TERM_CLASS_LANGUAGE:
+ /* ends on first non [a-zA-Z]+ ('-' [a-zA-Z0-9]+ )?
+ * Accept _ as synonym / typo for -.
+ */
+ result = IS_ASCII_ALPHA(c);
+ if(position)
+ result = (result || IS_ASCII_DIGIT(c) || c == '-' || c == '_');
+ break;
+
+ default:
+ RAPTOR_DEBUG2("Unknown N-Triples term class %u", term_class);
+ }
+
+ return result;
+}
+
+
+/*
+ * raptor_ntriples_parse_term_internal:
+ * @world: raptor world
+ * @locator: locator object (in/out) (or NULL)
+ * @start: pointer to starting character of string (in)
+ * @dest: destination of string (in)
+ * @lenp: pointer to length of string (in/out)
+ * @dest_lenp: pointer to length of destination string (out)
+ * @end_char: string ending character
+ * @class: string class
+ *
+ * INTERNAL - Parse an N-Triples term with escapes.
+ *
+ * Relies that @dest is long enough; it need only be as large as the
+ * input string @start since when UTF-8 encoding, the escapes are
+ * removed and the result is always less than or equal to length of
+ * input.
+ *
+ * N-Triples strings / URIs are written in ASCII at present;
+ * characters outside the printable ASCII range are discarded with a
+ * warning. See the grammar for full details of the allowed ranges.
+ *
+ * UTF-8 and the \u and \U esapes are both allowed.
+ *
+ * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E
+ *
+ * Return value: Non 0 on failure
+ **/
+static int
+raptor_ntriples_parse_term_internal(raptor_world* world,
+ raptor_locator* locator,
+ const unsigned char **start,
+ unsigned char *dest,
+ size_t *lenp, size_t *dest_lenp,
+ char end_char,
+ raptor_ntriples_term_class term_class)
+{
+ const unsigned char *p = *start;
+ unsigned char c = '\0';
+ size_t ulen = 0;
+ unsigned long unichar = 0;
+ unsigned int position = 0;
+ int end_char_seen = 0;
+
+ /* find end of string, fixing backslashed characters on the way */
+ while(*lenp > 0) {
+ int unichar_width;
+
+ c = *p;
+
+ p++;
+ (*lenp)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(term_class == RAPTOR_TERM_CLASS_URI && c == ' ') {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator,
+ "URI error - illegal character %d (0x%02X) found.",
+ c, RAPTOR_GOOD_CAST(unsigned int, c));
+ return 1;
+ }
+
+ if(c > 0x7f) {
+ /* just copy the UTF-8 bytes through */
+ int unichar_len;
+ unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL);
+ if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator,
+ "UTF-8 encoding error at character %d (0x%02X) found.",
+ c, RAPTOR_GOOD_CAST(unsigned int, c));
+ /* UTF-8 encoding had an error or ended in the middle of a string */
+ return 1;
+ }
+ memmove(dest, p-1, unichar_len);
+ dest += unichar_len;
+
+ unichar_len--; /* p, *lenp were moved on by 1 earlier */
+
+ p += unichar_len;
+ (*lenp) -= unichar_len;
+ if(locator) {
+ locator->column += unichar_len;
+ locator->byte += unichar_len;
+ }
+ continue;
+ }
+
+ if(c != '\\') {
+ /* finish at non-backslashed end_char */
+ if(end_char && c == end_char) {
+ end_char_seen = 1;
+ break;
+ }
+
+ if(!raptor_ntriples_term_valid(c, position, term_class)) {
+ if(end_char) {
+ /* end char was expected, so finding an invalid thing is an error */
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c);
+ return 0;
+ } else {
+ /* it's the end - so rewind 1 to save next char */
+ p--;
+ (*lenp)++;
+ if(locator) {
+ locator->column--;
+ locator->byte--;
+ }
+ if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') {
+ /* If bnode id ended on '.' move back one */
+ dest--;
+
+ p--;
+ (*lenp)++;
+ if(locator) {
+ locator->column--;
+ locator->byte--;
+ }
+ }
+ break;
+ }
+ }
+
+ /* otherwise store and move on */
+ *dest++ = c;
+ position++;
+ continue;
+ }
+
+ if(!*lenp) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input.");
+ return 0;
+ }
+
+ c = *p;
+
+ p++;
+ (*lenp)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ switch(c) {
+ case '"':
+ case '\\':
+ *dest++ = c;
+ break;
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ if(term_class == RAPTOR_TERM_CLASS_URI) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal URI escape '\\%c'.", c);
+ return 1;
+ }
+
+ if(c == 'b')
+ *dest++ = '\b';
+ else if(c == 'f')
+ *dest++ = '\f';
+ else if(c == 'n')
+ *dest++ = '\n';
+ else if(c == 'r')
+ *dest++ = '\r';
+ else /* 't' */
+ *dest++ = '\t';
+ break;
+ case '<':
+ case '>':
+ case '{':
+ case '}':
+ case '|':
+ case '^':
+ case '`':
+ /* Turtle 2013 allows these in URIs (as well as \" and \\) */
+ *dest++ = c;
+ break;
+
+ case 'u':
+ case 'U':
+ ulen = (c == 'u') ? 4 : 8;
+
+ if(*lenp < ulen) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c);
+ return 0;
+ }
+
+ if(1) {
+ unsigned int ii;
+ int n = 0;
+
+ for(ii = 0; ii < ulen; ii++) {
+ char cc = p[ii];
+ if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'",
+ cc, c, p);
+ n = 1;
+ break;
+ }
+ }
+
+ if(n)
+ break;
+
+ n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
+ if(n != 1) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p);
+ break;
+ }
+ }
+
+ p += ulen;
+ (*lenp) -= ulen;
+ if(locator) {
+ locator->column += RAPTOR_GOOD_CAST(int, ulen);
+ locator->byte += RAPTOR_GOOD_CAST(int, ulen);
+ }
+
+ if(term_class == RAPTOR_TERM_CLASS_URI &&
+ (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal Unicode escape \\u%04lX in URI.", unichar);
+ break;
+ }
+
+ if(unichar > raptor_unicode_max_codepoint) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint);
+ break;
+ }
+
+ unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4);
+ if(unichar_width < 0) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar);
+ break;
+ }
+
+ /* The destination length is set here to 4 since we know that in
+ * all cases, the UTF-8 encoded output sequence is always shorter
+ * than the input sequence, and the buffer is edited in place.
+ * \uXXXX: 6 bytes input - UTF-8 max 3 bytes output
+ * \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output
+ */
+ dest += (int)unichar_width;
+ break;
+
+ default:
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start);
+ return 0;
+ }
+
+ position++;
+ } /* end while */
+
+
+ if(end_char && !end_char_seen) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' before end of input.", end_char);
+ return 1;
+ }
+
+ /* terminate dest, can be shorter than source */
+ *dest = '\0';
+
+ if(dest_lenp)
+ *dest_lenp = p - *start;
+
+ *start = p;
+
+ return 0;
+}
+
+
+static int
+raptor_parse_turtle_term_internal(raptor_world* world,
+ raptor_locator* locator,
+ const unsigned char **start,
+ unsigned char *dest,
+ size_t *len_p, size_t *dest_lenp,
+ raptor_uri** datatype_uri_p)
+{
+ const unsigned char *p = *start;
+ unsigned int position = 0;
+ /* 0 = xsd:integer; 1= xsd:decimal; 2= xsd:double */
+ short dtype = 0;
+ int after_e = 0;
+
+ while(*len_p > 0) {
+ unsigned char c = *p;
+
+ if(after_e) {
+ if(!((c >= '0' && c <'9') || c == '+' || c == '-'))
+ break;
+ after_e = 0;
+ } else if((position > 0 && (c == '+' || c == '-')) ||
+ !((c >= '0' && c <'9') || c == '.' || c == 'e' || c == 'E'))
+ break;
+
+ if(c == '.')
+ dtype = 1;
+ else if(c == 'e' || c == 'E') {
+ dtype = 2;
+ after_e = 1;
+ }
+
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ *dest++ = c;
+
+ position++;
+ }
+
+ *dest = '\0';
+
+ if(dest_lenp)
+ *dest_lenp = p - *start;
+
+ *start = p;
+
+ if(dtype == 0)
+ *datatype_uri_p = raptor_uri_copy(world->xsd_integer_uri);
+ else if (dtype == 1)
+ *datatype_uri_p = raptor_uri_copy(world->xsd_decimal_uri);
+ else
+ *datatype_uri_p = raptor_uri_copy(world->xsd_double_uri);
+
+ return 0;
+}
+
+
+/*
+ * raptor_ntriples_parse_term:
+ * @world: raptor world
+ * @locator: raptor locator (in/out) (or NULL)
+ * @string: string input (in)
+ * @len_p: pointer to length of @string (in/out)
+ * @term_p: pointer to store term (out)
+ * @allow_turtle: non-0 to allow Turtle forms such as integers, boolean
+ *
+ * INTERNAL - Parse an N-Triples string into a #raptor_term
+ *
+ * The @len_p destination and @locator fields are modified as parsing
+ * proceeds to be used in error messages. The final value is written
+ * into the #raptor_term pointed at by @term_p
+ *
+ * Return value: number of bytes processed or 0 on failure
+ */
+size_t
+raptor_ntriples_parse_term(raptor_world* world, raptor_locator* locator,
+ unsigned char *string, size_t *len_p,
+ raptor_term** term_p, int allow_turtle)
+{
+ unsigned char *p = string;
+ unsigned char *dest;
+ size_t term_length = 0;
+
+ switch(*p) {
+ case '<':
+ dest = p;
+
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(raptor_ntriples_parse_term_internal(world, locator,
+ (const unsigned char**)&p,
+ dest, len_p, &term_length,
+ '>', RAPTOR_TERM_CLASS_URI)) {
+ goto fail;
+ }
+
+ if(1) {
+ raptor_uri *uri;
+
+ /* Check for bad ordinal predicate */
+ if(!strncmp((const char*)dest,
+ "http://www.w3.org/1999/02/22-rdf-syntax-ns#_", 44)) {
+ int ordinal = raptor_check_ordinal(dest + 44);
+ if(ordinal <= 0)
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal ordinal value %d in property '%s'.", ordinal, dest);
+ }
+ if(raptor_uri_uri_string_is_absolute(dest) <= 0) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI '%s' is not absolute.", dest);
+ goto fail;
+ }
+
+ uri = raptor_new_uri(world, dest);
+ if(!uri) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Could not create URI for '%s'", (const char *)dest);
+ goto fail;
+ }
+
+ *term_p = raptor_new_term_from_uri(world, uri);
+ raptor_free_uri(uri);
+ }
+ break;
+
+ case '-':
+ case '+':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if(allow_turtle) {
+ raptor_uri* datatype_uri = NULL;
+
+ dest = p;
+
+ if(raptor_parse_turtle_term_internal(world, locator,
+ (const unsigned char**)&p,
+ dest, len_p, &term_length,
+ &datatype_uri)) {
+ goto fail;
+ }
+
+ *term_p = raptor_new_term_from_literal(world,
+ dest,
+ datatype_uri,
+ NULL /* language */);
+ } else
+ goto fail;
+ break;
+
+ case '"':
+ dest = p;
+
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(raptor_ntriples_parse_term_internal(world, locator,
+ (const unsigned char**)&p,
+ dest, len_p, &term_length,
+ '"', RAPTOR_TERM_CLASS_STRING)) {
+ goto fail;
+ }
+
+ if(1) {
+ unsigned char *object_literal_language = NULL;
+ unsigned char *object_literal_datatype = NULL;
+ raptor_uri* datatype_uri = NULL;
+
+ if(*len_p && *p == '@') {
+ unsigned char *q;
+ size_t lang_len;
+
+ object_literal_language = p;
+
+ /* Skip - */
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(!*len_p) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing language after \"string\"-");
+ goto fail;
+ }
+
+ if(raptor_ntriples_parse_term_internal(world, locator,
+ (const unsigned char**)&p,
+ object_literal_language, len_p, &lang_len,
+ '\0', RAPTOR_TERM_CLASS_LANGUAGE)) {
+ goto fail;
+ }
+
+ if(!lang_len) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Invalid language tag at @%s", p);
+ goto fail;
+ }
+
+ /* Normalize language to lowercase
+ * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier
+ * Also convert _ to - as synonym / typo.
+ */
+ for(q = object_literal_language; *q; q++) {
+ if(IS_ASCII_UPPER(*q))
+ *q = RAPTOR_GOOD_CAST(unsigned char, TO_ASCII_LOWER(*q));
+ if(*q == '_')
+ *q = '-';
+ }
+
+ }
+
+ if(*len_p > 1 && *p == '^' && p[1] == '^') {
+
+ object_literal_datatype = p;
+
+ /* Skip ^^ */
+ p += 2;
+ *len_p -= 2;
+ if(locator) {
+ locator->column += 2;
+ locator->byte += 2;
+ }
+
+ if(!*len_p || (*len_p && *p != '<')) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing datatype URI-ref in\"string\"^^<URI-ref> after ^^");
+ goto fail;
+ }
+
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(raptor_ntriples_parse_term_internal(world, locator,
+ (const unsigned char**)&p,
+ object_literal_datatype, len_p, NULL,
+ '>', RAPTOR_TERM_CLASS_URI)) {
+ goto fail;
+ }
+
+ if(raptor_uri_uri_string_is_absolute(object_literal_datatype) <= 0) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Datatype URI '%s' is not absolute.", object_literal_datatype);
+ goto fail;
+ }
+
+ }
+
+ if(object_literal_datatype && object_literal_language) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Typed literal used with a language - ignoring the language");
+ object_literal_language = NULL;
+ }
+
+ if(object_literal_datatype) {
+ datatype_uri = raptor_new_uri(world,
+ object_literal_datatype);
+ if(!datatype_uri) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Could not create literal datatype uri '%s'", object_literal_datatype);
+ goto fail;
+ }
+ object_literal_language = NULL;
+ }
+
+ *term_p = raptor_new_term_from_literal(world,
+ dest,
+ datatype_uri,
+ object_literal_language);
+ if(datatype_uri)
+ raptor_free_uri(datatype_uri);
+ }
+
+ break;
+
+
+ case '_':
+ /* store where _ was */
+ dest = p;
+
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(!*len_p || (*len_p > 0 && *p != ':')) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal bNodeID - _ not followed by :");
+ goto fail;
+ }
+
+ /* Found ':' - move on */
+
+ p++;
+ (*len_p)--;
+ if(locator) {
+ locator->column++;
+ locator->byte++;
+ }
+
+ if(raptor_ntriples_parse_term_internal(world, locator,
+ (const unsigned char**)&p,
+ dest, len_p, &term_length,
+ '\0',
+ RAPTOR_TERM_CLASS_BNODEID)) {
+ goto fail;
+ }
+
+ if(!term_length) {
+ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Bad or missing bNodeID after _:");
+ goto fail;
+ }
+
+ *term_p = raptor_new_term_from_blank(world, dest);
+
+ break;
+
+ default:
+ RAPTOR_DEBUG2("Unknown term type '%c'", *p);
+ goto fail;
+ }
+
+ fail:
+
+ return p - string;
+}