/* -*- Mode: c; c-basic-offset: 2 -*- * * turtle_common.c - Raptor Turtle common code * * Copyright (C) 2003-2007, David Beckett http://www.dajobe.org/ * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_STDLIB_H #include #endif /* Raptor includes */ #include "raptor2.h" #include "raptor_internal.h" #include #define YY_NO_UNISTD_H 1 #define YYSTYPE TURTLE_PARSER_STYPE #include #include /** * raptor_stringbuffer_append_turtle_string: * @stringbuffer: String buffer to add to * @text: turtle string to decode * @len: length of string * @delim: terminating delimiter for string - only ', " or > are allowed * @error_handler: error handling function * @error_data: error handler data * * Append to a stringbuffer a Turtle-escaped string. * * The passed in string is handled according to the Turtle string * escape rules giving a UTF-8 encoded output of the Unicode codepoints. * * The Turtle escapes are \b \f \n \r \t \\ * \uXXXX \UXXXXXXXX where X is [A-F0-9] * * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@% * * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E * * Return value: non-0 on failure **/ int raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, const unsigned char *text, size_t len, int delim, raptor_simple_message_handler error_handler, void *error_data, int is_uri) { size_t i; const unsigned char *s; unsigned char *d; unsigned char *string = RAPTOR_MALLOC(unsigned char*, len + 1); const char* label = (is_uri ? "URI" : "string"); if(!string) return -1; for(s = text, d = string, i = 0; i < len; s++, i++) { unsigned char c=*s; if(c == ' ' && is_uri) { error_handler(error_data, "Turtle %s error - character '%c'", label, c); RAPTOR_FREE(char*, string); return 1; } if(c == '\\' ) { s++; i++; c = *s; if(c == 'n' || c == 'r' || c == 't' || c == 'b' || c == 'f') { if(is_uri) { error_handler(error_data, "Turtle %s error - illegal URI escape '\\%c'", label, c); RAPTOR_FREE(char*, string); return 1; } if(c == 'n') *d++ = '\n'; else if(c == 'r') *d++ = '\r'; else if(c == 't') *d++ = '\t'; else if(c == 'b') *d++ = '\b'; else /* 'f' */ *d++ = '\f'; } else if(c == '\\' || c == delim || c == '-' || c == '_' || c == '~' || c == '.' || c == '!' || c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||c == '=' || c == '/' || c == '?' || c == '#' || c == '@' ||c == '%') *d++ = c; else if(c == 'u' || c == 'U') { size_t ulen = (c == 'u') ? 4 : 8; unsigned long unichar = 0; int n; int unichar_width; size_t ii; s++; i++; if(i+ulen > len) { error_handler(error_data, "Turtle %s error - \\%c over end of line", label, c); RAPTOR_FREE(char*, string); return 1; } for(ii = 0; ii < ulen; ii++) { char cc = s[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { error_handler(error_data, "Turtle %s error - illegal hex digit %c in Unicode escape '%c%s...'", label, cc, c, s); RAPTOR_FREE(char*, string); return 1; } } n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { error_handler(error_data, "Turtle %s error - illegal Unicode escape '%c%s...'", label, c, s); RAPTOR_FREE(char*, string); return 1; } s+= ulen-1; i+= ulen-1; if(is_uri && (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) { error_handler(error_data, "Turtle %s error - illegal Unicode escape \\u%04lX in URI.", label, unichar); break; } if(unichar > raptor_unicode_max_codepoint) { error_handler(error_data, "Turtle %s error - illegal Unicode character with code point #x%lX (max #x%lX).", label, unichar, raptor_unicode_max_codepoint); RAPTOR_FREE(char*, string); return 1; } unichar_width = raptor_unicode_utf8_string_put_char(unichar, d, len-(d-string)); if(unichar_width < 0) { error_handler(error_data, "Turtle %s error - illegal Unicode character with code point #x%lX.", label, unichar); RAPTOR_FREE(char*, string); return 1; } d += (size_t)unichar_width; } else { /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */ error_handler(error_data, "Turtle %s error - illegal escape \\%c (#x%02X) in \"%s\"", label, c, c, text); } } else *d++=c; } *d='\0'; /* calculate output string size */ len = d-string; #ifdef __clang_analyzer__ /* clang --analyze does not know about ownership of next call */ free(string); string = NULL; #endif /* string gets owned by the stringbuffer after this */ return raptor_stringbuffer_append_counted_string(stringbuffer, string, len, 0); } /** * raptor_turtle_expand_qname_escapes: * @name: turtle qname string to decode * @len: length of name * @error_handler: error handling function * @error_data: error handler data * * Expands Turtle escapes for the given turtle qname string * * The passed in string is handled according to the Turtle string * escape rules giving a UTF-8 encoded output of the Unicode codepoints. * * The Turtle escapes are \b \f \n \r \t \\ * \uXXXX \UXXXXXXXX where X is [A-F0-9] * * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@% * * Return value: new length or 0 on failure **/ size_t raptor_turtle_expand_qname_escapes(unsigned char *name, size_t len, raptor_simple_message_handler error_handler, void *error_data) { size_t i; const unsigned char *s; unsigned char *d; if(!name) return 0; for(s = name, d = name, i = 0; i < len; s++, i++) { unsigned char c=*s; if(c == '\\' ) { s++; i++; c = *s; if(c == 'n') *d++ = '\n'; else if(c == 'r') *d++ = '\r'; else if(c == 't') *d++ = '\t'; else if(c == 'b') *d++ = '\b'; else if(c == 'f') *d++ = '\f'; else if(c == '\\' || c == '-' || c == '_' || c == '~' || c == '.' || c == '!' || c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||c == '=' || c == '/' || c == '?' || c == '#' || c == '@' ||c == '%') *d++ = c; else if(c == 'u' || c == 'U') { size_t ulen = (c == 'u') ? 4 : 8; unsigned long unichar = 0; int n; int unichar_width; size_t ii; s++; i++; if(i+ulen > len) { error_handler(error_data, "Turtle name error - \\%c over end of line", c); return 0; } for(ii = 0; ii < ulen; ii++) { char cc = s[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { error_handler(error_data, "Turtle name error - illegal hex digit %c in Unicode escape '%c%s...'", cc, c, s); return 0; } } n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { error_handler(error_data, "Turtle name error - illegal Uncode escape '%c%s...'", c, s); return 0; } s+= ulen-1; i+= ulen-1; if(unichar > raptor_unicode_max_codepoint) { error_handler(error_data, "Turtle name error - illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint); return 0; } unichar_width = raptor_unicode_utf8_string_put_char(unichar, d, len - (d-name)); if(unichar_width < 0) { error_handler(error_data, "Turtle name error - illegal Unicode character with code point #x%lX.", unichar); return 0; } d += (size_t)unichar_width; } else { /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */ error_handler(error_data, "Turtle name error - illegal escape \\%c (#x%02X) in \"%s\"", c, c, name); } } else *d++ = c; } *d='\0'; /* calculate output string size */ len = d - name; /* string gets owned by the stringbuffer after this */ return len; }