diff options
Diffstat (limited to 'src/turtle_common.c')
-rw-r--r-- | src/turtle_common.c | 336 |
1 files changed, 336 insertions, 0 deletions
diff --git a/src/turtle_common.c b/src/turtle_common.c new file mode 100644 index 0000000..c822b34 --- /dev/null +++ b/src/turtle_common.c @@ -0,0 +1,336 @@ +/* -*- Mode: c; c-basic-offset: 2 -*- + * + * turtle_common.c - Raptor Turtle common code + * + * Copyright (C) 2003-2007, David Beckett http://www.dajobe.org/ + * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/ + * + * This package is Free Software and part of Redland http://librdf.org/ + * + * It is licensed under the following three licenses as alternatives: + * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version + * 2. GNU General Public License (GPL) V2 or any newer version + * 3. Apache License, V2.0 or any newer version + * + * You may not use this file except in compliance with at least one of + * the above three licenses. + * + * See LICENSE.html or LICENSE.txt at the top of this package for the + * complete terms and further detail along with the license texts for + * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. + * + * + */ + +#ifdef HAVE_CONFIG_H +#include <raptor_config.h> +#endif + +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <stdarg.h> +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +/* Raptor includes */ +#include "raptor2.h" +#include "raptor_internal.h" + +#include <turtle_parser.h> +#define YY_NO_UNISTD_H 1 +#define YYSTYPE TURTLE_PARSER_STYPE +#include <turtle_lexer.h> +#include <turtle_common.h> + +/** + * raptor_stringbuffer_append_turtle_string: + * @stringbuffer: String buffer to add to + * @text: turtle string to decode + * @len: length of string + * @delim: terminating delimiter for string - only ', " or > are allowed + * @error_handler: error handling function + * @error_data: error handler data + * + * Append to a stringbuffer a Turtle-escaped string. + * + * The passed in string is handled according to the Turtle string + * escape rules giving a UTF-8 encoded output of the Unicode codepoints. + * + * The Turtle escapes are \b \f \n \r \t \\ + * \uXXXX \UXXXXXXXX where X is [A-F0-9] + * + * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@% + * + * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E + * + * Return value: non-0 on failure + **/ +int +raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, + const unsigned char *text, + size_t len, int delim, + raptor_simple_message_handler error_handler, + void *error_data, + int is_uri) +{ + size_t i; + const unsigned char *s; + unsigned char *d; + unsigned char *string = RAPTOR_MALLOC(unsigned char*, len + 1); + const char* label = (is_uri ? "URI" : "string"); + + if(!string) + return -1; + + for(s = text, d = string, i = 0; i < len; s++, i++) { + unsigned char c=*s; + + if(c == ' ' && is_uri) { + error_handler(error_data, + "Turtle %s error - character '%c'", label, c); + RAPTOR_FREE(char*, string); + return 1; + } + + if(c == '\\' ) { + s++; i++; + c = *s; + if(c == 'n' || c == 'r' || c == 't' || c == 'b' || c == 'f') { + if(is_uri) { + error_handler(error_data, + "Turtle %s error - illegal URI escape '\\%c'", label, c); + RAPTOR_FREE(char*, string); + return 1; + } + if(c == 'n') + *d++ = '\n'; + else if(c == 'r') + *d++ = '\r'; + else if(c == 't') + *d++ = '\t'; + else if(c == 'b') + *d++ = '\b'; + else /* 'f' */ + *d++ = '\f'; + } else if(c == '\\' || c == delim || + c == '-' || c == '_' || c == '~' || c == '.' || c == '!' || + c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' || + c == '*' || c == '+' || c == ',' || c == ';' ||c == '=' || + c == '/' || c == '?' || c == '#' || c == '@' ||c == '%') + *d++ = c; + else if(c == 'u' || c == 'U') { + size_t ulen = (c == 'u') ? 4 : 8; + unsigned long unichar = 0; + int n; + int unichar_width; + size_t ii; + + s++; i++; + if(i+ulen > len) { + error_handler(error_data, + "Turtle %s error - \\%c over end of line", label, c); + RAPTOR_FREE(char*, string); + return 1; + } + + for(ii = 0; ii < ulen; ii++) { + char cc = s[ii]; + if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { + error_handler(error_data, + "Turtle %s error - illegal hex digit %c in Unicode escape '%c%s...'", + label, cc, c, s); + RAPTOR_FREE(char*, string); + return 1; + } + } + + n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); + if(n != 1) { + error_handler(error_data, + "Turtle %s error - illegal Unicode escape '%c%s...'", + label, c, s); + RAPTOR_FREE(char*, string); + return 1; + } + + s+= ulen-1; + i+= ulen-1; + + if(is_uri && (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) { + error_handler(error_data, + "Turtle %s error - illegal Unicode escape \\u%04lX in URI.", label, unichar); + break; + } + + if(unichar > raptor_unicode_max_codepoint) { + error_handler(error_data, + "Turtle %s error - illegal Unicode character with code point #x%lX (max #x%lX).", + label, unichar, raptor_unicode_max_codepoint); + RAPTOR_FREE(char*, string); + return 1; + } + + unichar_width = raptor_unicode_utf8_string_put_char(unichar, d, + len-(d-string)); + if(unichar_width < 0) { + error_handler(error_data, + "Turtle %s error - illegal Unicode character with code point #x%lX.", + label, unichar); + RAPTOR_FREE(char*, string); + return 1; + } + d += (size_t)unichar_width; + + } else { + /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */ + error_handler(error_data, + "Turtle %s error - illegal escape \\%c (#x%02X) in \"%s\"", + label, c, c, text); + } + } else + *d++=c; + } + *d='\0'; + + /* calculate output string size */ + len = d-string; + +#ifdef __clang_analyzer__ + /* clang --analyze does not know about ownership of next call */ + free(string); string = NULL; +#endif + /* string gets owned by the stringbuffer after this */ + return raptor_stringbuffer_append_counted_string(stringbuffer, + string, len, 0); + +} + + +/** + * raptor_turtle_expand_qname_escapes: + * @name: turtle qname string to decode + * @len: length of name + * @error_handler: error handling function + * @error_data: error handler data + * + * Expands Turtle escapes for the given turtle qname string + * + * The passed in string is handled according to the Turtle string + * escape rules giving a UTF-8 encoded output of the Unicode codepoints. + * + * The Turtle escapes are \b \f \n \r \t \\ + * \uXXXX \UXXXXXXXX where X is [A-F0-9] + * + * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@% + * + * Return value: new length or 0 on failure + **/ +size_t +raptor_turtle_expand_qname_escapes(unsigned char *name, + size_t len, + raptor_simple_message_handler error_handler, + void *error_data) +{ + size_t i; + const unsigned char *s; + unsigned char *d; + + if(!name) + return 0; + + for(s = name, d = name, i = 0; i < len; s++, i++) { + unsigned char c=*s; + + if(c == '\\' ) { + s++; i++; + c = *s; + if(c == 'n') + *d++ = '\n'; + else if(c == 'r') + *d++ = '\r'; + else if(c == 't') + *d++ = '\t'; + else if(c == 'b') + *d++ = '\b'; + else if(c == 'f') + *d++ = '\f'; + else if(c == '\\' || + c == '-' || c == '_' || c == '~' || c == '.' || c == '!' || + c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' || + c == '*' || c == '+' || c == ',' || c == ';' ||c == '=' || + c == '/' || c == '?' || c == '#' || c == '@' ||c == '%') + *d++ = c; + else if(c == 'u' || c == 'U') { + size_t ulen = (c == 'u') ? 4 : 8; + unsigned long unichar = 0; + int n; + int unichar_width; + size_t ii; + + s++; i++; + if(i+ulen > len) { + error_handler(error_data, + "Turtle name error - \\%c over end of line", c); + return 0; + } + + for(ii = 0; ii < ulen; ii++) { + char cc = s[ii]; + if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { + error_handler(error_data, + "Turtle name error - illegal hex digit %c in Unicode escape '%c%s...'", + cc, c, s); + return 0; + } + } + + n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); + if(n != 1) { + error_handler(error_data, + "Turtle name error - illegal Uncode escape '%c%s...'", + c, s); + return 0; + } + + s+= ulen-1; + i+= ulen-1; + + if(unichar > raptor_unicode_max_codepoint) { + error_handler(error_data, + "Turtle name error - illegal Unicode character with code point #x%lX (max #x%lX).", + unichar, raptor_unicode_max_codepoint); + return 0; + } + + unichar_width = raptor_unicode_utf8_string_put_char(unichar, d, + len - (d-name)); + if(unichar_width < 0) { + error_handler(error_data, + "Turtle name error - illegal Unicode character with code point #x%lX.", + unichar); + return 0; + } + d += (size_t)unichar_width; + + } else { + /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */ + error_handler(error_data, + "Turtle name error - illegal escape \\%c (#x%02X) in \"%s\"", + c, c, name); + } + } else + *d++ = c; + } + *d='\0'; + + /* calculate output string size */ + len = d - name; + + /* string gets owned by the stringbuffer after this */ + return len; +} |