summaryrefslogtreecommitdiffstats
path: root/src/turtle_common.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/turtle_common.c')
-rw-r--r--src/turtle_common.c336
1 files changed, 336 insertions, 0 deletions
diff --git a/src/turtle_common.c b/src/turtle_common.c
new file mode 100644
index 0000000..c822b34
--- /dev/null
+++ b/src/turtle_common.c
@@ -0,0 +1,336 @@
+/* -*- Mode: c; c-basic-offset: 2 -*-
+ *
+ * turtle_common.c - Raptor Turtle common code
+ *
+ * Copyright (C) 2003-2007, David Beckett http://www.dajobe.org/
+ * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ *
+ * See LICENSE.html or LICENSE.txt at the top of this package for the
+ * complete terms and further detail along with the license texts for
+ * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
+ *
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <raptor_config.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+/* Raptor includes */
+#include "raptor2.h"
+#include "raptor_internal.h"
+
+#include <turtle_parser.h>
+#define YY_NO_UNISTD_H 1
+#define YYSTYPE TURTLE_PARSER_STYPE
+#include <turtle_lexer.h>
+#include <turtle_common.h>
+
+/**
+ * raptor_stringbuffer_append_turtle_string:
+ * @stringbuffer: String buffer to add to
+ * @text: turtle string to decode
+ * @len: length of string
+ * @delim: terminating delimiter for string - only ', " or > are allowed
+ * @error_handler: error handling function
+ * @error_data: error handler data
+ * @is_uri: non-0 if @text is a URI; the stricter URI rules below then apply
+ *
+ * Append to a stringbuffer a Turtle-escaped string.
+ *
+ * The passed in string is handled according to the Turtle string
+ * escape rules giving a UTF-8 encoded output of the Unicode codepoints.
+ *
+ * The Turtle escapes are \b \f \n \r \t \\
+ * \uXXXX \UXXXXXXXX where X is [A-F0-9]
+ *
+ * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@%
+ *
+ * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E
+ *
+ * An escape that is not recognised at all is reported through
+ * @error_handler but is not fatal: the escape is dropped and decoding
+ * continues.  Every other error reported here aborts the decode and
+ * nothing is appended to @stringbuffer.
+ *
+ * Return value: non-0 on failure (-1 if the work buffer cannot be
+ * allocated, 1 on a decoding error, otherwise the result of appending
+ * the decoded string to @stringbuffer)
+ **/
+int
+raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer,
+                                         const unsigned char *text,
+                                         size_t len, int delim,
+                                         raptor_simple_message_handler error_handler,
+                                         void *error_data,
+                                         int is_uri)
+{
+  size_t i;
+  const unsigned char *s;
+  unsigned char *d;
+  /* The decoded form is never longer than the escaped input, so len
+   * bytes plus the terminating NUL is always enough. */
+  unsigned char *string = RAPTOR_MALLOC(unsigned char*, len + 1);
+  const char* label = (is_uri ? "URI" : "string");
+
+  if(!string)
+    return -1;
+
+  for(s = text, d = string, i = 0; i < len; s++, i++) {
+    unsigned char c = *s;
+
+    if(c == ' ' && is_uri) {
+      error_handler(error_data,
+                    "Turtle %s error - character '%c'", label, c);
+      goto failed;
+    }
+
+    if(c != '\\') {
+      /* ordinary character: copy through unchanged */
+      *d++ = c;
+      continue;
+    }
+
+    /* Step onto the character after the backslash.  A backslash that is
+     * the very last input byte has nothing to escape; stop here rather
+     * than reading text[len]. */
+    s++; i++;
+    if(i >= len) {
+      error_handler(error_data,
+                    "Turtle %s error - \\ over end of line", label);
+      goto failed;
+    }
+    c = *s;
+
+    if(c == 'n' || c == 'r' || c == 't' || c == 'b' || c == 'f') {
+      if(is_uri) {
+        error_handler(error_data,
+                      "Turtle %s error - illegal URI escape '\\%c'", label, c);
+        goto failed;
+      }
+      if(c == 'n')
+        *d++ = '\n';
+      else if(c == 'r')
+        *d++ = '\r';
+      else if(c == 't')
+        *d++ = '\t';
+      else if(c == 'b')
+        *d++ = '\b';
+      else /* 'f' */
+        *d++ = '\f';
+    } else if(c == '\\' || c == delim ||
+              c == '-' || c == '_' || c == '~' || c == '.' || c == '!' ||
+              c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' ||
+              c == '*' || c == '+' || c == ',' || c == ';' || c == '=' ||
+              c == '/' || c == '?' || c == '#' || c == '@' || c == '%') {
+      /* escaped punctuation stands for itself */
+      *d++ = c;
+    } else if(c == 'u' || c == 'U') {
+      size_t ulen = (c == 'u') ? 4 : 8;
+      unsigned long unichar = 0;
+      int n;
+      int unichar_width;
+      size_t ii;
+
+      /* move onto the first hex digit */
+      s++; i++;
+      if(i + ulen > len) {
+        error_handler(error_data,
+                      "Turtle %s error - \\%c over end of line", label, c);
+        goto failed;
+      }
+
+      /* Validate every digit up front so that sscanf() below cannot
+       * skip whitespace or accept a sign / 0x prefix. */
+      for(ii = 0; ii < ulen; ii++) {
+        /* unsigned char: <ctype.h> functions are undefined for negative
+         * values other than EOF */
+        unsigned char cc = s[ii];
+        if(!isxdigit(RAPTOR_GOOD_CAST(int, cc))) {
+          error_handler(error_data,
+                        "Turtle %s error - illegal hex digit %c in Unicode escape '%c%s...'",
+                        label, cc, c, s);
+          goto failed;
+        }
+      }
+
+      n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
+      if(n != 1) {
+        error_handler(error_data,
+                      "Turtle %s error - illegal Unicode escape '%c%s...'",
+                      label, c, s);
+        goto failed;
+      }
+
+      /* leave s on the last hex digit; the loop increment steps past it */
+      s += ulen - 1;
+      i += ulen - 1;
+
+      if(is_uri && (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) {
+        /* Fail outright: carrying on would append a silently truncated
+         * URI and report success. */
+        error_handler(error_data,
+                      "Turtle %s error - illegal Unicode escape \\u%04lX in URI.", label, unichar);
+        goto failed;
+      }
+
+      if(unichar > raptor_unicode_max_codepoint) {
+        error_handler(error_data,
+                      "Turtle %s error - illegal Unicode character with code point #x%lX (max #x%lX).",
+                      label, unichar, raptor_unicode_max_codepoint);
+        goto failed;
+      }
+
+      unichar_width = raptor_unicode_utf8_string_put_char(unichar, d,
+                                                          len - (d - string));
+      if(unichar_width < 0) {
+        error_handler(error_data,
+                      "Turtle %s error - illegal Unicode character with code point #x%lX.",
+                      label, unichar);
+        goto failed;
+      }
+      d += (size_t)unichar_width;
+
+    } else {
+      /* don't handle \x where x isn't one of: \t \n \r \\ (delim)
+       * Reported but deliberately not fatal: the escape is dropped. */
+      error_handler(error_data,
+                    "Turtle %s error - illegal escape \\%c (#x%02X) in \"%s\"",
+                    label, c, RAPTOR_GOOD_CAST(unsigned int, c), text);
+    }
+  }
+  *d = '\0';
+
+  /* calculate output string size */
+  len = d - string;
+
+#ifdef __clang_analyzer__
+  /* clang --analyze does not know about ownership of next call */
+  free(string); string = NULL;
+#endif
+  /* string gets owned by the stringbuffer after this */
+  return raptor_stringbuffer_append_counted_string(stringbuffer,
+                                                   string, len, 0);
+
+  failed:
+  RAPTOR_FREE(char*, string);
+  return 1;
+}
+
+
+/**
+ * raptor_turtle_expand_qname_escapes:
+ * @name: turtle qname string to decode (rewritten in place)
+ * @len: length of name
+ * @error_handler: error handling function
+ * @error_data: error handler data
+ *
+ * Expands Turtle escapes for the given turtle qname string
+ *
+ * The passed in string is handled according to the Turtle string
+ * escape rules giving a UTF-8 encoded output of the Unicode codepoints.
+ * Decoding is done in place: the result overwrites @name and is
+ * NUL-terminated, so the buffer must hold at least @len + 1 bytes.
+ *
+ * The Turtle escapes are \b \f \n \r \t \\
+ * \uXXXX \UXXXXXXXX where X is [A-F0-9]
+ *
+ * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@%
+ *
+ * An escape that is not recognised at all is reported through
+ * @error_handler but is not fatal: the escape is dropped and decoding
+ * continues.  On any other error @name may be left partially rewritten.
+ *
+ * Return value: new length or 0 on failure (a 0 return is therefore
+ * also what an input that decodes to the empty string produces)
+ **/
+size_t
+raptor_turtle_expand_qname_escapes(unsigned char *name,
+                                   size_t len,
+                                   raptor_simple_message_handler error_handler,
+                                   void *error_data)
+{
+  size_t i;
+  const unsigned char *s;
+  unsigned char *d;
+
+  if(!name)
+    return 0;
+
+  /* s reads and d writes the same buffer; d never overtakes s because
+   * every escape decodes to no more bytes than it occupied. */
+  for(s = name, d = name, i = 0; i < len; s++, i++) {
+    unsigned char c = *s;
+
+    if(c != '\\') {
+      /* ordinary character: copy through unchanged */
+      *d++ = c;
+      continue;
+    }
+
+    /* Step onto the character after the backslash.  A backslash that is
+     * the very last input byte has nothing to escape; stop here rather
+     * than reading name[len]. */
+    s++; i++;
+    if(i >= len) {
+      error_handler(error_data,
+                    "Turtle name error - \\ over end of line");
+      return 0;
+    }
+    c = *s;
+
+    if(c == 'n')
+      *d++ = '\n';
+    else if(c == 'r')
+      *d++ = '\r';
+    else if(c == 't')
+      *d++ = '\t';
+    else if(c == 'b')
+      *d++ = '\b';
+    else if(c == 'f')
+      *d++ = '\f';
+    else if(c == '\\' ||
+            c == '-' || c == '_' || c == '~' || c == '.' || c == '!' ||
+            c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' ||
+            c == '*' || c == '+' || c == ',' || c == ';' || c == '=' ||
+            c == '/' || c == '?' || c == '#' || c == '@' || c == '%') {
+      /* escaped punctuation stands for itself */
+      *d++ = c;
+    } else if(c == 'u' || c == 'U') {
+      size_t ulen = (c == 'u') ? 4 : 8;
+      unsigned long unichar = 0;
+      int n;
+      int unichar_width;
+      size_t ii;
+
+      /* move onto the first hex digit */
+      s++; i++;
+      if(i + ulen > len) {
+        error_handler(error_data,
+                      "Turtle name error - \\%c over end of line", c);
+        return 0;
+      }
+
+      /* Validate every digit up front so that sscanf() below cannot
+       * skip whitespace or accept a sign / 0x prefix. */
+      for(ii = 0; ii < ulen; ii++) {
+        /* unsigned char: <ctype.h> functions are undefined for negative
+         * values other than EOF */
+        unsigned char cc = s[ii];
+        if(!isxdigit(RAPTOR_GOOD_CAST(int, cc))) {
+          error_handler(error_data,
+                        "Turtle name error - illegal hex digit %c in Unicode escape '%c%s...'",
+                        cc, c, s);
+          return 0;
+        }
+      }
+
+      n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
+      if(n != 1) {
+        error_handler(error_data,
+                      "Turtle name error - illegal Unicode escape '%c%s...'",
+                      c, s);
+        return 0;
+      }
+
+      /* leave s on the last hex digit; the loop increment steps past it */
+      s += ulen - 1;
+      i += ulen - 1;
+
+      if(unichar > raptor_unicode_max_codepoint) {
+        error_handler(error_data,
+                      "Turtle name error - illegal Unicode character with code point #x%lX (max #x%lX).",
+                      unichar, raptor_unicode_max_codepoint);
+        return 0;
+      }
+
+      unichar_width = raptor_unicode_utf8_string_put_char(unichar, d,
+                                                          len - (d - name));
+      if(unichar_width < 0) {
+        error_handler(error_data,
+                      "Turtle name error - illegal Unicode character with code point #x%lX.",
+                      unichar);
+        return 0;
+      }
+      d += (size_t)unichar_width;
+
+    } else {
+      /* don't handle \x where x isn't one of: \t \n \r \\ (delim)
+       * Reported but deliberately not fatal: the escape is dropped. */
+      error_handler(error_data,
+                    "Turtle name error - illegal escape \\%c (#x%02X) in \"%s\"",
+                    c, RAPTOR_GOOD_CAST(unsigned int, c), name);
+    }
+  }
+  *d = '\0';
+
+  /* calculate output string size */
+  len = d - name;
+
+  /* the decoded name stays in the caller's buffer; only its new length
+   * is returned */
+  return len;
+}