summary refs log tree commit diff stats
path: root/src/raptor_librdfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/raptor_librdfa.c')
-rw-r--r-- src/raptor_librdfa.c 398
1 files changed, 398 insertions, 0 deletions
diff --git a/src/raptor_librdfa.c b/src/raptor_librdfa.c
new file mode 100644
index 0000000..60eded0
--- /dev/null
+++ b/src/raptor_librdfa.c
@@ -0,0 +1,398 @@
+/* -*- Mode: c; c-basic-offset: 2 -*-
+ *
+ * raptor_librdfa.c - Raptor RDFA Parser via librdfa implementation
+ *
+ * Copyright (C) 2008, David Beckett http://www.dajobe.org/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ *
+ * See LICENSE.html or LICENSE.txt at the top of this package for the
+ * complete terms and further detail along with the license texts for
+ * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
+ *
+ *
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <raptor_config.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+/* Raptor includes */
+#include "raptor2.h"
+#include "raptor_internal.h"
+
+#include "rdfa.h"
+#include "rdfa_utils.h"
+
+
+#define RAPTOR_DEFAULT_RDFA_VERSION 0
+
+/*
+ * RDFa parser object: per-parser state kept in rdf_parser->context
+ */
+typedef struct raptor_librdfa_parser_context_s {
+  /* librdfa context; created when a parse starts, freed on terminate */
+  rdfacontext* context;
+
+  /* static statement for use in passing to user code
+   * NOTE(review): the functions in this file fill in the parser's own
+   * statement (rdf_parser->statement) and never touch this field */
+  raptor_statement statement;
+
+  /* requested RDFa version: 10 for RDFa 1.0, 11 for RDFa 1.1;
+   * any other value means the default (== 1.1) */
+  int rdfa_version;
+} raptor_librdfa_parser_context;
+
+
+/*
+ * raptor_librdfa_parse_init - parser factory init method
+ * @rdf_parser: parser being initialised
+ * @name: syntax name the parser was created with (NULL is tolerated)
+ *
+ * Initialises the parser's statement and records which RDFa version was
+ * requested: "rdfa11" selects 11, "rdfa10" selects 10 and any other name
+ * (or NULL) keeps RAPTOR_DEFAULT_RDFA_VERSION.
+ *
+ * Return value: 0 (always)
+ */
+static int
+raptor_librdfa_parse_init(raptor_parser* rdf_parser, const char *name)
+{
+  raptor_librdfa_parser_context *librdfa_parser;
+  int rdfa_version = RAPTOR_DEFAULT_RDFA_VERSION;
+
+  librdfa_parser = (raptor_librdfa_parser_context*)rdf_parser->context;
+
+  raptor_statement_init(&rdf_parser->statement, rdf_parser->world);
+
+  /* strcmp() on a NULL pointer is undefined, so only inspect a real name */
+  if(name) {
+    if(!strcmp(name, "rdfa11"))
+      rdfa_version = 11;
+    else if(!strcmp(name, "rdfa10"))
+      rdfa_version = 10;
+  }
+
+  librdfa_parser->rdfa_version = rdfa_version;
+
+  return 0;
+}
+
+
+/*
+ * raptor_librdfa_parse_terminate - parser factory terminate method
+ * @rdf_parser: parser being destroyed
+ *
+ * If a librdfa context exists, calls rdfa_parse_end() on it, frees it
+ * and clears the stored pointer.  Does nothing when there is no context.
+ */
+static void
+raptor_librdfa_parse_terminate(raptor_parser* rdf_parser)
+{
+  raptor_librdfa_parser_context *pc;
+  rdfacontext *ctx;
+
+  pc = (raptor_librdfa_parser_context*)rdf_parser->context;
+  ctx = pc->context;
+
+  /* no context was created (or it was already released) */
+  if(!ctx)
+    return;
+
+  rdfa_parse_end(ctx);
+  rdfa_free_context(ctx);
+  pc->context = NULL;
+}
+
+
+/*
+ * raptor_librdfa_generate_statement - librdfa triple callback
+ * @triple: triple produced by librdfa; it is always released here with
+ *   rdfa_free_triple(), whether or not a statement is emitted
+ * @callback_data: the raptor_parser the triple belongs to
+ *
+ * Starts the default graph on the first triple, converts the triple's
+ * subject / predicate / object strings into raptor terms, stores them in
+ * the parser's statement and passes that to the user statement handler.
+ *
+ * The triple is dropped without calling the handler when:
+ *  - no statement handler is set
+ *  - any of subject, predicate or object is NULL (fatal in debug builds)
+ *  - the predicate starts with '_' (blank node predicate; warns)
+ *  - the object type is RDF_TYPE_NAMESPACE_PREFIX (fatal in debug builds)
+ *  - the object type is not recognised (logs an error)
+ *  - a URI or term could not be constructed
+ *
+ * The terms are only valid during the handler call: they are freed
+ * afterwards and the statement's pointers are reset to NULL.
+ */
+static void
+raptor_librdfa_generate_statement(rdftriple* triple, void* callback_data)
+{
+  raptor_parser* parser = (raptor_parser*)callback_data;
+  raptor_statement *s = &parser->statement;
+  raptor_term *subject_term = NULL;
+  raptor_term *predicate_term = NULL;
+  raptor_term *object_term = NULL;
+
+  if(!parser->emitted_default_graph) {
+    raptor_parser_start_graph(parser, NULL, 0);
+    parser->emitted_default_graph++;
+  }
+
+  if(!parser->statement_handler)
+    goto cleanup;
+
+  if(!triple->subject || !triple->predicate || !triple->object) {
+#ifdef RAPTOR_DEBUG
+    RAPTOR_FATAL1("Triple has NULL parts\n");
+#else
+    goto cleanup;
+#endif
+  }
+
+  if(triple->predicate[0] == '_') {
+    raptor_parser_warning(parser,
+                          "Ignoring RDFa triple with blank node predicate %s.",
+                          triple->predicate);
+    goto cleanup;
+  }
+
+  if(triple->object_type == RDF_TYPE_NAMESPACE_PREFIX) {
+#ifdef RAPTOR_DEBUG
+    RAPTOR_FATAL1("Triple has namespace object type\n");
+#else
+    goto cleanup;
+#endif
+  }
+
+  /* Subject: "_:id" is a blank node (term built from the text after the
+   * "_:" prefix), anything else is treated as a URI string */
+  if((triple->subject[0] == '_') && (triple->subject[1] == ':')) {
+    subject_term = raptor_new_term_from_blank(parser->world,
+                                              (const unsigned char*)triple->subject + 2);
+  } else {
+    raptor_uri* subject_uri;
+
+    subject_uri = raptor_new_uri(parser->world,
+                                 (const unsigned char*)triple->subject);
+    if(!subject_uri)
+      goto cleanup;
+
+    subject_term = raptor_new_term_from_uri(parser->world, subject_uri);
+    raptor_free_uri(subject_uri);
+  }
+  if(!subject_term)
+    goto cleanup;
+
+
+  /* Predicate: always a URI (blank node predicates were rejected above) */
+  {
+    raptor_uri* predicate_uri;
+
+    predicate_uri = raptor_new_uri(parser->world,
+                                   (const unsigned char*)triple->predicate);
+    if(!predicate_uri)
+      goto cleanup;
+
+    predicate_term = raptor_new_term_from_uri(parser->world, predicate_uri);
+    raptor_free_uri(predicate_uri);
+  }
+  if(!predicate_term)
+    goto cleanup;
+
+
+  /* Object: resource, plain literal, XML literal or typed literal */
+  if(triple->object_type == RDF_TYPE_IRI) {
+    if((triple->object[0] == '_') && (triple->object[1] == ':')) {
+      object_term = raptor_new_term_from_blank(parser->world,
+                                               (const unsigned char*)triple->object + 2);
+    } else {
+      raptor_uri* object_uri;
+
+      object_uri = raptor_new_uri(parser->world,
+                                  (const unsigned char*)triple->object);
+      if(!object_uri)
+        goto cleanup;
+
+      object_term = raptor_new_term_from_uri(parser->world, object_uri);
+      raptor_free_uri(object_uri);
+    }
+  } else if(triple->object_type == RDF_TYPE_PLAIN_LITERAL) {
+    object_term = raptor_new_term_from_literal(parser->world,
+                                               (const unsigned char*)triple->object,
+                                               NULL,
+                                               (const unsigned char*)triple->language);
+
+  } else if(triple->object_type == RDF_TYPE_XML_LITERAL) {
+    raptor_uri* datatype_uri;
+
+    datatype_uri = raptor_new_uri_from_counted_string(parser->world,
+                                                      (const unsigned char*)raptor_xml_literal_datatype_uri_string,
+                                                      raptor_xml_literal_datatype_uri_string_len);
+    /* without the datatype the literal would silently become a plain one */
+    if(!datatype_uri)
+      goto cleanup;
+
+    object_term = raptor_new_term_from_literal(parser->world,
+                                               (const unsigned char*)triple->object,
+                                               datatype_uri,
+                                               NULL);
+    raptor_free_uri(datatype_uri);
+  } else if(triple->object_type == RDF_TYPE_TYPED_LITERAL) {
+    raptor_uri *datatype_uri = NULL;
+    const unsigned char* language = (const unsigned char*)triple->language;
+
+    if(triple->datatype) {
+      /* If datatype, no language allowed */
+      language = NULL;
+      datatype_uri = raptor_new_uri(parser->world,
+                                    (const unsigned char*)triple->datatype);
+      if(!datatype_uri)
+        goto cleanup;
+    }
+
+    object_term = raptor_new_term_from_literal(parser->world,
+                                               (const unsigned char*)triple->object,
+                                               datatype_uri,
+                                               language);
+    if(datatype_uri)
+      raptor_free_uri(datatype_uri);
+  } else {
+    /* Report the type carried by the incoming triple.  The statement's
+     * object must not be read here: it has not been set for this triple
+     * and is either NULL or left over from an earlier call. */
+    raptor_log_error_formatted(parser->world, RAPTOR_LOG_LEVEL_ERROR, NULL,
+                               "Triple has unknown object term type %u",
+                               (unsigned int)triple->object_type);
+    goto cleanup;
+  }
+  if(!object_term)
+    goto cleanup;
+
+  /* All three terms exist: fill in the statement and hand it over */
+  s->subject = subject_term;
+  s->predicate = predicate_term;
+  s->object = object_term;
+
+  /* Generate statement */
+  (*parser->statement_handler)(parser->user_data, s);
+
+ cleanup:
+  rdfa_free_triple(triple);
+
+  if(subject_term)
+    raptor_free_term(subject_term);
+  if(predicate_term)
+    raptor_free_term(predicate_term);
+  if(object_term)
+    raptor_free_term(object_term);
+
+  /* The terms above are gone; do not leave pointers to them behind in
+   * the parser's statement */
+  s->subject = NULL;
+  s->predicate = NULL;
+  s->object = NULL;
+}
+
+
+/*
+ * Namespace callback: forwards a newly declared namespace to the raptor
+ * parser that was supplied as @user_data.
+ */
+static void
+raptor_librdfa_sax2_new_namespace_handler(void *user_data,
+                                          raptor_namespace* nspace)
+{
+  raptor_parser_start_namespace((raptor_parser*)user_data, nspace);
+}
+
+
+
+/*
+ * raptor_librdfa_parse_start - parser factory start method
+ * @rdf_parser: parser about to receive content
+ *
+ * Resets the locator, (re)creates the librdfa context for the parser's
+ * base URI, wires the raptor callbacks and version into it and starts
+ * the librdfa parse.
+ *
+ * Return value: 0 on success; 1 if there is no base URI, the context
+ * could not be created or rdfa_parse_start() did not succeed
+ */
+static int
+raptor_librdfa_parse_start(raptor_parser* rdf_parser)
+{
+  raptor_librdfa_parser_context *pc;
+  rdfacontext *ctx;
+  char *base;
+
+  pc = (raptor_librdfa_parser_context*)rdf_parser->context;
+
+  rdf_parser->locator.line = -1;
+  rdf_parser->locator.column = -1;
+  rdf_parser->locator.byte = 0;
+
+  /* base URI is required for rdfa - checked in rdfa_create_context() */
+  if(!rdf_parser->base_uri)
+    return 1;
+  base = (char*)raptor_uri_as_string(rdf_parser->base_uri);
+
+  /* throw away the context of any earlier parse before making a new one */
+  if(pc->context)
+    rdfa_free_context(pc->context);
+  ctx = rdfa_create_context(base);
+  pc->context = ctx;
+  if(!ctx)
+    return 1;
+
+  ctx->namespace_handler = raptor_librdfa_sax2_new_namespace_handler;
+  ctx->namespace_handler_user_data = rdf_parser;
+  ctx->world = rdf_parser->world;
+  ctx->locator = &rdf_parser->locator;
+
+  ctx->callback_data = rdf_parser;
+  /* returns triples */
+  rdfa_set_default_graph_triple_handler(ctx,
+                                        raptor_librdfa_generate_statement);
+
+  /* returns RDFa Processing Graph error triples - not used by raptor */
+  rdfa_set_processor_graph_triple_handler(ctx, NULL);
+
+  ctx->raptor_rdfa_version = pc->rdfa_version;
+
+  return (rdfa_parse_start(ctx) != RDFA_PARSE_SUCCESS) ? 1 : 0;
+}
+
+
+/*
+ * raptor_librdfa_parse_chunk - parser factory chunk method
+ * @rdf_parser: parser
+ * @s: content bytes
+ * @len: number of bytes at @s
+ * @is_end: non-0 if this is the last chunk
+ *
+ * Passes the bytes to rdfa_parse_chunk().  On the last chunk, ends the
+ * default graph if one was started.
+ *
+ * Return value: 0 if librdfa reported RDFA_PARSE_SUCCESS, otherwise 1
+ */
+static int
+raptor_librdfa_parse_chunk(raptor_parser* rdf_parser,
+                           const unsigned char *s, size_t len,
+                           int is_end)
+{
+  raptor_librdfa_parser_context *pc;
+  int failed;
+
+  pc = (raptor_librdfa_parser_context*)rdf_parser->context;
+
+  failed = (rdfa_parse_chunk(pc->context, (char*)s, len, is_end)
+            != RDFA_PARSE_SUCCESS);
+
+  /* close the default graph once, after the final chunk */
+  if(is_end && rdf_parser->emitted_default_graph) {
+    raptor_parser_end_graph(rdf_parser, NULL, 0);
+    rdf_parser->emitted_default_graph--;
+  }
+
+  return failed;
+}
+
+/*
+ * raptor_librdfa_parse_recognise_syntax - parser factory syntax scoring
+ * @factory: parser factory (not used)
+ * @buffer: start of the content, or NULL
+ * @len: number of bytes at @buffer
+ * @identifier: content identifier, or NULL
+ * @suffix: identifier suffix (not used)
+ * @mime_type: content MIME type (not used)
+ *
+ * Return value: 10 if @identifier contains "RDFa" or if @buffer contains
+ * either of the XHTML+RDFa 1.0 marker strings below, otherwise 0
+ */
+static int
+raptor_librdfa_parse_recognise_syntax(raptor_parser_factory* factory,
+                                      const unsigned char *buffer, size_t len,
+                                      const unsigned char *identifier,
+                                      const unsigned char *suffix,
+                                      const char *mime_type)
+{
+  static const char rdfa_doctype_marker[] = "-//W3C//DTD XHTML+RDFa 1.0//EN";
+  static const char rdfa_dtd_marker[] = "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd";
+  int recognised = 0;
+
+  if(identifier && strstr((const char*)identifier, "RDFa"))
+    recognised = 1;
+
+  if(buffer && len) {
+    if(raptor_memstr((const char*)buffer, len, rdfa_doctype_marker) != NULL ||
+       raptor_memstr((const char*)buffer, len, rdfa_dtd_marker) != NULL)
+      recognised = 1;
+  }
+
+  return recognised ? 10 : 0;
+}
+
+
+/* Syntax names this parser is registered under, NULL-terminated.
+ * "rdfa11" and "rdfa10" are the names the init method checks for when
+ * choosing the RDFa version. */
+static const char* const rdfa_names[] = {
+  "rdfa",
+  "rdfa11",
+  "rdfa10",
+  NULL
+};
+
+/* URIs identifying the syntax, NULL-terminated */
+static const char* const rdfa_uri_strings[] = {
+  "http://www.w3.org/ns/formats/RDFa",
+  "http://www.w3.org/TR/rdfa/",
+  NULL
+};
+
+/* MIME types handled, ended by a { NULL, 0, 0 } entry.  The second value
+ * of each entry is the length of the type string; the third is presumably
+ * the preference (q) value - TODO confirm against raptor_type_q. */
+#define RDFA_TYPES_COUNT 2
+static const raptor_type_q html_types[RDFA_TYPES_COUNT + 1] = {
+  { "text/html", 9, 6},
+  { "application/xhtml+xml", 21, 8},
+  { NULL, 0, 0}
+};
+
+/*
+ * raptor_librdfa_parser_register_factory - fill in the RDFa parser factory
+ * @factory: factory structure to populate
+ *
+ * Sets the syntax description (names, MIME types, label, URIs, flags),
+ * the size of the per-parser context and the parser method pointers.
+ *
+ * Return value: 0 (always)
+ */
+static int
+raptor_librdfa_parser_register_factory(raptor_parser_factory *factory)
+{
+  /* syntax description */
+  factory->desc.names = rdfa_names;
+  factory->desc.mime_types = html_types;
+  factory->desc.label = "RDF/A via librdfa";
+  factory->desc.uri_strings = rdfa_uri_strings;
+  factory->desc.flags = RAPTOR_SYNTAX_NEED_BASE_URI;
+
+  /* per-parser state */
+  factory->context_length = sizeof(raptor_librdfa_parser_context);
+
+  /* parser methods */
+  factory->init = raptor_librdfa_parse_init;
+  factory->terminate = raptor_librdfa_parse_terminate;
+  factory->start = raptor_librdfa_parse_start;
+  factory->chunk = raptor_librdfa_parse_chunk;
+  factory->recognise_syntax = raptor_librdfa_parse_recognise_syntax;
+
+  return 0;
+}
+
+
+/*
+ * raptor_init_parser_rdfa - register the RDFa parser with a world
+ * @world: raptor world
+ *
+ * Return value: 0 if raptor_world_register_parser_factory() returned a
+ * non-zero/non-NULL result, otherwise 1
+ */
+int
+raptor_init_parser_rdfa(raptor_world* world)
+{
+  if(raptor_world_register_parser_factory(world,
+                                          &raptor_librdfa_parser_register_factory))
+    return 0;
+
+  return 1;
+}