diff options
Diffstat (limited to 'src/raptor_grddl.c')
-rw-r--r-- | src/raptor_grddl.c | 2131 |
1 files changed, 2131 insertions, 0 deletions
diff --git a/src/raptor_grddl.c b/src/raptor_grddl.c new file mode 100644 index 0000000..b54d8c8 --- /dev/null +++ b/src/raptor_grddl.c @@ -0,0 +1,2131 @@ +/* -*- Mode: c; c-basic-offset: 2 -*- + * + * raptor_grddl.c - Raptor GRDDL (+microformats) Parser implementation + * + * Copyright (C) 2005-2010, David Beckett http://www.dajobe.org/ + * Copyright (C) 2005, University of Bristol, UK http://www.bristol.ac.uk/ + * + * This package is Free Software and part of Redland http://librdf.org/ + * + * It is licensed under the following three licenses as alternatives: + * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version + * 2. GNU General Public License (GPL) V2 or any newer version + * 3. Apache License, V2.0 or any newer version + * + * You may not use this file except in compliance with at least one of + * the above three licenses. + * + * See LICENSE.html or LICENSE.txt at the top of this package for the + * complete terms and further detail along with the license texts for + * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. + * + */ + +/* + * Specifications: + * Gleaning Resource Descriptions from Dialects of Languages (GRDDL) + * W3C Recommendation 11 September 2007 + * http://www.w3.org/TR/2007/REC-grddl-20070911/ + * http://www.w3.org/TR/grddl/ + * + */ + + +#ifdef HAVE_CONFIG_H +#include <raptor_config.h> +#endif + +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <stdarg.h> +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +/* Raptor includes */ +#include "raptor2.h" +#include "raptor_internal.h" + +#include <libxml/xpath.h> +/* for xmlXPathRegisterNs() */ +#include <libxml/xpathInternals.h> +#include <libxml/xinclude.h> +#include <libxml/HTMLparser.h> + +#include <libxslt/xslt.h> +#include <libxslt/transform.h> +#include <libxslt/xsltutils.h> +#include <libxslt/security.h> + + +/* + * libxslt API notes + * + * Inputs to an XSLT transformation process with libxslt are: + * 1. A set of (key:value) parameters. + * 2. An xsltStylesheetPtr for the XSLT sheet + * Which could be made from a file or an xmlDoc; and the xmlDoc. + * made from a file or memory buffer. + * 3. An xmlDoc for the XML source + * Which could be made from a file or a memory buffer. + * + */ + + +static void raptor_grddl_filter_triples(void *user_data, raptor_statement *statement); + +static void raptor_grddl_xsltGenericError_handler(void *user_data, const char *msg, ...) RAPTOR_PRINTF_FORMAT(2, 0); + +static void raptor_libxslt_set_global_state(raptor_parser *rdf_parser); +static void raptor_libxslt_reset_global_state(raptor_parser *rdf_parser); + + +typedef struct +{ + /* transformation (XSLT) or profile URI */ + raptor_uri* uri; + /* base URI in effect when the above was found */ + raptor_uri* base_uri; +} grddl_xml_context; + + +/* + * XSLT parser object + */ +struct raptor_grddl_parser_context_s { + raptor_world* world; + raptor_parser* rdf_parser; + + /* HTML document ctxt */ + htmlParserCtxtPtr html_ctxt; + /* XML document ctxt */ + xmlParserCtxtPtr xml_ctxt; + + /* Create xpath evaluation context */ + xmlXPathContextPtr xpathCtx; + + /* parser for dealing with the result */ + raptor_parser* internal_parser; + /* ... constructed with this name */ + const char* internal_parser_name; + + /* URI of root namespace of document */ + raptor_uri* root_ns_uri; + + /* List of transformation URIs for document */ + raptor_sequence* doc_transform_uris; + + /* Copy of the user data statement_handler overwritten to point to + * raptor_grddl_filter_triples() + */ + void* saved_user_data; + raptor_statement_handler saved_statement_handler; + + /* URI data-view:namespaceTransformation */ + raptor_uri* namespace_transformation_uri; + + /* URI data-view:profileTransformation */ + raptor_uri* profile_transformation_uri; + + /* List of namespace / <head profile> URIs */ + raptor_sequence* profile_uris; + + /* List of visited URIs */ + raptor_sequence* visited_uris; + + /* Depth of GRDDL parsers - 0 means that the lists above + * are owned by this parser: visited_uris + * */ + int grddl_depth; + + /* Content-Type of top-level document */ + char* content_type; + + /* Check content type once */ + int content_type_check; + + /* stringbuffer to use to store retrieved document */ + raptor_stringbuffer* sb; + + /* non-0 to perform an additional RDF/XML parse on a retrieved document + * because it has been identified as RDF/XML. */ + int process_this_as_rdfxml; + + /* non-0 to perform GRDL processing on document */ + int grddl_processing; + + /* non-0 to perform XML Include processing on document */ + int xinclude_processing; + + /* non-0 to perform HTML Base processing on document */ + int html_base_processing; + + /* non-0 to perform HTML <link> processing on document */ + int html_link_processing; + + xmlGenericErrorFunc saved_xsltGenericError; + void *saved_xsltGenericErrorContext; + + xsltSecurityPrefsPtr saved_xsltSecurityPrefs; +}; + + +typedef struct raptor_grddl_parser_context_s raptor_grddl_parser_context; + + +static void +raptor_grddl_xsltGenericError_handler(void *user_data, const char *msg, ...) +{ + raptor_parser* rdf_parser = (raptor_parser*)user_data; + va_list arguments; + size_t msg_len; + size_t length; + char *nmsg; + + if(!msg || *msg == '\n') + return; + + va_start(arguments, msg); + + msg_len = strlen(msg); + +#define PREFIX "libxslt error: " +#define PREFIX_LENGTH 15 + length = PREFIX_LENGTH + msg_len + 1; + nmsg = RAPTOR_MALLOC(char*, length); + if(nmsg) { + memcpy(nmsg, PREFIX, PREFIX_LENGTH); + memcpy(nmsg + PREFIX_LENGTH, msg, msg_len + 1); + if(nmsg[length-1] == '\n') + nmsg[length-1] = '\0'; + } + + PRAGMA_IGNORE_WARNING_FORMAT_NONLITERAL_START + raptor_parser_log_error_varargs(rdf_parser, RAPTOR_LOG_LEVEL_ERROR, + nmsg ? nmsg : msg, arguments); + PRAGMA_IGNORE_WARNING_END + + if(nmsg) + RAPTOR_FREE(char*, nmsg); + + va_end(arguments); +} + + +static grddl_xml_context* +raptor_new_xml_context(raptor_world* world, raptor_uri* uri, + raptor_uri* base_uri) +{ + grddl_xml_context* xml_context; + + xml_context = RAPTOR_MALLOC(grddl_xml_context*, sizeof(*xml_context)); + if(uri) + uri = raptor_uri_copy(uri); + if(base_uri) + base_uri = raptor_uri_copy(base_uri); + xml_context->uri = uri; + xml_context->base_uri = base_uri; + + return xml_context; +} + + +static void +grddl_free_xml_context(void* userdata) +{ + grddl_xml_context* xml_context = (grddl_xml_context*)userdata; + + if(xml_context->uri) + raptor_free_uri(xml_context->uri); + if(xml_context->base_uri) + raptor_free_uri(xml_context->base_uri); + RAPTOR_FREE(grddl_xml_context, xml_context); +} + + +static int +raptor_grddl_parse_init_common(raptor_parser* rdf_parser, const char *name) +{ + raptor_grddl_parser_context* grddl_parser; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + grddl_parser->world = rdf_parser->world; + grddl_parser->rdf_parser = rdf_parser; + + /* Sequence of grddl_xml_context* URIs of XSLT sheets to transform + * the document */ + grddl_parser->doc_transform_uris = raptor_new_sequence((raptor_data_free_handler)grddl_free_xml_context, NULL); + + grddl_parser->grddl_processing = 1; + grddl_parser->xinclude_processing = 1; + grddl_parser->html_base_processing = 0; + grddl_parser->html_link_processing = 1; + + return 0; +} + + +/* 58 == strlen(grddl_namespaceTransformation_uri_string) */ +#define GRDDL_NAMESPACETRANSFORMATION_URI_STRING_LEN 58 +static const unsigned char * const grddl_namespaceTransformation_uri_string = (const unsigned char*)"http://www.w3.org/2003/g/data-view#namespaceTransformation"; + +/* 56 == strlen(grddl_profileTransformation_uri_string) */ +#define GRDDL_PROFILETRANSFORMATION_URI_STRING_LEN 56 +static const unsigned char * const grddl_profileTransformation_uri_string = (const unsigned char*)"http://www.w3.org/2003/g/data-view#profileTransformation"; + + +static int +raptor_grddl_parse_init(raptor_parser* rdf_parser, const char *name) +{ + raptor_grddl_parser_context* grddl_parser; + raptor_world* world = rdf_parser->world; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + raptor_grddl_parse_init_common(rdf_parser, name); + + /* Sequence of URIs from <head profile> */ + grddl_parser->profile_uris = raptor_new_sequence((raptor_data_free_handler)grddl_free_xml_context, NULL); + + grddl_parser->namespace_transformation_uri = raptor_new_uri_from_counted_string(world, grddl_namespaceTransformation_uri_string, GRDDL_NAMESPACETRANSFORMATION_URI_STRING_LEN); + grddl_parser->profile_transformation_uri = raptor_new_uri_from_counted_string(world, grddl_profileTransformation_uri_string, GRDDL_PROFILETRANSFORMATION_URI_STRING_LEN); + + /* Sequence of URIs visited - may be overwritten if this is not + * the depth 0 grddl parser + */ + grddl_parser->visited_uris = raptor_new_sequence((raptor_data_free_handler)raptor_free_uri, (raptor_data_print_handler)raptor_uri_print); + + return 0; +} + + +static void +raptor_grddl_parse_terminate(raptor_parser *rdf_parser) +{ + raptor_grddl_parser_context *grddl_parser; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + if(grddl_parser->xml_ctxt) { + if(grddl_parser->xml_ctxt->myDoc) { + xmlFreeDoc(grddl_parser->xml_ctxt->myDoc); + grddl_parser->xml_ctxt->myDoc = NULL; + } + xmlFreeParserCtxt(grddl_parser->xml_ctxt); + } + + if(grddl_parser->html_ctxt) { + if(grddl_parser->html_ctxt->myDoc) { + xmlFreeDoc(grddl_parser->html_ctxt->myDoc); + grddl_parser->html_ctxt->myDoc = NULL; + } + htmlFreeParserCtxt(grddl_parser->html_ctxt); + } + + if(grddl_parser->xpathCtx) + xmlXPathFreeContext(grddl_parser->xpathCtx); + + if(grddl_parser->internal_parser) + raptor_free_parser(grddl_parser->internal_parser); + + if(grddl_parser->root_ns_uri) + raptor_free_uri(grddl_parser->root_ns_uri); + + if(grddl_parser->doc_transform_uris) + raptor_free_sequence(grddl_parser->doc_transform_uris); + + if(grddl_parser->profile_uris) + raptor_free_sequence(grddl_parser->profile_uris); + + if(grddl_parser->namespace_transformation_uri) + raptor_free_uri(grddl_parser->namespace_transformation_uri); + + if(grddl_parser->profile_transformation_uri) + raptor_free_uri(grddl_parser->profile_transformation_uri); + + if(!grddl_parser->grddl_depth) { + if(grddl_parser->visited_uris) + raptor_free_sequence(grddl_parser->visited_uris); + } + + if(grddl_parser->content_type) + RAPTOR_FREE(char*, grddl_parser->content_type); + + if(grddl_parser->sb) + raptor_free_stringbuffer(grddl_parser->sb); +} + + +static void +raptor_grddl_parser_add_parent(raptor_parser *rdf_parser, + raptor_grddl_parser_context* parent_grddl_parser) +{ + raptor_grddl_parser_context* grddl_parser; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + /* Do not set parent twice */ + if(grddl_parser->visited_uris == parent_grddl_parser->visited_uris) + return; + + /* free any sequence here */ + if(grddl_parser->visited_uris) + raptor_free_sequence(grddl_parser->visited_uris); + + /* share parent's list and do not free it here */ + grddl_parser->visited_uris = parent_grddl_parser->visited_uris; + grddl_parser->grddl_depth = parent_grddl_parser->grddl_depth + 1; + + grddl_parser->saved_user_data = parent_grddl_parser->rdf_parser; + grddl_parser->saved_statement_handler = raptor_grddl_filter_triples; +} + + + +static int +raptor_grddl_parse_start(raptor_parser *rdf_parser) +{ + raptor_grddl_parser_context* grddl_parser; + raptor_locator *locator = &rdf_parser->locator; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + locator->line = 1; + + grddl_parser->content_type_check = 0; + grddl_parser->process_this_as_rdfxml = 0; + + return 0; +} + + +#define MATCH_IS_VALUE_LIST 1 +#define MATCH_IS_PROFILE 2 +#define MATCH_IS_HARDCODED 4 +/* stop looking for other hardcoded matches */ +#define MATCH_LAST 8 +static struct { + const xmlChar* xpath; + int flags; + const xmlChar* xslt_sheet_uri; +} match_table[]={ + /* XHTML document where the GRDDL profile is in + * <link ref='transform' href='url'> inside the html <head> + * Value of @rel is a space-separated list of link types. + */ + { + (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/html:link[contains(@rel,\"transformation\")]/@href", + 0, + NULL + } + , + /* XHTML document where the GRDDL profile is in + * <a rel='transform' href='url'> inside the html <body> + * Value of @rel is a space-separated list of link types. + */ + { + (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/../..//html:a[contains(@rel,\"transformation\")]/@href", + 0, + NULL + } + , + /* XML document linking to transform via attribute dataview:transformation + * on the root element. + * Example: http://www.w3.org/2004/01/rdxh/grddl-p3p-example + **/ + { + (const xmlChar*)"/*/@dataview:transformation", + MATCH_IS_VALUE_LIST, + NULL + } + , + /* hCalendar microformat http://microformats.org/wiki/hcalendar */ + { + (const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' vevent ')]", + MATCH_IS_HARDCODED, + (const xmlChar*)"http://www.w3.org/2002/12/cal/glean-hcal.xsl" + } + , + /* hReview microformat http://microformats.org/wiki/review */ + { + (const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' hreview ')]", + MATCH_IS_HARDCODED | MATCH_LAST, /* stop here since hCard is inside hReview */ + (const xmlChar*)"http://www.w3.org/2001/sw/grddl-wg/doc29/hreview2rdfxml.xsl" + } + , + /* hCard microformat http://microformats.org/wiki/hcard */ + { + (const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' vcard ')]", + MATCH_IS_HARDCODED, + (const xmlChar*)"http://www.w3.org/2006/vcard/hcard2rdf.xsl" + } + , + { + NULL, + 0, + NULL + } +}; + + +static const char* const grddl_namespace_uris_ignore_list[] = { + "http://www.w3.org/1999/xhtml", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "http://www.w3.org/2001/XMLSchema", + NULL +}; + + +/* add URI to XSLT transformation URI list */ +static void +raptor_grddl_add_transform_xml_context(raptor_grddl_parser_context* grddl_parser, + grddl_xml_context* xml_context) +{ + int i; + raptor_uri* uri = xml_context->uri; + int size; + +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG2("Found document transformation URI '%s'\n", + raptor_uri_as_string(uri)); +#endif + + size = raptor_sequence_size(grddl_parser->doc_transform_uris); + for(i = 0; i < size; i++) { + grddl_xml_context* xc; + xc = (grddl_xml_context*)raptor_sequence_get_at(grddl_parser->doc_transform_uris, i); + if(raptor_uri_equals(uri, xc->uri)) { +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG2("Already seen XSLT URI '%s'\n", raptor_uri_as_string(uri)); +#endif + grddl_free_xml_context(xml_context); + return; + } + } + + RAPTOR_DEBUG3("Adding new document transformation XSLT URI %s with base URI %s\n", + (uri ? (const char*)raptor_uri_as_string(uri): "(NONE)"), + (xml_context->base_uri ? (const char*)raptor_uri_as_string(xml_context->base_uri) : "(NONE)")); + + raptor_sequence_push(grddl_parser->doc_transform_uris, xml_context); +} + + +static void +raptor_grddl_filter_triples(void *user_data, raptor_statement *statement) +{ + raptor_parser* rdf_parser = (raptor_parser*)user_data; + raptor_grddl_parser_context* grddl_parser; + int i; + raptor_uri* predicate_uri; + int size; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + /* Look for a triple <uri> <uri> <uri> */ + if(statement->subject->type != RAPTOR_TERM_TYPE_URI || + statement->predicate->type != RAPTOR_TERM_TYPE_URI || + statement->object->type != RAPTOR_TERM_TYPE_URI) + return; + +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 2 + RAPTOR_DEBUG2("Parser %p: Relaying statement: ", RAPTOR_VOIDP(rdf_parser)); + raptor_statement_print(statement, stderr); + fputc('\n', stderr); +#endif + +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG3("Parser %p: Checking against %d profile URIs\n", + RAPTOR_VOIDP(rdf_parser), + raptor_sequence_size(grddl_parser->profile_uris)); +#endif + + /* Look for(i = 0, root namespace URI) + * <document-root-element-namespace-URI> data-view:namespaceTransformation ?tr + * or (i>0, profile URIs) + * <document-root-element-namespace-URI> data-view:profileTransformation ?tr + * and then ?tr becomes a new document transformation URI + */ + predicate_uri = grddl_parser->namespace_transformation_uri; + size = raptor_sequence_size(grddl_parser->profile_uris); + for(i = 0; i < size; i++) { + grddl_xml_context* xml_context; + raptor_uri* profile_uri; + grddl_xml_context* new_xml_context; + + xml_context = (grddl_xml_context*)raptor_sequence_get_at(grddl_parser->profile_uris, i); + profile_uri = xml_context->uri; + + if(i == 1) + predicate_uri = grddl_parser->profile_transformation_uri; + + if(!profile_uri) + continue; + + if(raptor_uri_equals(statement->subject->value.uri, profile_uri) && + raptor_uri_equals(statement->predicate->value.uri, predicate_uri)) { + raptor_uri* uri = statement->object->value.uri; + +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG4("Parser %p: Matches profile URI #%d '%s'\n", + RAPTOR_VOIDP(rdf_parser), + i, raptor_uri_as_string(profile_uri)); +#endif + + new_xml_context = raptor_new_xml_context(rdf_parser->world, uri, + rdf_parser->base_uri); + raptor_grddl_add_transform_xml_context(grddl_parser, new_xml_context); + } else { +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG4("Parser %p: Failed to match profile URI #%d '%s'\n", + RAPTOR_VOIDP(rdf_parser), + i, raptor_uri_as_string(profile_uri)); +#endif + } + + } + +} + + +static int +raptor_grddl_ensure_internal_parser(raptor_parser* rdf_parser, + const char* parser_name, int filter) +{ + raptor_grddl_parser_context* grddl_parser; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + if(!grddl_parser->internal_parser_name || + !strcmp(parser_name, "guess") || + strcmp(grddl_parser->internal_parser_name, parser_name)) { + /* construct a new parser if none in use or not what is required */ + if(grddl_parser->internal_parser) { + unsigned int our_emit_flags = rdf_parser->emit_graph_marks; + + /* copy back bit flags from parser about to be destroyed */ + raptor_parser_copy_flags_state(rdf_parser, + grddl_parser->internal_parser); + + /* restore whatever graph makrs state we had here */ + rdf_parser->emit_graph_marks = our_emit_flags ? 1 : 0; + + RAPTOR_DEBUG3("Parser %p: Freeing internal %s parser.\n", + RAPTOR_VOIDP(rdf_parser), + grddl_parser->internal_parser_name); + + raptor_free_parser(grddl_parser->internal_parser); + grddl_parser->internal_parser = NULL; + grddl_parser->internal_parser_name = NULL; + } + + RAPTOR_DEBUG3("Parser %p: Allocating new internal %s parser.\n", + RAPTOR_VOIDP(rdf_parser), parser_name); + grddl_parser->internal_parser = raptor_new_parser(rdf_parser->world, + parser_name); + if(!grddl_parser->internal_parser) { + raptor_parser_error(rdf_parser, "Failed to create %s parser", + parser_name); + return 1; + } + + /* initialise the new parser with the outer state */ + grddl_parser->internal_parser_name = parser_name; + if(raptor_parser_copy_user_state(grddl_parser->internal_parser, rdf_parser)) + return 1; + + /* Disable graph marks in newly constructed internal parser */ + grddl_parser->internal_parser->emit_graph_marks = 0; + + grddl_parser->saved_user_data = rdf_parser->user_data; + grddl_parser->saved_statement_handler = rdf_parser->statement_handler; + } + + /* Filter the triples for profile/namespace URIs */ + if(filter) { + grddl_parser->internal_parser->user_data = rdf_parser; + grddl_parser->internal_parser->statement_handler = raptor_grddl_filter_triples; + } else { + grddl_parser->internal_parser->user_data = grddl_parser->saved_user_data; + grddl_parser->internal_parser->statement_handler = grddl_parser->saved_statement_handler; + } + + return 0; +} + + +/* Run a GRDDL transform using a pre-parsed XSLT stylesheet already + * formed into a libxml document (with URI) + */ +static int +raptor_grddl_run_grddl_transform_doc(raptor_parser* rdf_parser, + grddl_xml_context* xml_context, + xmlDocPtr xslt_doc, + xmlDocPtr doc) +{ + raptor_world* world = rdf_parser->world; + raptor_grddl_parser_context* grddl_parser; + int ret = 0; + xsltStylesheetPtr sheet = NULL; + xmlDocPtr res = NULL; + xmlChar *doc_txt = NULL; + int doc_txt_len = 0; + const char* parser_name; + const char* params[7]; + const unsigned char* base_uri_string; + size_t base_uri_len; + raptor_uri* xslt_uri; + raptor_uri* base_uri; + char *quoted_base_uri = NULL; + xsltTransformContextPtr userCtxt = NULL; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + xslt_uri = xml_context->uri; + base_uri = xml_context->base_uri ? xml_context->base_uri : xml_context->uri; + + base_uri_string = raptor_uri_as_counted_string(base_uri, &base_uri_len); + + RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI '%s' with doc base URI '%s'\n", + raptor_uri_as_string(xslt_uri), + base_uri_string); + + raptor_libxslt_set_global_state(rdf_parser); + + /* This calls xsltGetDefaultSecurityPrefs() */ + sheet = xsltParseStylesheetDoc(xslt_doc); + if(!sheet) { + raptor_parser_error(rdf_parser, "Failed to parse stylesheet in '%s'", + raptor_uri_as_string(xslt_uri)); + ret = 1; + goto cleanup_xslt; + } + + /* This calls xsltGetDefaultSecurityPrefs() */ + userCtxt = xsltNewTransformContext(sheet, doc); + + /* set per-transform security preferences */ + if(world->xslt_security_preferences) + xsltSetCtxtSecurityPrefs((xsltSecurityPrefs*)world->xslt_security_preferences, + userCtxt); + + /* set per-transform generic error handler */ + xsltSetTransformErrorFunc(userCtxt, rdf_parser, + raptor_grddl_xsltGenericError_handler); + + + /* + * Define 'base', 'Base' and 'url' params to allow some XSLT sheets to work: + * base: + * http://www.w3.org/2000/07/uri43/uri.xsl + * Base: + * http://www.w3.org/2000/08/w3c-synd/home2rss.xsl + * url: (optional) + * http://www.w3.org/2001/sw/grddl-wg/td/RDFa2RDFXML.xsl + */ + quoted_base_uri = RAPTOR_MALLOC(char*, base_uri_len + 3); + quoted_base_uri[0] = '\''; + memcpy(quoted_base_uri + 1, (const char*)base_uri_string, base_uri_len); + quoted_base_uri[base_uri_len + 1] = '\''; + quoted_base_uri[base_uri_len + 2] = '\0'; + + params[0] = "base"; + params[1] = (const char*)quoted_base_uri; + params[2] = "Base"; + params[3] = (const char*)quoted_base_uri; + params[4] = "url"; + params[5] = (const char*)quoted_base_uri; + params[6] = NULL; + + res = xsltApplyStylesheetUser(sheet, doc, params, NULL, NULL, userCtxt); + + if(!res) { + raptor_parser_error(rdf_parser, "Failed to apply stylesheet in '%s'", + raptor_uri_as_string(xslt_uri)); + ret = 1; + goto cleanup_xslt; + } + + if(res->type == XML_HTML_DOCUMENT_NODE) { + if(sheet->method != NULL) + xmlFree(sheet->method); + sheet->method = (xmlChar*)xmlMalloc(5); + memcpy(sheet->method, "html", 5); + } + + /* write the resulting XML to a string */ + xsltSaveResultToString(&doc_txt, &doc_txt_len, res, sheet); + + if(!doc_txt || !doc_txt_len) { + raptor_parser_warning(rdf_parser, "XSLT returned an empty document"); + goto cleanup_xslt; + } + + RAPTOR_DEBUG4("XSLT returned %d bytes document method %s media type %s\n", + doc_txt_len, + (sheet->method ? (const char*)sheet->method : "NULL"), + (sheet->mediaType ? (const char*)sheet->mediaType : "NULL")); + + /* Set mime types for XSLT <xsl:output method> content */ + if(sheet->mediaType == NULL && sheet->method) { + if(!(strcmp((const char*)sheet->method, "text"))) { + sheet->mediaType = (xmlChar*)xmlMalloc(11); + memcpy(sheet->mediaType, "text/plain",11); + } else if(!(strcmp((const char*)sheet->method, "xml"))) { + sheet->mediaType = (xmlChar*)xmlMalloc(16); + memcpy(sheet->mediaType, "application/xml",16); + } else if(!(strcmp((const char*)sheet->method, "html"))) { + sheet->mediaType = (xmlChar*)xmlMalloc(10); + memcpy(sheet->mediaType, "text/html",10); + } + } + + /* Assume all that all media XML is RDF/XML and also that + * with no information at all we have RDF/XML + */ + if(!sheet->mediaType || + (sheet->mediaType && + !strcmp((const char*)sheet->mediaType, "application/xml"))) { + if(sheet->mediaType) + xmlFree(sheet->mediaType); + sheet->mediaType = (xmlChar*)xmlMalloc(20); + memcpy(sheet->mediaType, "application/rdf+xml",20); + } + + parser_name = raptor_world_guess_parser_name(rdf_parser->world, NULL, + (const char*)sheet->mediaType, + doc_txt, doc_txt_len, NULL); + if(!parser_name) { + RAPTOR_DEBUG3("Parser %p: Guessed no parser from mime type '%s' and content - ending", + RAPTOR_VOIDP(rdf_parser), sheet->mediaType); + goto cleanup_xslt; + } + + RAPTOR_DEBUG4("Parser %p: Guessed parser %s from mime type '%s' and content\n", + RAPTOR_VOIDP(rdf_parser), parser_name, sheet->mediaType); + + if(!strcmp((const char*)parser_name, "grddl")) { + RAPTOR_DEBUG2("Parser %p: Ignoring guess to run grddl parser - ending", + RAPTOR_VOIDP(rdf_parser)); + goto cleanup_xslt; + } + + ret = raptor_grddl_ensure_internal_parser(rdf_parser, parser_name, 0); + if(ret) + goto cleanup_xslt; + + if(grddl_parser->internal_parser) { + /* generate the triples */ + ret = raptor_parser_parse_start(grddl_parser->internal_parser, base_uri); + if(!ret) + ret = raptor_parser_parse_chunk(grddl_parser->internal_parser, + doc_txt, doc_txt_len, 1); + } + + cleanup_xslt: + + if(userCtxt) + xsltFreeTransformContext(userCtxt); + + if(quoted_base_uri) + RAPTOR_FREE(char*, quoted_base_uri); + + if(doc_txt) + xmlFree(doc_txt); + + if(res) + xmlFreeDoc(res); + + if(sheet) + xsltFreeStylesheet(sheet); + + raptor_libxslt_reset_global_state(rdf_parser); + + return ret; +} + + +typedef struct +{ + raptor_parser* rdf_parser; + xmlParserCtxtPtr xc; + raptor_uri* base_uri; +} raptor_grddl_xml_parse_bytes_context; + + +static void +raptor_grddl_uri_xml_parse_bytes(raptor_www* www, + void *userdata, + const void *ptr, size_t size, size_t nmemb) +{ + raptor_grddl_xml_parse_bytes_context* xpbc; + size_t len = size * nmemb; + int rc = 0; + + xpbc = (raptor_grddl_xml_parse_bytes_context*)userdata; + + if(!xpbc->xc) { + xmlParserCtxtPtr xc; + + xc = xmlCreatePushParserCtxt(NULL, NULL, + (const char*)ptr, RAPTOR_BAD_CAST(int, len), + (const char*)raptor_uri_as_string(xpbc->base_uri)); + if(!xc) + rc = 1; + else { + int libxml_options = 0; + +#ifdef RAPTOR_LIBXML_XML_PARSE_NONET + if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_NO_NET)) + libxml_options |= XML_PARSE_NONET; +#endif +#ifdef HAVE_XMLCTXTUSEOPTIONS + xmlCtxtUseOptions(xc, libxml_options); +#endif + + xc->replaceEntities = 1; + xc->loadsubset = 1; + } + xpbc->xc = xc; + } else + rc = xmlParseChunk(xpbc->xc, (const char*)ptr, RAPTOR_BAD_CAST(int, len), 0); + + if(rc) + raptor_parser_error(xpbc->rdf_parser, "XML Parsing failed"); +} + + +#define FETCH_IGNORE_ERRORS 1 +#define FETCH_ACCEPT_XSLT 2 + +static int +raptor_grddl_fetch_uri(raptor_parser* rdf_parser, + raptor_uri* uri, + raptor_www_write_bytes_handler write_bytes_handler, + void* write_bytes_user_data, + raptor_www_content_type_handler content_type_handler, + void* content_type_user_data, + int flags) +{ + raptor_www *www; + const char *accept_h; + int ret = 0; + int ignore_errors = (flags & FETCH_IGNORE_ERRORS); + + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET)) { + if(!raptor_uri_uri_string_is_file_uri(raptor_uri_as_string(uri))) + return 1; + } + + www = raptor_new_www(rdf_parser->world); + if(!www) + return 1; + + if(raptor_www_set_user_agent2(www, "grddl/0.1", 0)) + goto cleanup_www; + + if(flags & FETCH_ACCEPT_XSLT) { + if(raptor_www_set_http_accept2(www, "application/xml", 0)) + goto cleanup_www; + } else { + accept_h = raptor_parser_get_accept_header(rdf_parser); + if(accept_h) { + ret = raptor_www_set_http_accept2(www, accept_h, 0); + RAPTOR_FREE(char*, accept_h); + if(ret) + goto cleanup_www; + } + } + if(rdf_parser->uri_filter) + raptor_www_set_uri_filter(www, rdf_parser->uri_filter, + rdf_parser->uri_filter_user_data); + if(ignore_errors) + raptor_world_internal_set_ignore_errors(rdf_parser->world, 1); + + raptor_www_set_write_bytes_handler(www, write_bytes_handler, + write_bytes_user_data); + raptor_www_set_content_type_handler(www, content_type_handler, + content_type_user_data); + + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_WWW_TIMEOUT) > 0) + raptor_www_set_connection_timeout(www, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_WWW_TIMEOUT)); + + ret = raptor_www_fetch(www, uri); + + raptor_free_www(www); + + if(ignore_errors) + raptor_world_internal_set_ignore_errors(rdf_parser->world, 0); + + return ret; + +cleanup_www: + raptor_free_www(www); + + return 1; +} + + +/* Run a GRDDL transform using a XSLT stylesheet at a given URI */ +static int +raptor_grddl_run_grddl_transform_uri(raptor_parser* rdf_parser, + grddl_xml_context* xml_context, + xmlDocPtr doc) +{ + xmlParserCtxtPtr xslt_ctxt = NULL; + raptor_grddl_xml_parse_bytes_context xpbc; + int ret = 0; + raptor_uri* xslt_uri; + raptor_uri* base_uri; + raptor_uri* old_locator_uri; + raptor_locator *locator = &rdf_parser->locator; + + xslt_uri = xml_context->uri; + base_uri = xml_context->base_uri ? xml_context->base_uri : xml_context->uri; + + RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI %s and base URI %s\n", + raptor_uri_as_string(xslt_uri), + raptor_uri_as_string(base_uri)); + + /* make an xsltStylesheetPtr via the raptor_grddl_uri_xml_parse_bytes + * callback as bytes are returned + */ + xpbc.xc = NULL; + xpbc.rdf_parser = rdf_parser; + xpbc.base_uri = base_uri; + + old_locator_uri = locator->uri; + locator->uri = xslt_uri; + ret = raptor_grddl_fetch_uri(rdf_parser, + xslt_uri, + raptor_grddl_uri_xml_parse_bytes, &xpbc, + NULL, NULL, + FETCH_ACCEPT_XSLT); + xslt_ctxt = xpbc.xc; + if(ret) { + locator->uri = old_locator_uri; + raptor_parser_warning(rdf_parser, "Fetching XSLT document URI '%s' failed", + raptor_uri_as_string(xslt_uri)); + ret = 0; + } else { + xmlParseChunk(xpbc.xc, NULL, 0, 1); + + ret = raptor_grddl_run_grddl_transform_doc(rdf_parser, + xml_context, + xslt_ctxt->myDoc, + doc); + locator->uri = old_locator_uri; + } + + if(xslt_ctxt) + xmlFreeParserCtxt(xslt_ctxt); + + return ret; +} + + +static int +raptor_grddl_seen_uri(raptor_grddl_parser_context* grddl_parser, + raptor_uri* uri) +{ + int i; + int seen = 0; + raptor_sequence* seq = grddl_parser->visited_uris; + int size; + + size = raptor_sequence_size(seq); + for(i = 0; i < size; i++) { + raptor_uri* vuri = (raptor_uri*)raptor_sequence_get_at(seq, i); + if(raptor_uri_equals(uri, vuri)) { + seen = 1; + break; + } + } + +#ifdef RAPTOR_DEBUG + if(seen) + RAPTOR_DEBUG2("Already seen URI '%s'\n", raptor_uri_as_string(uri)); +#endif + + return seen; +} + + +static void +raptor_grddl_done_uri(raptor_grddl_parser_context* grddl_parser, + raptor_uri* uri) +{ + if(!grddl_parser->visited_uris) + return; + + if(!raptor_grddl_seen_uri(grddl_parser, uri)) { + raptor_sequence* seq = grddl_parser->visited_uris; + raptor_sequence_push(seq, raptor_uri_copy(uri)); + } +} + + +static raptor_sequence* +raptor_grddl_run_xpath_match(raptor_parser* rdf_parser, + xmlDocPtr doc, + const xmlChar* xpathExpr, + int flags) +{ + raptor_grddl_parser_context* grddl_parser; + /* Evaluate xpath expression */ + xmlXPathObjectPtr xpathObj = NULL; + raptor_sequence* seq = NULL; + xmlNodeSetPtr nodes; + int i; + int size; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + seq = raptor_new_sequence((raptor_data_free_handler)grddl_free_xml_context, NULL); + + /* Evaluate xpath expression */ + xpathObj = xmlXPathEvalExpression(xpathExpr, + grddl_parser->xpathCtx); + if(!xpathObj) { + raptor_parser_error(rdf_parser, + "Unable to evaluate XPath expression \"%s\"", + xpathExpr); + raptor_free_sequence(seq); seq = NULL; + goto cleanup_xpath_match; + } + + nodes = xpathObj->nodesetval; + if(!nodes || xmlXPathNodeSetIsEmpty(nodes)) { +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG3("No match found with XPath expression \"%s\" over '%s'\n", + xpathExpr, raptor_uri_as_string(rdf_parser->base_uri)); +#endif + raptor_free_sequence(seq); seq = NULL; + goto cleanup_xpath_match; + } + +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG3("Found match with XPath expression \"%s\" over '%s'\n", + xpathExpr, raptor_uri_as_string(rdf_parser->base_uri)); +#endif + + size = xmlXPathNodeSetGetLength(nodes); + for(i = 0; i < size; i++) { + xmlNodePtr node = nodes->nodeTab[i]; + const unsigned char* uri_string = NULL; + xmlChar *base_uri_string; + raptor_uri* base_uri = NULL; + raptor_uri* uri = NULL; + + if(node->type != XML_ATTRIBUTE_NODE && + node->type != XML_ELEMENT_NODE) { + raptor_parser_error(rdf_parser, "Got unexpected node type %u", + node->type); + continue; + } + + + /* xmlNodeGetBase() returns base URI or NULL and must be freed + * with xmlFree() + */ + if(grddl_parser->html_base_processing) { + xmlElementType savedType = doc->type; + doc->type = XML_HTML_DOCUMENT_NODE; + base_uri_string = xmlNodeGetBase(doc, node); + doc->type = savedType; + } else + base_uri_string = xmlNodeGetBase(doc, node); + + + if(node->type == XML_ATTRIBUTE_NODE) + uri_string = (const unsigned char*)node->children->content; + else { /* XML_ELEMENT_NODE */ + if(node->ns) + uri_string = (const unsigned char*)node->ns->href; + } + + + if(base_uri_string) { + base_uri = raptor_new_uri(rdf_parser->world, base_uri_string); + xmlFree(base_uri_string); +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 + RAPTOR_DEBUG2("XML base URI of match is '%s'\n", + raptor_uri_as_string(base_uri)); +#endif + } else if(rdf_parser->base_uri) + base_uri = raptor_uri_copy(rdf_parser->base_uri); + else + base_uri = NULL; + + if(uri_string && (flags & MATCH_IS_VALUE_LIST)) { + char *start; + char *end; + char* buffer; + size_t list_len = strlen((const char*)uri_string); + + buffer = RAPTOR_MALLOC(char*, list_len + 1); + memcpy(buffer, uri_string, list_len + 1); + + for(start = end = buffer; end; start = end+1) { + grddl_xml_context* xml_context; + + end = strchr(start, ' '); + if(end) + *end = '\0'; + + if(start == end) + continue; + + RAPTOR_DEBUG2("Got list match URI '%s'\n", start); + + uri = raptor_new_uri_relative_to_base(rdf_parser->world, + base_uri, + (const unsigned char*)start); + if(flags & MATCH_IS_PROFILE && + !strcmp((const char*)raptor_uri_as_string(uri), + "http://www.w3.org/2003/g/data-view'")) { + raptor_free_uri(uri); + continue; + } + + xml_context = raptor_new_xml_context(rdf_parser->world, uri, base_uri); + raptor_sequence_push(seq, xml_context); + } + RAPTOR_FREE(char*, buffer); + } else if(flags & MATCH_IS_HARDCODED) { + RAPTOR_DEBUG2("Got hardcoded XSLT match for %s\n", xpathExpr); + /* return at first match, that's enough */ + if(base_uri) + raptor_free_uri(base_uri); + break; + } else if(uri_string) { + grddl_xml_context* xml_context; + RAPTOR_DEBUG2("Got single match URI '%s'\n", uri_string); + + uri = raptor_new_uri_relative_to_base(rdf_parser->world, base_uri, + uri_string); + xml_context = raptor_new_xml_context(rdf_parser->world, uri, base_uri); + raptor_sequence_push(seq, xml_context); + raptor_free_uri(uri); + } + + if(base_uri) + raptor_free_uri(base_uri); + } + + cleanup_xpath_match: + if(xpathObj) + xmlXPathFreeObject(xpathObj); + + return seq; +} + + +static void +raptor_grddl_check_recursive_content_type_handler(raptor_www* www, + void* userdata, + const char* content_type) +{ + raptor_parser* rdf_parser = (raptor_parser*)userdata; + raptor_grddl_parser_context* grddl_parser; + size_t len; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + if(!content_type) + return; + + len = strlen(content_type)+1; + if(grddl_parser->content_type) + RAPTOR_FREE(char*, grddl_parser->content_type); + grddl_parser->content_type = RAPTOR_MALLOC(char*, len + 1); + memcpy(grddl_parser->content_type, content_type, len + 1); + + if(!strncmp(content_type, "application/rdf+xml", 19)) { + grddl_parser->process_this_as_rdfxml = 1; + + RAPTOR_DEBUG2("Parser %p: Found RDF/XML content type\n", + RAPTOR_VOIDP(rdf_parser)); + raptor_parser_save_content(rdf_parser, 1); + } + + if(!strncmp(content_type, "text/html", 9) || + !strncmp(content_type, "application/html+xml", 20)) { + RAPTOR_DEBUG3("Parser %p: Found HTML content type '%s'\n", + RAPTOR_VOIDP(rdf_parser), content_type); + grddl_parser->html_base_processing = 1; + } + +} + +#define RECURSIVE_FLAGS_IGNORE_ERRORS 1 +#define RECURSIVE_FLAGS_FILTER 2 + +static int +raptor_grddl_run_recursive(raptor_parser* rdf_parser, raptor_uri* uri, + const char *parser_name, int flags) +{ + raptor_grddl_parser_context* grddl_parser; + raptor_www_content_type_handler content_type_handler = NULL; + int ret = 0; + const unsigned char* ibuffer = NULL; + size_t ibuffer_len = 0; + raptor_parse_bytes_context rpbc; + int ignore_errors = (flags & RECURSIVE_FLAGS_IGNORE_ERRORS) > 0; + int filter = (flags & RECURSIVE_FLAGS_FILTER) > 0; + int fetch_uri_flags = 0; + int is_grddl=!strcmp(parser_name, "grddl"); + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + if(raptor_grddl_seen_uri(grddl_parser, uri)) + return 0; + + if(is_grddl) + content_type_handler = raptor_grddl_check_recursive_content_type_handler; + + if(raptor_grddl_ensure_internal_parser(rdf_parser, parser_name, filter)) + return !ignore_errors; + + RAPTOR_DEBUG3("Running recursive %s operation on URI '%s'\n", + parser_name, raptor_uri_as_string(uri)); + + if(is_grddl) + raptor_grddl_parser_add_parent(grddl_parser->internal_parser, grddl_parser); + + rpbc.rdf_parser = grddl_parser->internal_parser; + rpbc.base_uri = NULL; + rpbc.final_uri = NULL; + rpbc.started = 0; + + if(ignore_errors) + fetch_uri_flags |=FETCH_IGNORE_ERRORS; + + if(raptor_grddl_fetch_uri(grddl_parser->internal_parser, + uri, + raptor_parser_parse_uri_write_bytes, &rpbc, + content_type_handler, grddl_parser->internal_parser, + fetch_uri_flags)) { + if(!ignore_errors) + raptor_parser_warning(rdf_parser, + "Fetching GRDDL document URI '%s' failed\n", + raptor_uri_as_string(uri)); + ret = 0; + goto tidy; + } + + if(ignore_errors) + raptor_world_internal_set_ignore_errors(rdf_parser->world, 1); + + raptor_parser_parse_chunk(grddl_parser->internal_parser, NULL, 0, 1); + + /* If content was saved, process it as RDF/XML */ + ibuffer = raptor_parser_get_content(grddl_parser->internal_parser, + &ibuffer_len); + if(ibuffer && strcmp(parser_name, "rdfxml")) { + RAPTOR_DEBUG2("Running additional RDF/XML parse on URI '%s' content\n", + raptor_uri_as_string(uri)); + + if(raptor_grddl_ensure_internal_parser(rdf_parser, "rdfxml", 1)) + ret = 1; + else { + if(raptor_parser_parse_start(grddl_parser->internal_parser, uri)) + ret = 1; + else { + ret = raptor_parser_parse_chunk(grddl_parser->internal_parser, ibuffer, + ibuffer_len, 1); + } + } + + raptor_parser_save_content(grddl_parser->internal_parser, 0); + } + + if(ibuffer) + RAPTOR_FREE(char*, ibuffer); + + if(rpbc.final_uri) + raptor_free_uri(rpbc.final_uri); + + if(ignore_errors) { + raptor_world_internal_set_ignore_errors(rdf_parser->world, 0); + ret = 0; + } + + tidy: + + return ret; +} + + +static void +raptor_grddl_libxml_discard_error(void* user_data, const char *msg, ...) +{ + return; +} + + +static int +raptor_grddl_parse_chunk(raptor_parser* rdf_parser, + const unsigned char *s, size_t len, + int is_end) +{ + raptor_grddl_parser_context* grddl_parser; + int i; + int ret = 0; + const unsigned char* uri_string; + raptor_uri* uri; + /* XML document DOM */ + xmlDocPtr doc; + int expri; + unsigned char* buffer = NULL; + size_t buffer_len = 0; + int buffer_is_libxml = 0; + int loop; + + if(!is_end && !rdf_parser->emitted_default_graph) { + /* Cannot tell if we have a statement yet but must ensure that + * the start default graph mark is done once and done before any + * statements. + */ + raptor_parser_start_graph(rdf_parser, NULL, 0); + rdf_parser->emitted_default_graph++; + } + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + if(grddl_parser->content_type && !grddl_parser->content_type_check) { + grddl_parser->content_type_check++; + if(!strncmp(grddl_parser->content_type, "application/rdf+xml", 19)) { + RAPTOR_DEBUG3("Parser %p: Found document with type '%s' is RDF/XML\n", + RAPTOR_VOIDP(rdf_parser), grddl_parser->content_type); + grddl_parser->process_this_as_rdfxml = 1; + } + if(!strncmp(grddl_parser->content_type, "text/html", 9) || + !strncmp(grddl_parser->content_type, "application/html+xml", 20)) { + RAPTOR_DEBUG3("Parser %p: Found document with type '%s' is HTML\n", + RAPTOR_VOIDP(rdf_parser), grddl_parser->content_type); + grddl_parser->html_base_processing = 1; + } + } + + if(!grddl_parser->sb) + grddl_parser->sb = raptor_new_stringbuffer(); + + raptor_stringbuffer_append_counted_string(grddl_parser->sb, s, len, 1); + + if(!is_end) + return 0; + + buffer_len = raptor_stringbuffer_length(grddl_parser->sb); + buffer = RAPTOR_MALLOC(unsigned char*, buffer_len + 1); + if(buffer) + raptor_stringbuffer_copy_to_string(grddl_parser->sb, + buffer, buffer_len); + + + uri_string = raptor_uri_as_string(rdf_parser->base_uri); + + /* Discard parsing errors */ + raptor_world_internal_set_ignore_errors(rdf_parser->world, 1); + + RAPTOR_DEBUG4("Parser %p: URI %s: processing %d bytes of content\n", + RAPTOR_VOIDP(rdf_parser), uri_string, (int)buffer_len); + + for(loop = 0; loop < 2; loop++) { + int rc; + + if(loop == 0) { + int libxml_options = 0; + + RAPTOR_DEBUG2("Parser %p: Creating an XML parser\n", + RAPTOR_VOIDP(rdf_parser)); + + /* try to create an XML parser context */ + grddl_parser->xml_ctxt = xmlCreatePushParserCtxt(NULL, NULL, + (const char*)buffer, + RAPTOR_BAD_CAST(int, buffer_len), + (const char*)uri_string); + if(!grddl_parser->xml_ctxt) { + RAPTOR_DEBUG2("Parser %p: Creating an XML parser failed\n", + RAPTOR_VOIDP(rdf_parser)); + continue; + } + +#ifdef RAPTOR_LIBXML_XML_PARSE_NONET + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET)) + libxml_options |= XML_PARSE_NONET; +#endif +#ifdef HAVE_XMLCTXTUSEOPTIONS + xmlCtxtUseOptions(grddl_parser->xml_ctxt, libxml_options); +#endif + + + grddl_parser->xml_ctxt->vctxt.warning = raptor_grddl_libxml_discard_error; + grddl_parser->xml_ctxt->vctxt.error = raptor_grddl_libxml_discard_error; + + grddl_parser->xml_ctxt->replaceEntities = 1; + grddl_parser->xml_ctxt->loadsubset = 1; + } else { /* loop is 1 */ + + /* try to create an HTML parser context */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_HTML_TAG_SOUP)) { + xmlCharEncoding enc; + int options; + + RAPTOR_DEBUG2("Parser %p: Creating an HTML parser\n", + RAPTOR_VOIDP(rdf_parser)); + + enc = xmlDetectCharEncoding((const unsigned char*)buffer, + RAPTOR_BAD_CAST(int, buffer_len)); + grddl_parser->html_ctxt = htmlCreatePushParserCtxt(/*sax*/ NULL, + /*user_data*/ NULL, + (const char *)buffer, + RAPTOR_BAD_CAST(int, buffer_len), + (const char *)uri_string, + enc); + if(!grddl_parser->html_ctxt) { + RAPTOR_DEBUG2("Parser %p: Creating an HTML parser failed\n", + RAPTOR_VOIDP(rdf_parser)); + continue; + } + + /* HTML parser */ + grddl_parser->html_ctxt->replaceEntities = 1; + grddl_parser->html_ctxt->loadsubset = 1; + + grddl_parser->html_ctxt->vctxt.error = raptor_grddl_libxml_discard_error; + + /* HTML_PARSE_NOWARNING disables sax->warning, vxtxt.warning */ + /* HTML_PARSE_NOERROR disables sax->error, vctxt.error */ + options = HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING; +#ifdef HTML_PARSE_RECOVER + options |= HTML_PARSE_RECOVER; +#endif +#ifdef RAPTOR_LIBXML_HTML_PARSE_NONET + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET)) + options |= HTML_PARSE_NONET; +#endif + + htmlCtxtUseOptions(grddl_parser->html_ctxt, options); + + } else /* No HTML tag soup allowed so continue loop */ + continue; + } + + + if(grddl_parser->html_ctxt) { + RAPTOR_DEBUG2("Parser %p: Parsing as HTML\n", RAPTOR_VOIDP(rdf_parser)); + rc = htmlParseChunk(grddl_parser->html_ctxt, (const char*)s, 0, 1); + RAPTOR_DEBUG3("Parser %p: Parsing as HTML %s\n", + RAPTOR_VOIDP(rdf_parser), + (rc ? "failed" : "succeeded")); + if(rc) { + if(grddl_parser->html_ctxt->myDoc) { + xmlFreeDoc(grddl_parser->html_ctxt->myDoc); + grddl_parser->html_ctxt->myDoc = NULL; + } + htmlFreeParserCtxt(grddl_parser->html_ctxt); + grddl_parser->html_ctxt = NULL; + } + } else { + RAPTOR_DEBUG2("Parser %p: Parsing as XML\n", RAPTOR_VOIDP(rdf_parser)); + rc = xmlParseChunk(grddl_parser->xml_ctxt, (const char*)s, 0, 1); + RAPTOR_DEBUG3("Parser %p: Parsing as XML %s\n", RAPTOR_VOIDP(rdf_parser), + (rc ? "failed" : "succeeded")); + if(rc) { + if(grddl_parser->xml_ctxt->myDoc) { + xmlFreeDoc(grddl_parser->xml_ctxt->myDoc); + grddl_parser->xml_ctxt->myDoc = NULL; + } + xmlFreeParserCtxt(grddl_parser->xml_ctxt); + grddl_parser->xml_ctxt = NULL; + } + } + + if(!rc) + break; + + } + + /* Restore error handling */ + raptor_world_internal_set_ignore_errors(rdf_parser->world, 0); + + if(!grddl_parser->html_ctxt && !grddl_parser->xml_ctxt) { + raptor_parser_error(rdf_parser, "Failed to create HTML or XML parsers"); + ret = 1; + goto tidy; + } + + raptor_grddl_done_uri(grddl_parser, rdf_parser->base_uri); + + if(grddl_parser->html_ctxt) + doc = grddl_parser->html_ctxt->myDoc; + else + doc = grddl_parser->xml_ctxt->myDoc; + if(!doc) { + raptor_parser_error(rdf_parser, + "Failed to create XML DOM for GRDDL document"); + ret = 1; + goto tidy; + } + + if(!grddl_parser->grddl_processing) + goto transform; + + + if(grddl_parser->xinclude_processing) { + RAPTOR_DEBUG3("Parser %p: Running XInclude processing on URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), + raptor_uri_as_string(rdf_parser->base_uri)); + if(xmlXIncludeProcess(doc) < 0) { + raptor_parser_error(rdf_parser, + "XInclude processing failed for GRDDL document"); + ret = 1; + goto tidy; + } else { + int blen; + + /* write the result of XML Include to buffer */ + RAPTOR_FREE(char*, buffer); + xmlDocDumpFormatMemory(doc, (xmlChar**)&buffer, &blen, + 1 /* indent the result */); + buffer_len = blen; + buffer_is_libxml = 1; + + RAPTOR_DEBUG3("Parser %p: XML Include processing returned %d bytes document\n", + RAPTOR_VOIDP(rdf_parser), (int)buffer_len); + } + } + + + RAPTOR_DEBUG3("Parser %p: Running top-level GRDDL on URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), + raptor_uri_as_string(rdf_parser->base_uri)); + + /* Work out if there is a root namespace URI */ + if(1) { + xmlNodePtr xnp; + xmlNsPtr rootNs = NULL; + const unsigned char* ns_uri_string = NULL; + + xnp = xmlDocGetRootElement(doc); + if(xnp) { + rootNs = xnp->ns; + if(rootNs) + ns_uri_string = (const unsigned char*)(rootNs->href); + } + + if(ns_uri_string) { + int n; + + RAPTOR_DEBUG3("Parser %p: Root namespace URI is %s\n", + RAPTOR_VOIDP(rdf_parser), ns_uri_string); + + if(!strcmp((const char*)ns_uri_string, + (const char*)raptor_rdf_namespace_uri) && + !strcmp((const char*)xnp->name, "RDF")) { + RAPTOR_DEBUG3("Parser %p: Root element of %s is rdf:RDF - process this as RDF/XML later\n", + RAPTOR_VOIDP(rdf_parser), + raptor_uri_as_string(rdf_parser->base_uri)); + grddl_parser->process_this_as_rdfxml = 1; + } + + for(n = 0; grddl_namespace_uris_ignore_list[n]; n++) { + if(!strcmp(grddl_namespace_uris_ignore_list[n], + (const char*)ns_uri_string)) { + /* ignore this namespace */ + RAPTOR_DEBUG3("Parser %p: Ignoring GRDDL for namespace URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), ns_uri_string); + ns_uri_string = NULL; + break; + } + } + if(ns_uri_string) { + grddl_xml_context* xml_context; + + grddl_parser->root_ns_uri = raptor_new_uri_relative_to_base(rdf_parser->world, + rdf_parser->base_uri, + ns_uri_string); + xml_context = raptor_new_xml_context(rdf_parser->world, + grddl_parser->root_ns_uri, + rdf_parser->base_uri); + raptor_sequence_push(grddl_parser->profile_uris, xml_context); + + RAPTOR_DEBUG3("Parser %p: Processing GRDDL namespace URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), + raptor_uri_as_string(grddl_parser->root_ns_uri)); + raptor_grddl_run_recursive(rdf_parser, grddl_parser->root_ns_uri, + "grddl", + RECURSIVE_FLAGS_IGNORE_ERRORS | + RECURSIVE_FLAGS_FILTER); + } + + } + } + + /* Always put something at the start of the list even if NULL + * so later it can be searched for in output triples + */ + if(!grddl_parser->root_ns_uri) { + grddl_xml_context* xml_context; + xml_context = raptor_new_xml_context(rdf_parser->world, NULL, NULL); + raptor_sequence_push(grddl_parser->profile_uris, xml_context); + } + + + /* Create the XPath evaluation context */ + if(!grddl_parser->xpathCtx) { + grddl_parser->xpathCtx = xmlXPathNewContext(doc); + if(!grddl_parser->xpathCtx) { + raptor_parser_error(rdf_parser, + "Failed to create XPath context for GRDDL document"); + ret = 1; + goto tidy; + } + + xmlXPathRegisterNs(grddl_parser->xpathCtx, + (const xmlChar*)"html", + (const xmlChar*)"http://www.w3.org/1999/xhtml"); + xmlXPathRegisterNs(grddl_parser->xpathCtx, + (const xmlChar*)"dataview", + (const xmlChar*)"http://www.w3.org/2003/g/data-view#"); + } + + /* Try <head profile> URIs */ + if(1) { + raptor_sequence* result; + result = raptor_grddl_run_xpath_match(rdf_parser, doc, + (const xmlChar*)"/html:html/html:head/@profile", + MATCH_IS_VALUE_LIST | MATCH_IS_PROFILE); + if(result) { + int size; + + RAPTOR_DEBUG4("Parser %p: Found %d <head profile> URIs in URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), raptor_sequence_size(result), + raptor_uri_as_string(rdf_parser->base_uri)); + + + /* Store profile URIs, skipping NULLs or the GRDDL profile itself */ + while(raptor_sequence_size(result)) { + grddl_xml_context* xml_context; + + xml_context = (grddl_xml_context*)raptor_sequence_unshift(result); + if(!xml_context) + continue; + uri = xml_context->uri; + if(!strcmp("http://www.w3.org/2003/g/data-view", + (const char*)raptor_uri_as_string(uri))) { + RAPTOR_DEBUG3("Ignoring <head profile> of URI %s: URI %s\n", + raptor_uri_as_string(rdf_parser->base_uri), + raptor_uri_as_string(uri)); + grddl_free_xml_context(xml_context); + continue; + } + raptor_sequence_push(grddl_parser->profile_uris, xml_context); + } + raptor_free_sequence(result); + + + /* Recursive GRDDL through all the <head profile> URIs */ + size = raptor_sequence_size(grddl_parser->profile_uris); + for(i = 1; i < size; i++) { + grddl_xml_context* xml_context; + + xml_context = (grddl_xml_context*)raptor_sequence_get_at(grddl_parser->profile_uris, i); + uri = xml_context->uri; + if(!uri) + continue; + + RAPTOR_DEBUG4("Processing <head profile> #%d of URI %s: URI %s\n", + i, raptor_uri_as_string(rdf_parser->base_uri), + raptor_uri_as_string(uri)); + ret = raptor_grddl_run_recursive(rdf_parser, uri, + "grddl", + RECURSIVE_FLAGS_IGNORE_ERRORS| + RECURSIVE_FLAGS_FILTER); + } + } + + } /* end head profile URIs */ + + + /* Try XHTML document with alternate forms + * <link type="application/rdf+xml" href="URI" /> + * Value of @href is a URI + */ + if(grddl_parser->html_link_processing && + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_HTML_LINK)) { + raptor_sequence* result; + result = raptor_grddl_run_xpath_match(rdf_parser, doc, + (const xmlChar*)"/html:html/html:head/html:link[@type=\"application/rdf+xml\"]/@href", + 0); + if(result) { + RAPTOR_DEBUG4("Parser %p: Found %d <link> URIs in URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), raptor_sequence_size(result), + raptor_uri_as_string(rdf_parser->base_uri)); + + /* Recursively parse all the <link> URIs, skipping NULLs */ + i = 0; + while(raptor_sequence_size(result)) { + grddl_xml_context* xml_context; + + xml_context = (grddl_xml_context*)raptor_sequence_unshift(result); + if(!xml_context) + continue; + + uri = xml_context->uri; + if(uri) { + RAPTOR_DEBUG4("Processing <link> #%d of URI %s: URI %s\n", + i, raptor_uri_as_string(rdf_parser->base_uri), + raptor_uri_as_string(uri)); + i++; + ret = raptor_grddl_run_recursive(rdf_parser, uri, "guess", + RECURSIVE_FLAGS_IGNORE_ERRORS); + } + grddl_free_xml_context(xml_context); + } + + raptor_free_sequence(result); + } + } + + + /* Try all XPaths */ + for(expri = 0; match_table[expri].xpath; expri++) { + raptor_sequence* result; + int flags = match_table[expri].flags; + + if((flags & MATCH_IS_HARDCODED) && + !RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_MICROFORMATS)) + continue; + + result = raptor_grddl_run_xpath_match(rdf_parser, doc, + match_table[expri].xpath, flags); + if(result) { + if(match_table[expri].xslt_sheet_uri) { + grddl_xml_context* xml_context; + + /* Ignore what matched, use a hardcoded XSLT URI */ + uri_string = match_table[expri].xslt_sheet_uri; + RAPTOR_DEBUG3("Parser %p: Using hard-coded XSLT URI '%s'\n", + RAPTOR_VOIDP(rdf_parser), uri_string); + + raptor_free_sequence(result); + result = raptor_new_sequence((raptor_data_free_handler)grddl_free_xml_context, NULL); + + uri = raptor_new_uri_relative_to_base(rdf_parser->world, + rdf_parser->base_uri, uri_string); + + xml_context = raptor_new_xml_context(rdf_parser->world, uri, + rdf_parser->base_uri); + raptor_sequence_push(result, xml_context); + + raptor_free_uri(uri); + } + + while(raptor_sequence_size(result)) { + grddl_xml_context* xml_context; + + xml_context = (grddl_xml_context*)raptor_sequence_unshift(result); + if(!xml_context) + break; + + raptor_grddl_add_transform_xml_context(grddl_parser, xml_context); + } + raptor_free_sequence(result); + + if(flags & MATCH_LAST) + break; + } + + + if(rdf_parser->failed) + break; + + } /* end XPath expression loop */ + + if(rdf_parser->failed) { + ret = 1; + goto tidy; + } + + + /* Process this document's content buffer as RDF/XML */ + if(grddl_parser->process_this_as_rdfxml && buffer) { + RAPTOR_DEBUG3("Parser %p: Running additional RDF/XML parse on root document URI '%s' content\n", + RAPTOR_VOIDP(rdf_parser), + raptor_uri_as_string(rdf_parser->base_uri)); + + if(raptor_grddl_ensure_internal_parser(rdf_parser, "rdfxml", 0)) + ret = 1; + else { + if(raptor_parser_parse_start(grddl_parser->internal_parser, + rdf_parser->base_uri)) + ret = 1; + else { + ret = raptor_parser_parse_chunk(grddl_parser->internal_parser, buffer, + buffer_len, 1); + } + } + + } + + + /* Apply all transformation URIs seen */ + transform: + while(raptor_sequence_size(grddl_parser->doc_transform_uris)) { + grddl_xml_context* xml_context; + + xml_context = (grddl_xml_context*)raptor_sequence_unshift(grddl_parser->doc_transform_uris); + ret = raptor_grddl_run_grddl_transform_uri(rdf_parser, xml_context, doc); + grddl_free_xml_context(xml_context); + if(ret) + break; + } + + if(rdf_parser->emitted_default_graph) { + /* May or may not have generated statements but we must close the + * start default graph mark above + */ + raptor_parser_end_graph(rdf_parser, NULL, 0); + rdf_parser->emitted_default_graph--; + } + + + tidy: + if(buffer) { + if(buffer_is_libxml) + xmlFree((xmlChar*)buffer); + else + RAPTOR_FREE(char*, buffer); + } + + if(grddl_parser->sb) { + raptor_free_stringbuffer(grddl_parser->sb); + grddl_parser->sb = NULL; + } + + if(grddl_parser->xml_ctxt) { + if(grddl_parser->xml_ctxt->myDoc) { + xmlFreeDoc(grddl_parser->xml_ctxt->myDoc); + grddl_parser->xml_ctxt->myDoc = NULL; + } + xmlFreeParserCtxt(grddl_parser->xml_ctxt); + grddl_parser->xml_ctxt = NULL; + } + if(grddl_parser->html_ctxt) { + if(grddl_parser->html_ctxt->myDoc) { + xmlFreeDoc(grddl_parser->html_ctxt->myDoc); + grddl_parser->html_ctxt->myDoc = NULL; + } + xmlFreeParserCtxt(grddl_parser->html_ctxt); + grddl_parser->html_ctxt = NULL; + } + + if(grddl_parser->xpathCtx) { + xmlXPathFreeContext(grddl_parser->xpathCtx); + grddl_parser->xpathCtx = NULL; + } + + return (ret != 0); +} + + +static int +raptor_grddl_parse_recognise_syntax(raptor_parser_factory* factory, + const unsigned char *buffer, size_t len, + const unsigned char *identifier, + const unsigned char *suffix, + const char *mime_type) +{ + int score = 0; + + if(suffix) { + if(!strcmp((const char*)suffix, "xhtml")) + score = 4; + if(!strcmp((const char*)suffix, "html")) + score = 2; + } else if(identifier) { + if(strstr((const char*)identifier, "xhtml")) + score = 4; + } + + return score; +} + + +static void +raptor_grddl_parse_content_type_handler(raptor_parser* rdf_parser, + const char* content_type) +{ + raptor_grddl_parser_context* grddl_parser; + + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + if(content_type) { + size_t len = strlen(content_type) + 1; + if(grddl_parser->content_type) + RAPTOR_FREE(char*, grddl_parser->content_type); + + grddl_parser->content_type = RAPTOR_MALLOC(char*, len + 1); + memcpy(grddl_parser->content_type, content_type, len + 1); + } +} + + + +static const char* const grddl_names[2] = { "grddl", NULL }; + +#define GRDDL_TYPES_COUNT 2 +static const raptor_type_q grddl_types[GRDDL_TYPES_COUNT + 1] = { + { "text/html", 9, 2}, + { "application/xhtml+xml", 21, 4}, + { NULL, 0, 0} +}; + +static int +raptor_grddl_parser_register_factory(raptor_parser_factory *factory) +{ + int rc = 0; + + factory->desc.names = grddl_names; + + factory->desc.mime_types = grddl_types; + + factory->desc.label = "Gleaning Resource Descriptions from Dialects of Languages"; + factory->desc.uri_strings = NULL; + + factory->desc.flags = RAPTOR_SYNTAX_NEED_BASE_URI; + + factory->context_length = sizeof(raptor_grddl_parser_context); + + factory->init = raptor_grddl_parse_init; + factory->terminate = raptor_grddl_parse_terminate; + factory->start = raptor_grddl_parse_start; + factory->chunk = raptor_grddl_parse_chunk; + factory->recognise_syntax = raptor_grddl_parse_recognise_syntax; + factory->content_type_handler= raptor_grddl_parse_content_type_handler; + + return rc; +} + + +int +raptor_init_parser_grddl_common(raptor_world* world) +{ +#ifdef HAVE_XSLTINIT + xsltInit(); +#endif + + if(!world->xslt_security_preferences && + !world->xslt_security_preferences_policy) { + xsltSecurityPrefsPtr raptor_xslt_sec = NULL; + + raptor_xslt_sec = xsltNewSecurityPrefs(); + + /* no read from file (read from URI with scheme = file) */ + xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_READ_FILE, + xsltSecurityForbid); + + /* no create/write to file */ + xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_WRITE_FILE, + xsltSecurityForbid); + + /* no create directory */ + xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_CREATE_DIRECTORY, + xsltSecurityForbid); + + /* yes read from URI with scheme != file (XSLT_SECPREF_READ_NETWORK) */ + + /* no write to network (you can 'write' with GET params anyway) */ + xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_WRITE_NETWORK, + xsltSecurityForbid); + + world->xslt_security_preferences = (void*)raptor_xslt_sec; + } + + return 0; +} + + +int +raptor_init_parser_grddl(raptor_world* world) +{ + return !raptor_world_register_parser_factory(world, + &raptor_grddl_parser_register_factory); +} + + +void +raptor_terminate_parser_grddl_common(raptor_world *world) +{ + if(world->xslt_security_preferences && + !world->xslt_security_preferences_policy) { + + /* Free the security preferences object owned by raptor world */ + xsltFreeSecurityPrefs((xsltSecurityPrefsPtr)world->xslt_security_preferences); + world->xslt_security_preferences = NULL; + } + + xsltCleanupGlobals(); +} + + + +/* + * Save libxslt global state that needs overwriting. + * + * Initialise the global state with raptor GRDDL parser values. + * + * Restored by raptor_libxslt_reset_global_state() + */ +static void +raptor_libxslt_set_global_state(raptor_parser *rdf_parser) +{ + raptor_grddl_parser_context* grddl_parser; + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + /* save global (libxslt-wide) generic error handler */ + grddl_parser->saved_xsltGenericError = xsltGenericError; + grddl_parser->saved_xsltGenericErrorContext = xsltGenericErrorContext; + + /* set global (libxslt-wide) generic error handler to raptor GRDDL parser */ + xsltSetGenericErrorFunc(rdf_parser, + raptor_grddl_xsltGenericError_handler); + + /* save global (libxslt-wide) default security prefs */ + grddl_parser->saved_xsltSecurityPrefs = xsltGetDefaultSecurityPrefs(); + + if(grddl_parser->world->xslt_security_preferences && + !grddl_parser->world->xslt_security_preferences_policy) { + /* set global (libxslt-wide) security preferences to raptor */ + xsltSetDefaultSecurityPrefs((xsltSecurityPrefs*)grddl_parser->world->xslt_security_preferences); + } +} + + +/* + * Restore libxslt global state that raptor_libxslt_set_global_state() + * overwrote back to the original values. + * + */ +static void +raptor_libxslt_reset_global_state(raptor_parser* rdf_parser) +{ + raptor_grddl_parser_context* grddl_parser; + grddl_parser = (raptor_grddl_parser_context*)rdf_parser->context; + + /* restore global (libxslt-wide) default security prefs */ + xsltSetDefaultSecurityPrefs(grddl_parser->saved_xsltSecurityPrefs); + + /* restore global (libxslt-wide) generic error handler */ + xsltSetGenericErrorFunc(grddl_parser->saved_xsltGenericErrorContext, + grddl_parser->saved_xsltGenericError); +} + |