diff options
Diffstat (limited to 'src/raptor_rdfxml.c')
-rw-r--r-- | src/raptor_rdfxml.c | 3224 |
1 files changed, 3224 insertions, 0 deletions
diff --git a/src/raptor_rdfxml.c b/src/raptor_rdfxml.c new file mode 100644 index 0000000..2a742d0 --- /dev/null +++ b/src/raptor_rdfxml.c @@ -0,0 +1,3224 @@ +/* -*- Mode: c; c-basic-offset: 2 -*- + * + * raptor_rdfxml.c - Raptor RDF/XML Parser + * + * Copyright (C) 2000-2008, David Beckett http://www.dajobe.org/ + * Copyright (C) 2000-2005, University of Bristol, UK http://www.bristol.ac.uk/ + * + * This package is Free Software and part of Redland http://librdf.org/ + * + * It is licensed under the following three licenses as alternatives: + * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version + * 2. GNU General Public License (GPL) V2 or any newer version + * 3. Apache License, V2.0 or any newer version + * + * You may not use this file except in compliance with at least one of + * the above three licenses. + * + * See LICENSE.html or LICENSE.txt at the top of this package for the + * complete terms and further detail along with the license texts for + * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. + * + * + */ + + +#ifdef HAVE_CONFIG_H +#include <raptor_config.h> +#endif + +#include <stdio.h> +#include <string.h> +#ifdef HAVE_STRINGS_H +#include <strings.h> +#endif +#include <ctype.h> +#include <stdarg.h> +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +/* Raptor includes */ +#include "raptor2.h" +#include "raptor_internal.h" + + +/* Define these for far too much output */ +#undef RAPTOR_DEBUG_VERBOSE +#undef RAPTOR_DEBUG_CDATA + + +/* Raptor structures */ + +typedef enum { + /* Catch uninitialised state */ + RAPTOR_STATE_INVALID = 0, + + /* Skipping current tree of elements - used to recover finding + * illegal content, when parsling permissively. + */ + RAPTOR_STATE_SKIPPING, + + /* Not in RDF grammar yet - searching for a start element. + * + * This can be <rdf:RDF> (goto NODE_ELEMENT_LIST) but since it is optional, + * the start element can also be one of + * http://www.w3.org/TR/rdf-syntax-grammar/#nodeElementURIs + * + * If RDF content is assumed, go straight to OBJ + */ + RAPTOR_STATE_UNKNOWN, + + /* A list of node elements + * http://www.w3.org/TR/rdf-syntax-grammar/#nodeElementList + */ + RAPTOR_STATE_NODE_ELEMENT_LIST, + + /* Found an <rdf:Description> */ + RAPTOR_STATE_DESCRIPTION, + + /* Found a property element + * http://www.w3.org/TR/rdf-syntax-grammar/#propertyElt + */ + RAPTOR_STATE_PROPERTYELT, + + /* A property element that is an ordinal - rdf:li, rdf:_n + */ + RAPTOR_STATE_MEMBER_PROPERTYELT, + + /* Found a node element + * http://www.w3.org/TR/rdf-syntax-grammar/#nodeElement + */ + RAPTOR_STATE_NODE_ELEMENT, + + /* A property element with rdf:parseType="Literal" + * http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeLiteralPropertyElt + */ + RAPTOR_STATE_PARSETYPE_LITERAL, + + /* A property element with rdf:parseType="Resource" + * http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeResourcePropertyElt + */ + RAPTOR_STATE_PARSETYPE_RESOURCE, + + /* A property element with rdf:parseType="Collection" + * http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeCollectionPropertyElt + * + * (This also handles daml:Collection) + */ + RAPTOR_STATE_PARSETYPE_COLLECTION, + + /* A property element with a rdf:parseType attribute and a value + * not "Literal" or "Resource" + * http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeOtherPropertyElt + */ + RAPTOR_STATE_PARSETYPE_OTHER, + + RAPTOR_STATE_PARSETYPE_LAST = RAPTOR_STATE_PARSETYPE_OTHER + + +} raptor_state; + + +static const char* const raptor_state_names[RAPTOR_STATE_PARSETYPE_LAST+2] = { + "INVALID", + "SKIPPING", + "UNKNOWN", + "nodeElementList", + "propertyElt", + "Description", + "propertyElt", + "memberPropertyElt", + "nodeElement", + "parseTypeLiteral", + "parseTypeResource", + "parseTypeCollection", + "parseTypeOther" +}; + + +static const char * raptor_rdfxml_state_as_string(raptor_state state) +{ + if(state < 1 || state > RAPTOR_STATE_PARSETYPE_LAST) + state = (raptor_state)0; + return raptor_state_names[(int)state]; +} + + +/* + * raptor_rdfxml_check_propertyElement_name: + * @name: rdf namespace term + * + * Check if an rdf namespace name is allowed to be used as a Node Element. + * + * Return value: < 0 if unknown rdf namespace term, 0 if known and not allowed, > 0 if known and allowed + */ +static int +raptor_rdfxml_check_nodeElement_name(const char *name) +{ + int i; + + if(*name == '_') + return 1; + + for(i = 0; raptor_rdf_ns_terms_info[i].name; i++) + if(!strcmp(raptor_rdf_ns_terms_info[i].name, name)) + return raptor_rdf_ns_terms_info[i].allowed_as_nodeElement; + + return -1; +} + + +/* + * raptor_rdfxml_check_propertyElement_name: + * @name: rdf namespace term + * + * Check if an rdf namespace name is allowed to be used as a Property Element. + * + * Return value: < 0 if unknown rdf namespace term, 0 if known and not allowed, > 0 if known and allowed + */ +static int +raptor_rdfxml_check_propertyElement_name(const char *name) +{ + int i; + + if(*name == '_') + return 1; + + for(i = 0; raptor_rdf_ns_terms_info[i].name; i++) + if(!strcmp(raptor_rdf_ns_terms_info[i].name, (const char*)name)) + return raptor_rdf_ns_terms_info[i].allowed_as_propertyElement; + + return -1; +} + + +static int +raptor_rdfxml_check_propertyAttribute_name(const char *name) +{ + int i; + + if(*name == '_') + return 1; + + for(i = 0; raptor_rdf_ns_terms_info[i].name; i++) + if(!strcmp(raptor_rdf_ns_terms_info[i].name, (const char*)name)) + return raptor_rdf_ns_terms_info[i].allowed_as_propertyAttribute; + + return -1; +} + + +typedef enum { + /* undetermined yet - whitespace is stored */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_UNKNOWN, + + /* literal content - no elements, cdata allowed, whitespace significant + * <propElement> blah </propElement> + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL, + + /* parseType literal content (WF XML) - all content preserved + * <propElement rdf:parseType="Literal"><em>blah</em></propElement> + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL, + + /* top-level nodes - 0+ elements expected, no cdata, whitespace ignored, + * any non-whitespace cdata is error + * only used for <rdf:RDF> or implict <rdf:RDF> + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_NODES, + + /* properties - 0+ elements expected, no cdata, whitespace ignored, + * any non-whitespace cdata is error + * <nodeElement><prop1>blah</prop1> <prop2>blah</prop2> </nodeElement> + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES, + + /* property content - all content preserved + * any content type changes when first non-whitespace found + * <propElement>... + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTY_CONTENT, + + /* resource URI given - no element, no cdata, whitespace ignored, + * any non-whitespace cdata is error + * <propElement rdf:resource="uri"/> + * <propElement rdf:resource="uri"></propElement> + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE, + + /* skipping content - all content is preserved + * Used when skipping content for unknown parseType-s, + * error recovery, some other reason + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PRESERVED, + + /* parseType Collection - all content preserved + * Parsing of this determined by RDF/XML (Revised) closed collection rules + * <propElement rdf:parseType="Collection">...</propElement> + */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION, + + /* Like above but handles "daml:collection" */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION, + + /* dummy for use in strings below */ + RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LAST + +} raptor_rdfxml_element_content_type; + + +static const struct { + const char * name; + int whitespace_significant; + /* non-blank cdata */ + int cdata_allowed; + /* XML element content */ + int element_allowed; + /* Do RDF-specific processing? (property attributes, rdf: attributes, ...) */ + int rdf_processing; +} rdf_content_type_info[RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LAST]={ + {"Unknown", 1, 1, 1, 0 }, + {"Literal", 1, 1, 0, 0 }, + {"XML Literal", 1, 1, 1, 0 }, + {"Nodes", 0, 0, 1, 1 }, + {"Properties", 0, 1, 1, 1 }, + {"Property Content",1, 1, 1, 1 }, + {"Resource", 0, 0, 0, 0 }, + {"Preserved", 1, 1, 1, 0 }, + {"Collection", 1, 1, 1, 1 }, + {"DAML Collection", 1, 1, 1, 1 }, +}; + + + +static const char * +raptor_rdfxml_element_content_type_as_string(raptor_rdfxml_element_content_type type) +{ + if(type >= RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LAST) + return "INVALID"; + + return rdf_content_type_info[type].name; +} + + + + + +/* + * Raptor Element/attributes on stack + */ +struct raptor_rdfxml_element_s { + raptor_world* world; + + raptor_xml_element *xml_element; + + /* NULL at bottom of stack */ + struct raptor_rdfxml_element_s *parent; + + /* attributes declared in M&S */ + const unsigned char * rdf_attr[RDF_NS_LAST + 1]; + /* how many of above seen */ + int rdf_attr_count; + + /* state that this production matches */ + raptor_state state; + + /* how to handle the content inside this XML element */ + raptor_rdfxml_element_content_type content_type; + + + /* starting state for children of this element */ + raptor_state child_state; + + /* starting content type for children of this element */ + raptor_rdfxml_element_content_type child_content_type; + + + /* Reified statement identifier */ + raptor_term* reified; + + unsigned const char* reified_id; + + /* Bag identifier */ + raptor_term* bag; + int last_bag_ordinal; /* starts at 0, so first predicate is rdf:_1 */ + + /* Subject identifier (URI/anon ID), type, source + * + * When the XML element represents a node, this is the identifier + */ + raptor_term* subject; + + /* Predicate URI + * + * When the XML element represents a node or predicate, + * this is the identifier of the predicate + */ + raptor_term* predicate; + + /* Object identifier (URI/anon ID), type, source + * + * When this XML element generates a statement that needs an object, + * possibly from a child element, this is the identifier of the object + */ + raptor_term* object; + + /* URI of datatype of literal */ + raptor_uri *object_literal_datatype; + + /* last ordinal used, so initialising to 0 works, emitting rdf:_1 first */ + int last_ordinal; + + /* If this element's parseType is a Collection + * this identifies the anon node of current tail of the collection(list). + */ + const unsigned char *tail_id; + + /* RDF/XML specific checks */ + + /* all cdata so far is whitespace */ + unsigned int content_cdata_all_whitespace; +}; + +typedef struct raptor_rdfxml_element_s raptor_rdfxml_element; + + +#define RAPTOR_RDFXML_N_CONCEPTS 5 + +/* + * Raptor parser object + */ +struct raptor_rdfxml_parser_s { + raptor_sax2 *sax2; + + /* stack of elements - elements add after current_element */ + raptor_rdfxml_element *root_element; + raptor_rdfxml_element *current_element; + + raptor_uri* concepts[RAPTOR_RDFXML_N_CONCEPTS]; + + /* set of seen rdf:ID / rdf:bagID values (with in-scope base URI) */ + raptor_id_set* id_set; + + void *xml_content; + size_t xml_content_length; + raptor_iostream* iostream; + + /* writer for building parseType="Literal" content */ + raptor_xml_writer* xml_writer; +}; + + + + +/* static variables */ + +#define RAPTOR_DAML_NS_URI(rdf_xml_parser) rdf_xml_parser->concepts[0] + +#define RAPTOR_DAML_List_URI(rdf_xml_parser) rdf_xml_parser->concepts[1] +#define RAPTOR_DAML_first_URI(rdf_xml_parser) rdf_xml_parser->concepts[2] +#define RAPTOR_DAML_rest_URI(rdf_xml_parser) rdf_xml_parser->concepts[3] +#define RAPTOR_DAML_nil_URI(rdf_xml_parser) rdf_xml_parser->concepts[4] + +/* RAPTOR_RDFXML_N_CONCEPTS defines size of array */ + + +/* prototypes for element functions */ +static raptor_rdfxml_element* raptor_rdfxml_element_pop(raptor_rdfxml_parser *rdf_parser); +static void raptor_rdfxml_element_push(raptor_rdfxml_parser *rdf_parser, raptor_rdfxml_element* element); + +static int raptor_rdfxml_record_ID(raptor_parser *rdf_parser, raptor_rdfxml_element *element, const unsigned char *id); + +/* prototypes for grammar functions */ +static void raptor_rdfxml_start_element_grammar(raptor_parser *parser, raptor_rdfxml_element *element); +static void raptor_rdfxml_end_element_grammar(raptor_parser *parser, raptor_rdfxml_element *element); +static void raptor_rdfxml_cdata_grammar(raptor_parser *parser, const unsigned char *s, int len, int is_cdata); + + +/* prototype for statement related functions */ +static void raptor_rdfxml_generate_statement(raptor_parser *rdf_parser, raptor_term *subject, raptor_uri *predicate_uri, raptor_term *object, raptor_term *reified, raptor_rdfxml_element *bag_element); + + + +/* Prototypes for parsing data functions */ +static int raptor_rdfxml_parse_init(raptor_parser* rdf_parser, const char *name); +static void raptor_rdfxml_parse_terminate(raptor_parser *rdf_parser); +static int raptor_rdfxml_parse_start(raptor_parser* rdf_parser); +static int raptor_rdfxml_parse_chunk(raptor_parser* rdf_parser, const unsigned char *buffer, size_t len, int is_end); +static void raptor_rdfxml_update_document_locator(raptor_parser *rdf_parser); + +static raptor_uri* raptor_rdfxml_inscope_base_uri(raptor_parser *rdf_parser); + + +static raptor_rdfxml_element* +raptor_rdfxml_element_pop(raptor_rdfxml_parser *rdf_xml_parser) +{ + raptor_rdfxml_element *element = rdf_xml_parser->current_element; + + if(!element) + return NULL; + + rdf_xml_parser->current_element = element->parent; + if(rdf_xml_parser->root_element == element) /* just deleted root */ + rdf_xml_parser->root_element = NULL; + + return element; +} + + +static void +raptor_rdfxml_element_push(raptor_rdfxml_parser *rdf_xml_parser, raptor_rdfxml_element* element) +{ + element->parent = rdf_xml_parser->current_element; + rdf_xml_parser->current_element = element; + if(!rdf_xml_parser->root_element) + rdf_xml_parser->root_element = element; +} + + +static void +raptor_free_rdfxml_element(raptor_rdfxml_element *element) +{ + int i; + + /* Free special RDF M&S attributes */ + for(i = 0; i <= RDF_NS_LAST; i++) + if(element->rdf_attr[i]) + RAPTOR_FREE(char*, element->rdf_attr[i]); + + if(element->subject) + raptor_free_term(element->subject); + if(element->predicate) + raptor_free_term(element->predicate); + if(element->object) + raptor_free_term(element->object); + if(element->bag) + raptor_free_term(element->bag); + if(element->reified) + raptor_free_term(element->reified); + + if(element->tail_id) + RAPTOR_FREE(char*, (char*)element->tail_id); + if(element->object_literal_datatype) + raptor_free_uri(element->object_literal_datatype); + + if(element->reified_id) + RAPTOR_FREE(char*, (char*)element->reified_id); + + RAPTOR_FREE(raptor_rdfxml_element, element); +} + + +static void +raptor_rdfxml_sax2_new_namespace_handler(void *user_data, + raptor_namespace* nspace) +{ + raptor_parser* rdf_parser; + const unsigned char* namespace_name; + size_t namespace_name_len; + raptor_uri* uri = raptor_namespace_get_uri(nspace); + + rdf_parser = (raptor_parser*)user_data; + raptor_parser_start_namespace(rdf_parser, nspace); + + if(!uri) + return; + + namespace_name = raptor_uri_as_counted_string(uri, &namespace_name_len); + + if(namespace_name_len == raptor_rdf_namespace_uri_len-1 && + !strncmp((const char*)namespace_name, + (const char*)raptor_rdf_namespace_uri, + namespace_name_len)) { + const unsigned char *prefix = raptor_namespace_get_prefix(nspace); + raptor_parser_warning(rdf_parser, + "Declaring a namespace with prefix %s to URI %s - one letter short of the RDF namespace URI and probably a mistake.", + prefix, namespace_name); + } + + if(namespace_name_len > raptor_rdf_namespace_uri_len && + !strncmp((const char*)namespace_name, + (const char*)raptor_rdf_namespace_uri, + raptor_rdf_namespace_uri_len)) { + raptor_parser_error(rdf_parser, + "Declaring a namespace URI %s to which the RDF namespace URI is a prefix is forbidden.", + namespace_name); + } +} + + + +static void +raptor_rdfxml_start_element_handler(void *user_data, + raptor_xml_element* xml_element) +{ + raptor_parser* rdf_parser; + raptor_rdfxml_parser* rdf_xml_parser; + raptor_rdfxml_element* element; + int ns_attributes_count = 0; + raptor_qname** named_attrs = NULL; + int i; + int count_bumped = 0; + + rdf_parser = (raptor_parser*)user_data; + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + if(rdf_parser->failed) + return; + + raptor_rdfxml_update_document_locator(rdf_parser); + + /* Create new element structure */ + element = RAPTOR_CALLOC(raptor_rdfxml_element*, 1, sizeof(*element)); + if(!element) { + raptor_parser_fatal_error(rdf_parser, "Out of memory"); + rdf_parser->failed = 1; + return; + } + element->world = rdf_parser->world; + element->xml_element = xml_element; + + raptor_rdfxml_element_push(rdf_xml_parser, element); + + named_attrs = raptor_xml_element_get_attributes(xml_element); + ns_attributes_count = raptor_xml_element_get_attributes_count(xml_element); + + /* RDF-specific processing of attributes */ + if(ns_attributes_count) { + raptor_qname** new_named_attrs; + int offset = 0; + raptor_rdfxml_element* parent_element; + + parent_element = element->parent; + + /* Allocate new array to move namespaced-attributes to if + * rdf processing is performed + */ + new_named_attrs = RAPTOR_CALLOC(raptor_qname**, ns_attributes_count, + sizeof(raptor_qname*)); + if(!new_named_attrs) { + raptor_parser_fatal_error(rdf_parser, "Out of memory"); + rdf_parser->failed = 1; + return; + } + + for(i = 0; i < ns_attributes_count; i++) { + raptor_qname* attr = named_attrs[i]; + + /* If: + * 1 We are handling RDF content and RDF processing is allowed on + * this element + * OR + * 2 We are not handling RDF content and + * this element is at the top level (top level Desc. / typedNode) + * i.e. we have no parent + * then handle the RDF attributes + */ + if((parent_element && + rdf_content_type_info[parent_element->child_content_type].rdf_processing) || + !parent_element) { + + /* Save pointers to some RDF M&S attributes */ + + /* If RDF namespace-prefixed attributes */ + if(attr->nspace && attr->nspace->is_rdf_ms) { + const unsigned char *attr_name = attr->local_name; + int j; + + for(j = 0; j <= RDF_NS_LAST; j++) + if(!strcmp((const char*)attr_name, + raptor_rdf_ns_terms_info[j].name)) { + element->rdf_attr[j] = attr->value; + element->rdf_attr_count++; + /* Delete it if it was stored elsewhere */ +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG3("Found RDF namespace attribute '%s' URI %s\n", + (char*)attr_name, attr->value); +#endif + /* make sure value isn't deleted from qname structure */ + attr->value = NULL; + raptor_free_qname(attr); + attr = NULL; + break; + } + } /* end if RDF namespaced-prefixed attributes */ + + if(!attr) + continue; + + /* If non namespace-prefixed RDF attributes found on an element */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_NON_NS_ATTRIBUTES) && + !attr->nspace) { + const unsigned char *attr_name = attr->local_name; + int j; + + for(j = 0; j <= RDF_NS_LAST; j++) + if(!strcmp((const char*)attr_name, + raptor_rdf_ns_terms_info[j].name)) { + element->rdf_attr[j] = attr->value; + element->rdf_attr_count++; + if(!raptor_rdf_ns_terms_info[j].allowed_unprefixed_on_attribute) + raptor_parser_warning(rdf_parser, + "Using rdf attribute '%s' without the RDF namespace has been deprecated.", + attr_name); + + /* Delete it if it was stored elsewhere */ + /* make sure value isn't deleted from qname structure */ + attr->value = NULL; + raptor_free_qname(attr); + attr = NULL; + break; + } + } /* end if non-namespace prefixed RDF attributes */ + + if(!attr) + continue; + + } /* end if leave literal XML alone */ + + if(attr) + new_named_attrs[offset++] = attr; + } + + /* new attribute count is set from attributes that haven't been skipped */ + ns_attributes_count = offset; + if(!ns_attributes_count) { + /* all attributes were deleted so delete the new array */ + RAPTOR_FREE(raptor_qname_array, new_named_attrs); + new_named_attrs = NULL; + } + + RAPTOR_FREE(raptor_qname_array, named_attrs); + named_attrs = new_named_attrs; + raptor_xml_element_set_attributes(xml_element, + named_attrs, ns_attributes_count); + } /* end if ns_attributes_count */ + + + /* start from unknown; if we have a parent, it may set this */ + element->state = RAPTOR_STATE_UNKNOWN; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_UNKNOWN; + + if(element->parent && + element->parent->child_content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_UNKNOWN) { + element->content_type = element->parent->child_content_type; + + if(element->parent->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE && + element->content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION && + element->content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) { + raptor_qname* parent_el_name; + parent_el_name = raptor_xml_element_get_name(element->parent->xml_element); + /* If parent has an rdf:resource, this element should not be here */ + raptor_parser_error(rdf_parser, + "property element '%s' has multiple object node elements, skipping.", + parent_el_name->local_name); + element->state = RAPTOR_STATE_SKIPPING; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PRESERVED; + + } else { + if(!element->parent->child_state) { + raptor_parser_fatal_error(rdf_parser, + "%s: Internal error: no parent element child_state set", + __FUNCTION__); + return; + } + + element->state = element->parent->child_state; + element->parent->xml_element->content_element_seen++; + count_bumped++; + + /* leave literal XML alone */ + if(!rdf_content_type_info[element->content_type].cdata_allowed) { + if(element->parent->xml_element->content_element_seen && + element->parent->xml_element->content_cdata_seen) { + raptor_qname* parent_el_name; + + parent_el_name = raptor_xml_element_get_name(element->parent->xml_element); + /* Uh oh - mixed content, the parent element has cdata too */ + raptor_parser_warning(rdf_parser, "element '%s' has mixed content.", + parent_el_name->local_name); + } + + /* If there is some existing all-whitespace content cdata + * before this node element, delete it + */ + if(element->parent->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES && + element->parent->xml_element->content_element_seen && + element->parent->content_cdata_all_whitespace && + element->parent->xml_element->content_cdata_length) { + + element->parent->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + + raptor_free_stringbuffer(element->parent->xml_element->content_cdata_sb); + element->parent->xml_element->content_cdata_sb = NULL; + element->parent->xml_element->content_cdata_length = 0; + } + + } /* end if leave literal XML alone */ + + } /* end if parent has no rdf:resource */ + + } /* end if element->parent */ + + +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Using content type %s\n", + rdf_content_type_info[element->content_type].name); + + fprintf(stderr, "raptor_rdfxml_start_element_handler: Start ns-element: "); + raptor_print_xml_element(xml_element, stderr); +#endif + + + /* Check for non namespaced stuff when not in a parseType literal, other */ + if(rdf_content_type_info[element->content_type].rdf_processing) { + const raptor_namespace* ns; + + ns = raptor_xml_element_get_name(xml_element)->nspace; + /* The element */ + + /* If has no namespace or the namespace has no name (xmlns="") */ + if((!ns || (ns && !raptor_namespace_get_uri(ns))) && element->parent) { + raptor_qname* parent_el_name; + + parent_el_name = raptor_xml_element_get_name(element->parent->xml_element); + + raptor_parser_error(rdf_parser, + "Using an element '%s' without a namespace is forbidden.", + parent_el_name->local_name); + element->state = RAPTOR_STATE_SKIPPING; + /* Remove count above so that parent thinks this is empty */ + if(count_bumped) + element->parent->xml_element->content_element_seen--; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PRESERVED; + } + + + /* Check for any remaining non-namespaced attributes */ + if(named_attrs) { + for(i = 0; i < ns_attributes_count; i++) { + raptor_qname *attr = named_attrs[i]; + /* Check if any attributes are non-namespaced */ + if(!attr->nspace || + (attr->nspace && !raptor_namespace_get_uri(attr->nspace))) { + raptor_parser_error(rdf_parser, + "Using an attribute '%s' without a namespace is forbidden.", + attr->local_name); + raptor_free_qname(attr); + named_attrs[i] = NULL; + } + } + } + } + + + if(element->rdf_attr[RDF_NS_aboutEach] || + element->rdf_attr[RDF_NS_aboutEachPrefix]) { + raptor_parser_warning(rdf_parser, + "element '%s' has aboutEach / aboutEachPrefix, skipping.", + raptor_xml_element_get_name(xml_element)->local_name); + element->state = RAPTOR_STATE_SKIPPING; + /* Remove count above so that parent thinks this is empty */ + if(count_bumped) + element->parent->xml_element->content_element_seen--; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PRESERVED; + } + + /* Right, now ready to enter the grammar */ + raptor_rdfxml_start_element_grammar(rdf_parser, element); + + return; +} + + +static void +raptor_rdfxml_end_element_handler(void *user_data, + raptor_xml_element* xml_element) +{ + raptor_parser* rdf_parser; + raptor_rdfxml_parser* rdf_xml_parser; + raptor_rdfxml_element* element; + + rdf_parser = (raptor_parser*)user_data; + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + if(!rdf_parser->failed) { + raptor_rdfxml_update_document_locator(rdf_parser); + + raptor_rdfxml_end_element_grammar(rdf_parser, + rdf_xml_parser->current_element); + } + + element = raptor_rdfxml_element_pop(rdf_xml_parser); + if(element) { + if(element->parent) { + /* Do not change this; PROPERTYELT will turn into MEMBER if necessary + * See the switch case for MEMBER / PROPERTYELT where the test is done. + * + * PARSETYPE_RESOURCE should never be propogated up since it + * will turn the next child (node) element into a property + */ + if(element->state != RAPTOR_STATE_MEMBER_PROPERTYELT && + element->state != RAPTOR_STATE_PARSETYPE_RESOURCE) + element->parent->child_state = element->state; + } + + raptor_free_rdfxml_element(element); + } +} + + +/* cdata (and ignorable whitespace for libxml). + * s 0 terminated is for libxml + */ +static void +raptor_rdfxml_characters_handler(void *user_data, + raptor_xml_element* xml_element, + const unsigned char *s, int len) +{ + raptor_parser* rdf_parser = (raptor_parser*)user_data; + + raptor_rdfxml_cdata_grammar(rdf_parser, s, len, 0); +} + + +/* cdata (and ignorable whitespace for libxml). + * s is 0 terminated for libxml2 + */ +static void +raptor_rdfxml_cdata_handler(void *user_data, raptor_xml_element* xml_element, + const unsigned char *s, int len) +{ + raptor_parser* rdf_parser = (raptor_parser*)user_data; + + raptor_rdfxml_cdata_grammar(rdf_parser, s, len, 1); +} + + +/* comment handler + * s is 0 terminated + */ +static void +raptor_rdfxml_comment_handler(void *user_data, raptor_xml_element* xml_element, + const unsigned char *s) +{ + raptor_parser* rdf_parser = (raptor_parser*)user_data; + raptor_rdfxml_parser* rdf_xml_parser; + raptor_rdfxml_element* element; + + if(rdf_parser->failed || !xml_element) + return; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + element = rdf_xml_parser->current_element; + + if(element) { + if(element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL) + raptor_xml_writer_comment(rdf_xml_parser->xml_writer, s); + } + + +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("XML Comment '%s'\n", s); +#endif +} + + +static const unsigned char* const daml_namespace_uri_string = (const unsigned char*)"http://www.daml.org/2001/03/daml+oil#"; +static const int daml_namespace_uri_string_len = 37; + + +static int +raptor_rdfxml_parse_init(raptor_parser* rdf_parser, const char *name) +{ + raptor_rdfxml_parser* rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + raptor_sax2* sax2; + raptor_world* world = rdf_parser->world; + + /* Allocate sax2 object */ + sax2 = raptor_new_sax2(rdf_parser->world, &rdf_parser->locator, rdf_parser); + rdf_xml_parser->sax2 = sax2; + if(!sax2) + return 1; + + /* Initialize sax2 element handlers */ + raptor_sax2_set_start_element_handler(sax2, raptor_rdfxml_start_element_handler); + raptor_sax2_set_end_element_handler(sax2, raptor_rdfxml_end_element_handler); + raptor_sax2_set_characters_handler(sax2, raptor_rdfxml_characters_handler); + raptor_sax2_set_cdata_handler(sax2, raptor_rdfxml_cdata_handler); + raptor_sax2_set_comment_handler(sax2, raptor_rdfxml_comment_handler); + raptor_sax2_set_namespace_handler(sax2, raptor_rdfxml_sax2_new_namespace_handler); + + /* Allocate uris */ + RAPTOR_DAML_NS_URI(rdf_xml_parser) = raptor_new_uri_from_counted_string(world, + daml_namespace_uri_string, + daml_namespace_uri_string_len); + + RAPTOR_DAML_List_URI(rdf_xml_parser) = raptor_new_uri_from_uri_local_name(world, RAPTOR_DAML_NS_URI(rdf_xml_parser), (const unsigned char *)"List"); + RAPTOR_DAML_first_URI(rdf_xml_parser) = raptor_new_uri_from_uri_local_name(world, RAPTOR_DAML_NS_URI(rdf_xml_parser) ,(const unsigned char *)"first"); + RAPTOR_DAML_rest_URI(rdf_xml_parser) = raptor_new_uri_from_uri_local_name(world, RAPTOR_DAML_NS_URI(rdf_xml_parser), (const unsigned char *)"rest"); + RAPTOR_DAML_nil_URI(rdf_xml_parser) = raptor_new_uri_from_uri_local_name(world, RAPTOR_DAML_NS_URI(rdf_xml_parser), (const unsigned char *)"nil"); + + /* Check for uri allocation failures */ + if(!RAPTOR_DAML_NS_URI(rdf_xml_parser) || + !RAPTOR_DAML_List_URI(rdf_xml_parser) || + !RAPTOR_DAML_first_URI(rdf_xml_parser) || + !RAPTOR_DAML_rest_URI(rdf_xml_parser) || + !RAPTOR_DAML_nil_URI(rdf_xml_parser)) + return 1; + + /* Everything succeeded */ + return 0; +} + + +static int +raptor_rdfxml_parse_start(raptor_parser* rdf_parser) +{ + raptor_uri *uri = rdf_parser->base_uri; + raptor_rdfxml_parser* rdf_xml_parser; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + /* base URI required for RDF/XML */ + if(!uri) + return 1; + + /* Optionally normalize language to lowercase + * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier + */ + raptor_sax2_set_option(rdf_xml_parser->sax2, + RAPTOR_OPTION_NORMALIZE_LANGUAGE, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NORMALIZE_LANGUAGE)); + + /* Optionally forbid internal network and file requests in the XML parser */ + raptor_sax2_set_option(rdf_xml_parser->sax2, + RAPTOR_OPTION_NO_NET, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET)); + raptor_sax2_set_option(rdf_xml_parser->sax2, + RAPTOR_OPTION_NO_FILE, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_FILE)); + raptor_sax2_set_option(rdf_xml_parser->sax2, + RAPTOR_OPTION_LOAD_EXTERNAL_ENTITIES, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_EXTERNAL_ENTITIES)); + if(rdf_parser->uri_filter) + raptor_sax2_set_uri_filter(rdf_xml_parser->sax2, rdf_parser->uri_filter, + rdf_parser->uri_filter_user_data); + + raptor_sax2_parse_start(rdf_xml_parser->sax2, uri); + + /* Delete any existing id_set */ + if(rdf_xml_parser->id_set) { + raptor_free_id_set(rdf_xml_parser->id_set); + rdf_xml_parser->id_set = NULL; + } + + /* Create a new id_set if needed */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_CHECK_RDF_ID)) { + rdf_xml_parser->id_set = raptor_new_id_set(rdf_parser->world); + if(!rdf_xml_parser->id_set) + return 1; + } + + return 0; +} + + +static void +raptor_rdfxml_parse_terminate(raptor_parser *rdf_parser) +{ + raptor_rdfxml_parser* rdf_xml_parser; + raptor_rdfxml_element* element; + int i; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + if(rdf_xml_parser->sax2) { + raptor_free_sax2(rdf_xml_parser->sax2); + rdf_xml_parser->sax2 = NULL; + } + + while( (element = raptor_rdfxml_element_pop(rdf_xml_parser)) ) + raptor_free_rdfxml_element(element); + + + for(i = 0; i < RAPTOR_RDFXML_N_CONCEPTS; i++) { + raptor_uri* concept_uri = rdf_xml_parser->concepts[i]; + if(concept_uri) { + raptor_free_uri(concept_uri); + rdf_xml_parser->concepts[i] = NULL; + } + } + + if(rdf_xml_parser->id_set) { + raptor_free_id_set(rdf_xml_parser->id_set); + rdf_xml_parser->id_set = NULL; + } + + if (rdf_xml_parser->xml_writer) { + raptor_free_xml_writer(rdf_xml_parser->xml_writer); + rdf_xml_parser->xml_writer = NULL; + } + + if (rdf_xml_parser->iostream) { + raptor_free_iostream(rdf_xml_parser->iostream); + rdf_xml_parser->iostream = NULL; + } + + if (rdf_xml_parser->xml_content) { + RAPTOR_FREE(char*, rdf_xml_parser->xml_content); + rdf_xml_parser->xml_content = NULL; + rdf_xml_parser->xml_content_length = 0; + } +} + + +static int +raptor_rdfxml_parse_recognise_syntax(raptor_parser_factory* factory, + const unsigned char *buffer, size_t len, + const unsigned char *identifier, + const unsigned char *suffix, + const char *mime_type) +{ + int score = 0; + + if(suffix) { + if(!strcmp((const char*)suffix, "rdf") || + !strcmp((const char*)suffix, "rdfs") || + !strcmp((const char*)suffix, "foaf") || + !strcmp((const char*)suffix, "doap") || + !strcmp((const char*)suffix, "owl") || + !strcmp((const char*)suffix, "daml")) + score = 9; + if(!strcmp((const char*)suffix, "rss")) + score = 3; + } + + if(identifier) { + if(strstr((const char*)identifier, "rss1")) + score += 5; + else if(!suffix && strstr((const char*)identifier, "rss")) + score += 3; + else if(!suffix && strstr((const char*)identifier, "rdf")) + score += 2; + else if(!suffix && strstr((const char*)identifier, "RDF")) + score += 2; + } + + if(mime_type) { + if(strstr((const char*)mime_type, "html")) + score -= 4; + else if(!strcmp((const char*)mime_type, "text/rdf")) + score += 7; + else if(!strcmp((const char*)mime_type, "application/xml")) + score += 5; + } + + if(buffer && len) { + /* Check it's an XML namespace declared and not N3 or Turtle which + * mention the namespace URI but not in this form. + */ +#define HAS_RDF_XMLNS1 (raptor_memstr((const char*)buffer, len, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#") != NULL) +#define HAS_RDF_XMLNS2 (raptor_memstr((const char*)buffer, len, "xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#") != NULL) +#define HAS_RDF_XMLNS3 (raptor_memstr((const char*)buffer, len, "xmlns=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#") != NULL) +#define HAS_RDF_XMLNS4 (raptor_memstr((const char*)buffer, len, "xmlns='http://www.w3.org/1999/02/22-rdf-syntax-ns#") != NULL) +#define HAS_RDF_ENTITY1 (raptor_memstr((const char*)buffer, len, "!ENTITY rdf 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'") != NULL) +#define HAS_RDF_ENTITY2 (raptor_memstr((const char*)buffer, len, "!ENTITY rdf \"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"") != NULL) +#define HAS_RDF_ENTITY3 (raptor_memstr((const char*)buffer, len, "xmlns:rdf=\"&rdf;\"") != NULL) +#define HAS_RDF_ENTITY4 (raptor_memstr((const char*)buffer, len, "xmlns:rdf='&rdf;'") != NULL) +#define HAS_HTML_NS (raptor_memstr((const char*)buffer, len, "http://www.w3.org/1999/xhtml") != NULL) +#define HAS_HTML_ROOT (raptor_memstr((const char*)buffer, len, "<html") != NULL) + + if(!HAS_HTML_NS && !HAS_HTML_ROOT && + (HAS_RDF_XMLNS1 || HAS_RDF_XMLNS2 || HAS_RDF_XMLNS3 || HAS_RDF_XMLNS4 || + HAS_RDF_ENTITY1 || HAS_RDF_ENTITY2 || HAS_RDF_ENTITY3 || HAS_RDF_ENTITY4) + ) { + int has_rdf_RDF = (raptor_memstr((const char*)buffer, len, "<rdf:RDF") != NULL); + int has_rdf_Description = (raptor_memstr((const char*)buffer, len, "rdf:Description") != NULL); + int has_rdf_about = (raptor_memstr((const char*)buffer, len, "rdf:about") != NULL); + + score += 7; + if(has_rdf_RDF) + score++; + if(has_rdf_Description) + score++; + if(has_rdf_about) + score++; + } + } + + return score; +} + + + +static int +raptor_rdfxml_parse_chunk(raptor_parser* rdf_parser, + const unsigned char *buffer, + size_t len, int is_end) +{ + raptor_rdfxml_parser* rdf_xml_parser; + int rc; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + if(rdf_parser->failed) + return 1; + + rc = raptor_sax2_parse_chunk(rdf_xml_parser->sax2, buffer, len, is_end); + + if(is_end) { + if(rdf_parser->emitted_default_graph) { + raptor_parser_end_graph(rdf_parser, NULL, 0); + rdf_parser->emitted_default_graph--; + } + } + + return rc; +} + + +static void +raptor_rdfxml_generate_statement(raptor_parser *rdf_parser, + raptor_term *subject_term, + raptor_uri *predicate_uri, + raptor_term *object_term, + raptor_term *reified_term, + raptor_rdfxml_element* bag_element) +{ + raptor_statement *statement = &rdf_parser->statement; + raptor_term* predicate_term = NULL; + int free_reified_term = 0; + + if(rdf_parser->failed) + return; + +#ifdef RAPTOR_DEBUG_VERBOSE + if(!subject_term) + RAPTOR_FATAL1("Statement has no subject\n"); + + if(!predicate_uri) + RAPTOR_FATAL1("Statement has no predicate\n"); + + if(!object_term) + RAPTOR_FATAL1("Statement has no object\n"); + +#endif + + predicate_term = raptor_new_term_from_uri(rdf_parser->world, predicate_uri); + if(!predicate_term) + return; + + statement->subject = subject_term; + statement->predicate = predicate_term; + statement->object = object_term; + +#ifdef RAPTOR_DEBUG_VERBOSE + fprintf(stderr, "raptor_rdfxml_generate_statement: Generating statement: "); + raptor_statement_print(statement, stderr); + fputc('\n', stderr); +#endif + + if(!rdf_parser->emitted_default_graph) { + raptor_parser_start_graph(rdf_parser, NULL, 0); + rdf_parser->emitted_default_graph++; + } + + if(!rdf_parser->statement_handler) + goto generate_tidy; + + /* Generate the statement; or is it a fact? */ + (*rdf_parser->statement_handler)(rdf_parser->user_data, statement); + + + /* the bagID mess */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_BAGID) && + bag_element && bag_element->bag) { + raptor_term* bag = bag_element->bag; + raptor_uri* bag_predicate_uri = NULL; + raptor_term* bag_predicate_term = NULL; + + statement->subject = bag; + + bag_element->last_bag_ordinal++; + + /* new URI object */ + bag_predicate_uri = raptor_new_uri_from_rdf_ordinal(rdf_parser->world, + bag_element->last_bag_ordinal); + if(!bag_predicate_uri) + goto generate_tidy; + + bag_predicate_term = raptor_new_term_from_uri(rdf_parser->world, + bag_predicate_uri); + raptor_free_uri(bag_predicate_uri); + + if(!bag_predicate_term) + goto generate_tidy; + + statement->predicate = bag_predicate_term; + + if(!reified_term || !reified_term->value.blank.string) { + unsigned char *reified_id = NULL; + + /* reified_term is NULL so generate a bag ID */ + reified_id = raptor_world_generate_bnodeid(rdf_parser->world); + if(!reified_id) + goto generate_tidy; + + reified_term = raptor_new_term_from_blank(rdf_parser->world, reified_id); + RAPTOR_FREE(char*, reified_id); + + if(!reified_term) + goto generate_tidy; + free_reified_term = 1; + } + + statement->object = reified_term; + (*rdf_parser->statement_handler)(rdf_parser->user_data, statement); + + if(bag_predicate_term) + raptor_free_term(bag_predicate_term); + } + + + /* return if is there no reified ID (that is valid) */ + if(!reified_term || !reified_term->value.blank.string) + goto generate_tidy; + + + /* otherwise generate reified statements */ + + statement->subject = reified_term; + statement->predicate = RAPTOR_RDF_type_term(rdf_parser->world); + statement->object = RAPTOR_RDF_Statement_term(rdf_parser->world); + (*rdf_parser->statement_handler)(rdf_parser->user_data, statement); + + /* statement->subject = reified_term; */ + statement->predicate = RAPTOR_RDF_subject_term(rdf_parser->world); + statement->object = subject_term; + (*rdf_parser->statement_handler)(rdf_parser->user_data, statement); + + + /* statement->subject = reified_term; */ + statement->predicate = RAPTOR_RDF_predicate_term(rdf_parser->world); + statement->object = predicate_term; + (*rdf_parser->statement_handler)(rdf_parser->user_data, statement); + + /* statement->subject = reified_term; */ + statement->predicate = RAPTOR_RDF_object_term(rdf_parser->world); + statement->object = object_term; + (*rdf_parser->statement_handler)(rdf_parser->user_data, statement); + + + generate_tidy: + /* Tidy up things allocated here */ + if(predicate_term) + raptor_free_term(predicate_term); + if(free_reified_term && reified_term) + raptor_free_term(reified_term); +} + + + +/** + * raptor_rdfxml_element_has_property_attributes: + * @element: element with the property attributes + * + * Return true if the element has at least one property attribute. + * + **/ +static int +raptor_rdfxml_element_has_property_attributes(raptor_rdfxml_element *element) +{ + int i; + + if(element->xml_element->attribute_count > 0) + return 1; + + /* look for rdf: properties */ + for(i = 0; i <= RDF_NS_LAST; i++) { + if(element->rdf_attr[i] && + raptor_rdf_ns_terms_info[i].type != RAPTOR_TERM_TYPE_UNKNOWN) + return 1; + } + return 0; +} + + +/** + * raptor_rdfxml_process_property_attributes: + * @rdf_parser: Raptor parser object + * @attributes_element: element with the property attributes + * @resource_element: element that defines the resource URI + * subject->value etc. + * @property_node_identifier: Use this identifier for the resource URI + * and count any ordinals for it locally + * + * Process the property attributes for an element for a given resource. + * + **/ +static int +raptor_rdfxml_process_property_attributes(raptor_parser *rdf_parser, + raptor_rdfxml_element *attributes_element, + raptor_rdfxml_element *resource_element, + raptor_term *property_node_identifier) +{ + unsigned int i; + raptor_term *resource_identifier; + + resource_identifier = property_node_identifier ? property_node_identifier : resource_element->subject; + + + /* Process attributes as propAttr* = * (propName="string")* + */ + for(i = 0; i < attributes_element->xml_element->attribute_count; i++) { + raptor_qname* attr = attributes_element->xml_element->attributes[i]; + const unsigned char *name; + const unsigned char *value; + int handled = 0; + + if(!attr) + continue; + + name = attr->local_name; + value = attr->value; + + if(!attr->nspace) { + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_error(rdf_parser, + "Using property attribute '%s' without a namespace is forbidden.", + name); + continue; + } + + + if(!raptor_unicode_check_utf8_nfc_string(value, strlen((const char*)value))) { + raptor_log_level l; + + raptor_rdfxml_update_document_locator(rdf_parser); + l = (RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NON_NFC_FATAL)) ? RAPTOR_LOG_LEVEL_ERROR : + RAPTOR_LOG_LEVEL_WARN; + raptor_parser_log_error(rdf_parser, l, + "Property attribute '%s' has a string not in Unicode Normal Form C: %s", + name, value); + continue; + } + + + /* Generate the property statement using one of these properties: + * 1) rdf:_n + * 2) the URI from the rdf:* attribute where allowed + * 3) otherwise forbidden (including rdf:li) + */ + if(attr->nspace->is_rdf_ms) { + /* is rdf: namespace */ + + if(*name == '_') { + int ordinal; + + /* recognise rdf:_ */ + name++; + ordinal = raptor_check_ordinal(name); + if(ordinal < 1) { + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_error(rdf_parser, + "Illegal ordinal value %d in property attribute '%s' seen on containing element '%s'.", + ordinal, attr->local_name, name); + } + } else { + int rc; + + raptor_rdfxml_update_document_locator(rdf_parser); + + rc = raptor_rdfxml_check_propertyAttribute_name((const char*)name); + if(!rc) + raptor_parser_error(rdf_parser, + "RDF term %s is forbidden as a property attribute.", + name); + else if(rc < 0) + raptor_parser_warning(rdf_parser, + "Unknown RDF namespace property attribute '%s'.", + name); + } + + } /* end is RDF namespace property */ + + + if(!handled) { + raptor_term* object_term; + + object_term = raptor_new_term_from_literal(rdf_parser->world, + (unsigned char*)value, + NULL, NULL); + + /* else not rdf: namespace or unknown in rdf: namespace so + * generate a statement with a literal object + */ + raptor_rdfxml_generate_statement(rdf_parser, + resource_identifier, + attr->uri, + object_term, + NULL, /* Property attributes are never reified*/ + resource_element); + + raptor_free_term(object_term); + } + + } /* end for ... attributes */ + + + /* Handle rdf property attributes + * (only rdf:type and rdf:value at present) + */ + for(i = 0; i <= RDF_NS_LAST; i++) { + const unsigned char *value = attributes_element->rdf_attr[i]; + size_t value_len; + int object_is_literal; + raptor_uri *property_uri; + raptor_term* object_term; + + if(!value) + continue; + + value_len = strlen((const char*)value); + + object_is_literal = (raptor_rdf_ns_terms_info[i].type == RAPTOR_TERM_TYPE_LITERAL); + + if(raptor_rdf_ns_terms_info[i].type == RAPTOR_TERM_TYPE_UNKNOWN) { + const char *name = raptor_rdf_ns_terms_info[i].name; + int rc = raptor_rdfxml_check_propertyAttribute_name(name); + if(!rc) { + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_error(rdf_parser, + "RDF term %s is forbidden as a property attribute.", + name); + continue; + } else if(rc < 0) + raptor_parser_warning(rdf_parser, + "Unknown RDF namespace property attribute '%s'.", + name); + } + + if(object_is_literal && + !raptor_unicode_check_utf8_nfc_string(value, value_len)) { + raptor_log_level l; + + raptor_rdfxml_update_document_locator(rdf_parser); + l = (RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NON_NFC_FATAL)) ? RAPTOR_LOG_LEVEL_ERROR : + RAPTOR_LOG_LEVEL_WARN; + + raptor_parser_log_error(rdf_parser, l, + "Property attribute '%s' has a string not in Unicode Normal Form C: %s", + raptor_rdf_ns_terms_info[i].name, value); + continue; + } + + property_uri = raptor_new_uri_for_rdf_concept(rdf_parser->world, + (const unsigned char*)raptor_rdf_ns_terms_info[i].name); + + if(object_is_literal) { + object_term = raptor_new_term_from_literal(rdf_parser->world, + (unsigned char*)value, + NULL, NULL); + } else { + raptor_uri *base_uri; + raptor_uri *object_uri; + base_uri = raptor_rdfxml_inscope_base_uri(rdf_parser); + object_uri = raptor_new_uri_relative_to_base(rdf_parser->world, + base_uri, value); + object_term = raptor_new_term_from_uri(rdf_parser->world, object_uri); + raptor_free_uri(object_uri); + } + + raptor_rdfxml_generate_statement(rdf_parser, + resource_identifier, + property_uri, + object_term, + NULL, /* Property attributes are never reified*/ + resource_element); + + raptor_free_term(object_term); + + raptor_free_uri(property_uri); + + } /* end for rdf:property values */ + + return 0; +} + + +static void +raptor_rdfxml_start_element_grammar(raptor_parser *rdf_parser, + raptor_rdfxml_element *element) +{ + raptor_rdfxml_parser *rdf_xml_parser; + int finished; + raptor_state state; + raptor_xml_element* xml_element; + raptor_qname* el_qname; + const unsigned char *el_name; + int element_in_rdf_ns; + int rc = 0; + raptor_uri* base_uri; + raptor_uri* element_name_uri; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + xml_element = element->xml_element; + el_qname = raptor_xml_element_get_name(xml_element); + el_name = el_qname->local_name; + element_in_rdf_ns = (el_qname->nspace && el_qname->nspace->is_rdf_ms); + base_uri = raptor_rdfxml_inscope_base_uri(rdf_parser); + element_name_uri = el_qname->uri; + + state = element->state; +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Starting in state %s\n", raptor_rdfxml_state_as_string(state)); +#endif + + finished = 0; + while(!finished) { + + switch(state) { + case RAPTOR_STATE_SKIPPING: + element->child_state = state; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PRESERVED; + finished = 1; + break; + + case RAPTOR_STATE_UNKNOWN: + /* found <rdf:RDF> ? */ + + if(element_in_rdf_ns) { + if(raptor_uri_equals(element_name_uri, + RAPTOR_RDF_RDF_URI(rdf_parser->world))) { + element->child_state = RAPTOR_STATE_NODE_ELEMENT_LIST; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_NODES; + /* Yes - need more content before can continue, + * so wait for another element + */ + finished = 1; + break; + } + if(raptor_uri_equals(element_name_uri, + RAPTOR_RDF_Description_URI(rdf_parser->world))) { + state = RAPTOR_STATE_DESCRIPTION; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES; + /* Yes - found something so move immediately to description */ + break; + } + + if(element_in_rdf_ns) { + rc = raptor_rdfxml_check_nodeElement_name((const char*)el_name); + if(!rc) { + raptor_parser_error(rdf_parser, + "rdf:%s is forbidden as a node element.", + el_name); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } else if(rc < 0) { + raptor_parser_warning(rdf_parser, + "rdf:%s is an unknown RDF namespaced element.", + el_name); + } + } + } + + /* If scanning for element, can continue */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_SCANNING)) { + finished = 1; + break; + } + + /* Otherwise the choice of the next state can be made + * from the current element by the OBJ state + */ + state = RAPTOR_STATE_NODE_ELEMENT_LIST; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_NODES; + break; + + + case RAPTOR_STATE_NODE_ELEMENT_LIST: + /* Handling + * http://www.w3.org/TR/rdf-syntax-grammar/#nodeElementList + * + * Everything goes to nodeElement + */ + + state = RAPTOR_STATE_NODE_ELEMENT; + + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES; + + break; + + + + case RAPTOR_STATE_DESCRIPTION: + case RAPTOR_STATE_NODE_ELEMENT: + case RAPTOR_STATE_PARSETYPE_RESOURCE: + case RAPTOR_STATE_PARSETYPE_COLLECTION: + /* Handling <rdf:Description> or other node element + * http://www.w3.org/TR/rdf-syntax-grammar/#nodeElement + * + * or a property element acting as a node element for + * rdf:parseType="Resource" + * http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeResourcePropertyElt + * or rdf:parseType="Collection" (and daml:Collection) + * http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeCollectionPropertyElt + * + * Only create a bag if bagID given + */ + + if(!element_name_uri) { + /* We cannot handle this */ + raptor_parser_warning(rdf_parser, "Using node element '%s' without a namespace is forbidden.", + el_qname->local_name); + raptor_rdfxml_update_document_locator(rdf_parser); + element->state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + + if(element_in_rdf_ns) { + rc = raptor_rdfxml_check_nodeElement_name((const char*)el_name); + if(!rc) { + raptor_parser_error(rdf_parser, + "rdf:%s is forbidden as a node element.", + el_name); + state = RAPTOR_STATE_SKIPPING; + element->state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } else if(rc < 0) { + raptor_parser_warning(rdf_parser, + "rdf:%s is an unknown RDF namespaced element.", + el_name); + } + } + + if(element->content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION && + element->content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION && + element->parent && + (element->parent->state == RAPTOR_STATE_PROPERTYELT || + element->parent->state == RAPTOR_STATE_MEMBER_PROPERTYELT) && + element->parent->xml_element->content_element_seen > 1) { + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_error(rdf_parser, "The enclosing property already has an object"); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + + if(state == RAPTOR_STATE_NODE_ELEMENT || + state == RAPTOR_STATE_DESCRIPTION || + state == RAPTOR_STATE_PARSETYPE_COLLECTION) { + if(element_in_rdf_ns && + raptor_uri_equals(element_name_uri, + RAPTOR_RDF_Description_URI(rdf_parser->world))) + state = RAPTOR_STATE_DESCRIPTION; + else + state = RAPTOR_STATE_NODE_ELEMENT; + } + + + if((element->rdf_attr[RDF_NS_ID]!=NULL) + + (element->rdf_attr[RDF_NS_about]!=NULL) + + (element->rdf_attr[RDF_NS_nodeID]!=NULL) > 1) { + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_error(rdf_parser, "Multiple attributes of rdf:ID, rdf:about and rdf:nodeID on element '%s' - only one allowed.", el_name); + } + + if(element->rdf_attr[RDF_NS_ID]) { + unsigned char* subject_id; + raptor_uri* subject_uri; + + subject_id = (unsigned char*)element->rdf_attr[RDF_NS_ID]; + + if(!raptor_valid_xml_ID(rdf_parser, subject_id)) { + raptor_parser_error(rdf_parser, "Illegal rdf:ID value '%s'", + subject_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + if(raptor_rdfxml_record_ID(rdf_parser, element, subject_id)) { + raptor_parser_error(rdf_parser, "Duplicated rdf:ID value '%s'", + subject_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + + /* after this, subject_id is the owner of the ID string */ + element->rdf_attr[RDF_NS_ID] = NULL; + + subject_uri = raptor_new_uri_from_id(rdf_parser->world, base_uri, + subject_id); + RAPTOR_FREE(char*, subject_id); + + if(!subject_uri) + goto oom; + element->subject = raptor_new_term_from_uri(rdf_parser->world, + subject_uri); + raptor_free_uri(subject_uri); + + if(!element->subject) + goto oom; + + } else if(element->rdf_attr[RDF_NS_about]) { + raptor_uri* subject_uri; + + subject_uri = raptor_new_uri_relative_to_base(rdf_parser->world, + base_uri, + (const unsigned char*)element->rdf_attr[RDF_NS_about]); + if(!subject_uri) + goto oom; + + element->subject = raptor_new_term_from_uri(rdf_parser->world, + subject_uri); + raptor_free_uri(subject_uri); + + RAPTOR_FREE(char*, element->rdf_attr[RDF_NS_about]); + element->rdf_attr[RDF_NS_about] = NULL; + if(!element->subject) + goto oom; + + } else if(element->rdf_attr[RDF_NS_nodeID]) { + unsigned char* subject_id; + subject_id = raptor_world_internal_generate_id(rdf_parser->world, + (unsigned char*)element->rdf_attr[RDF_NS_nodeID]); + if(!subject_id) + goto oom; + + element->subject = raptor_new_term_from_blank(rdf_parser->world, + subject_id); + RAPTOR_FREE(char*, subject_id); + + element->rdf_attr[RDF_NS_nodeID] = NULL; + if(!element->subject) + goto oom; + + if(!raptor_valid_xml_ID(rdf_parser, element->subject->value.blank.string)) { + raptor_parser_error(rdf_parser, "Illegal rdf:nodeID value '%s'", + (const char*)element->subject->value.blank.string); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + } else if(element->parent && + element->parent->child_content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION && + element->parent->child_content_type != RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION && + element->parent->object) { + /* copy from parent (property element), it has a URI for us */ + element->subject = raptor_term_copy(element->parent->object); + } else { + unsigned char* subject_id; + subject_id = raptor_world_generate_bnodeid(rdf_parser->world); + if(!subject_id) + goto oom; + + element->subject = raptor_new_term_from_blank(rdf_parser->world, + subject_id); + RAPTOR_FREE(char*, subject_id); + + if(!element->subject) + goto oom; + } + + + if(element->rdf_attr[RDF_NS_bagID]) { + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_BAGID)) { + unsigned char* bag_id; + raptor_uri* bag_uri = NULL; + + bag_id = (unsigned char*)element->rdf_attr[RDF_NS_bagID]; + element->rdf_attr[RDF_NS_bagID] = NULL; + + bag_uri = raptor_new_uri_from_id(rdf_parser->world, + base_uri, bag_id); + if(!bag_uri) { + RAPTOR_FREE(char*, bag_id); + goto oom; + } + + element->bag = raptor_new_term_from_uri(rdf_parser->world, bag_uri); + raptor_free_uri(bag_uri); + + if(!raptor_valid_xml_ID(rdf_parser, bag_id)) { + raptor_parser_error(rdf_parser, "Illegal rdf:bagID value '%s'", + bag_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + RAPTOR_FREE(char*, bag_id); + break; + } + if(raptor_rdfxml_record_ID(rdf_parser, element, bag_id)) { + raptor_parser_error(rdf_parser, "Duplicated rdf:bagID value '%s'", + bag_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + RAPTOR_FREE(char*, bag_id); + break; + } + + RAPTOR_FREE(char*, bag_id); + raptor_parser_warning(rdf_parser, "rdf:bagID is deprecated."); + + + raptor_rdfxml_generate_statement(rdf_parser, + element->bag, + RAPTOR_RDF_type_URI(rdf_parser->world), + RAPTOR_RDF_Bag_term(rdf_parser->world), + NULL, + NULL); + } else { + /* bagID forbidden */ + raptor_parser_error(rdf_parser, "rdf:bagID is forbidden."); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + } + + + if(element->parent) { + + /* In a rdf:parseType="Collection" the resources are appended + * to the list at the genid element->parent->tail_id + */ + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION || + element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) { + /* <idList> rdf:type rdf:List */ + const unsigned char * idList; + raptor_uri *predicate_uri; + raptor_term* idList_term; + raptor_term* object_term; + + idList = raptor_world_generate_bnodeid(rdf_parser->world); + if(!idList) + goto oom; + /* idList string is saved below in element->parent->tail_id */ + + idList_term = raptor_new_term_from_blank(rdf_parser->world, idList); + if(!idList_term) { + RAPTOR_FREE(char*, idList); + goto oom; + } + + if((element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) || + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_RDF_TYPE_RDF_LIST)) { + raptor_uri* class_uri = NULL; + + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) { + class_uri = RAPTOR_DAML_List_URI(rdf_xml_parser); + object_term = raptor_new_term_from_uri(rdf_parser->world, + class_uri); + } else + object_term = raptor_term_copy(RAPTOR_RDF_List_term(rdf_parser->world)); + + raptor_rdfxml_generate_statement(rdf_parser, + idList_term, + RAPTOR_RDF_type_URI(rdf_parser->world), + object_term, + NULL, + element); + raptor_free_term(object_term); + } + + predicate_uri = (element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) ? RAPTOR_DAML_first_URI(rdf_xml_parser) : RAPTOR_RDF_first_URI(rdf_parser->world); + + /* <idList> rdf:first <element->uri> */ + raptor_rdfxml_generate_statement(rdf_parser, + idList_term, + predicate_uri, + element->subject, + NULL, + NULL); + + /* If there is no rdf:parseType="Collection" */ + if(!element->parent->tail_id) { + /* Free any existing object still around. + * I suspect this can never happen. + */ + if(element->parent->object) + raptor_free_term(element->parent->object); + + element->parent->object = raptor_new_term_from_blank(rdf_parser->world, + idList); + } else { + raptor_term* tail_id_term; + + tail_id_term = raptor_new_term_from_blank(rdf_parser->world, + element->parent->tail_id); + + predicate_uri = (element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) ? RAPTOR_DAML_rest_URI(rdf_xml_parser) : RAPTOR_RDF_rest_URI(rdf_parser->world); + + /* _:tail_id rdf:rest _:listRest */ + raptor_rdfxml_generate_statement(rdf_parser, + tail_id_term, + predicate_uri, + idList_term, + NULL, + NULL); + + raptor_free_term(tail_id_term); + } + + /* update new tail */ + if(element->parent->tail_id) + RAPTOR_FREE(char*, (char*)element->parent->tail_id); + + element->parent->tail_id = idList; + + raptor_free_term(idList_term); + } else if(element->parent->state != RAPTOR_STATE_UNKNOWN && + element->state != RAPTOR_STATE_PARSETYPE_RESOURCE) { + /* If there is a parent element (property) containing this + * element (node) and it has no object, set it from this subject + */ + + if(element->parent->object) { + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_error(rdf_parser, + "Tried to set multiple objects of a statement"); + } else { + /* Store URI of this node in our parent as the property object */ + element->parent->object = raptor_term_copy(element->subject); + element->parent->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + } + + } + } + + + /* If this is a node element, generate the rdf:type statement + * from this node + */ + if(state == RAPTOR_STATE_NODE_ELEMENT) { + raptor_term* el_name_term; + + el_name_term = raptor_new_term_from_uri(rdf_parser->world, + element_name_uri); + + raptor_rdfxml_generate_statement(rdf_parser, + element->subject, + RAPTOR_RDF_type_URI(rdf_parser->world), + el_name_term, + element->reified, + element); + + raptor_free_term(el_name_term); + } + + if(raptor_rdfxml_process_property_attributes(rdf_parser, element, + element, NULL)) + goto oom; + + /* for both productions now need some more content or + * property elements before can do any more work. + */ + + element->child_state = RAPTOR_STATE_PROPERTYELT; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES; + finished = 1; + break; + + + case RAPTOR_STATE_PARSETYPE_OTHER: + /* FALLTHROUGH */ + + case RAPTOR_STATE_PARSETYPE_LITERAL: + raptor_xml_writer_start_element(rdf_xml_parser->xml_writer, xml_element); + element->child_state = RAPTOR_STATE_PARSETYPE_LITERAL; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL; + + finished = 1; + break; + + /* Handle all the detail of the various options of property element + * http://www.w3.org/TR/rdf-syntax-grammar/#propertyElt + * + * All the attributes must be scanned here to see what additional + * property element work is needed. No triples are generated + * until the end of this element, until it is clear if the + * element was empty. + */ + case RAPTOR_STATE_MEMBER_PROPERTYELT: + case RAPTOR_STATE_PROPERTYELT: + + if(!element_name_uri) { + raptor_parser_error(rdf_parser, "Using property element '%s' without a namespace is forbidden.", + raptor_xml_element_get_name(element->parent->xml_element)->local_name); + raptor_rdfxml_update_document_locator(rdf_parser); + element->state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + + /* Handling rdf:li as a property, noting special processing */ + if(element_in_rdf_ns && + raptor_uri_equals(element_name_uri, + RAPTOR_RDF_li_URI(rdf_parser->world))) { + state = RAPTOR_STATE_MEMBER_PROPERTYELT; + } + + + if(element_in_rdf_ns) { + rc = raptor_rdfxml_check_propertyElement_name((const char*)el_name); + if(!rc) { + raptor_parser_error(rdf_parser, + "rdf:%s is forbidden as a property element.", + el_name); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } else if(rc < 0) { + raptor_parser_warning(rdf_parser, + "rdf:%s is an unknown RDF namespaced element.", + el_name); + } + } + + + /* rdf:ID on a property element - reify a statement. + * Allowed on all property element forms + */ + if(element->rdf_attr[RDF_NS_ID]) { + raptor_uri *reified_uri; + + element->reified_id = element->rdf_attr[RDF_NS_ID]; + element->rdf_attr[RDF_NS_ID] = NULL; + reified_uri = raptor_new_uri_from_id(rdf_parser->world, base_uri, + element->reified_id); + if(!reified_uri) + goto oom; + + element->reified = raptor_new_term_from_uri(rdf_parser->world, + reified_uri); + raptor_free_uri(reified_uri); + + if(!element->reified) + goto oom; + + if(!raptor_valid_xml_ID(rdf_parser, element->reified_id)) { + raptor_parser_error(rdf_parser, "Illegal rdf:ID value '%s'", + element->reified_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + if(raptor_rdfxml_record_ID(rdf_parser, element, element->reified_id)) { + raptor_parser_error(rdf_parser, "Duplicated rdf:ID value '%s'", + element->reified_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + } + + /* rdf:datatype on a property element. + * Only allowed for + * http://www.w3.org/TR/rdf-syntax-grammar/#literalPropertyElt + */ + if(element->rdf_attr[RDF_NS_datatype]) { + raptor_uri *datatype_uri; + + datatype_uri = raptor_new_uri_relative_to_base(rdf_parser->world, + base_uri, + (const unsigned char*)element->rdf_attr[RDF_NS_datatype]); + element->object_literal_datatype = datatype_uri; + RAPTOR_FREE(char*, element->rdf_attr[RDF_NS_datatype]); + element->rdf_attr[RDF_NS_datatype] = NULL; + if(!element->object_literal_datatype) + goto oom; + } + + if(element->rdf_attr[RDF_NS_bagID]) { + + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_BAGID)) { + + if(element->rdf_attr[RDF_NS_resource] || + element->rdf_attr[RDF_NS_parseType]) { + + raptor_parser_error(rdf_parser, "rdf:bagID is forbidden on property element '%s' with an rdf:resource or rdf:parseType attribute.", el_name); + /* prevent this being used later either */ + RAPTOR_FREE(char*, element->rdf_attr[RDF_NS_bagID]); + element->rdf_attr[RDF_NS_bagID] = NULL; + } else { + unsigned char* bag_id; + raptor_uri* bag_uri; + + bag_id = (unsigned char*)element->rdf_attr[RDF_NS_bagID]; + element->rdf_attr[RDF_NS_bagID] = NULL; + bag_uri = raptor_new_uri_from_id(rdf_parser->world, base_uri, + bag_id); + if(!bag_uri) { + RAPTOR_FREE(char*, bag_id); + goto oom; + } + + element->bag = raptor_new_term_from_uri(rdf_parser->world, + bag_uri); + raptor_free_uri(bag_uri); + + if(!element->bag) { + RAPTOR_FREE(char*, bag_id); + goto oom; + } + + if(!raptor_valid_xml_ID(rdf_parser, bag_id)) { + raptor_parser_error(rdf_parser, "Illegal rdf:bagID value '%s'", + bag_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + RAPTOR_FREE(char*, bag_id); + break; + } + if(raptor_rdfxml_record_ID(rdf_parser, element, bag_id)) { + raptor_parser_error(rdf_parser, + "Duplicated rdf:bagID value '%s'", bag_id); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + RAPTOR_FREE(char*, bag_id); + finished = 1; + break; + } + + RAPTOR_FREE(char*, bag_id); + raptor_parser_warning(rdf_parser, "rdf:bagID is deprecated."); + } + } else { + /* bagID forbidden */ + raptor_parser_error(rdf_parser, "rdf:bagID is forbidden."); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + } /* if rdf:bagID on property element */ + + + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTY_CONTENT; + + if(element->rdf_attr[RDF_NS_parseType]) { + const unsigned char *parse_type; + int i; + int is_parseType_Literal = 0; + + parse_type = element->rdf_attr[RDF_NS_parseType]; + + if(raptor_rdfxml_element_has_property_attributes(element)) { + raptor_parser_error(rdf_parser, "Property attributes cannot be used with rdf:parseType='%s'", parse_type); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + finished = 1; + break; + } + + /* Check for bad combinations of things with parseType */ + for(i = 0; i <= RDF_NS_LAST; i++) + if(element->rdf_attr[i] && i != RDF_NS_parseType) { + raptor_parser_error(rdf_parser, "Attribute '%s' cannot be used with rdf:parseType='%s'", raptor_rdf_ns_terms_info[i].name, parse_type); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + break; + } + + + if(!strcmp((char*)parse_type, "Literal")) + is_parseType_Literal = 1; + else if(!strcmp((char*)parse_type, "Resource")) { + unsigned char* subject_id; + + state = RAPTOR_STATE_PARSETYPE_RESOURCE; + element->child_state = RAPTOR_STATE_PROPERTYELT; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES; + + /* create a node for the subject of the contained properties */ + subject_id = raptor_world_generate_bnodeid(rdf_parser->world); + if(!subject_id) + goto oom; + + element->subject = raptor_new_term_from_blank(rdf_parser->world, + subject_id); + RAPTOR_FREE(char*, subject_id); + + if(!element->subject) + goto oom; + } else if(!strcmp((char*)parse_type, "Collection")) { + /* An rdf:parseType="Collection" appears as a single node */ + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + element->child_state = RAPTOR_STATE_PARSETYPE_COLLECTION; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION; + } else { + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_OTHER_PARSETYPES) && + !raptor_strcasecmp((char*)parse_type, "daml:collection")) { + /* A DAML collection appears as a single node */ + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + element->child_state = RAPTOR_STATE_PARSETYPE_COLLECTION; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION; + } else { + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_WARN_OTHER_PARSETYPES)) { + raptor_parser_warning(rdf_parser, "Unknown rdf:parseType value '%s' taken as 'Literal'", parse_type); + } + is_parseType_Literal = 1; + } + + } + + if(is_parseType_Literal) { + raptor_xml_writer* xml_writer; + + /* rdf:parseType="Literal" - explicitly or default + * if the parseType value is not recognised + */ + rdf_xml_parser->xml_content = NULL; + rdf_xml_parser->xml_content_length = 0; + rdf_xml_parser->iostream = + raptor_new_iostream_to_string(rdf_parser->world, + &rdf_xml_parser->xml_content, + &rdf_xml_parser->xml_content_length, + raptor_alloc_memory); + if(!rdf_xml_parser->iostream) + goto oom; + xml_writer = raptor_new_xml_writer(rdf_parser->world, NULL, + rdf_xml_parser->iostream); + rdf_xml_parser->xml_writer = xml_writer; + if(!rdf_xml_parser->xml_writer) + goto oom; + + raptor_xml_writer_set_option(rdf_xml_parser->xml_writer, + RAPTOR_OPTION_WRITER_XML_DECLARATION, + NULL, 0); + + element->child_state = RAPTOR_STATE_PARSETYPE_LITERAL; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL; + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL; + } + } else { + + /* Can only be the empty property element case + * http://www.w3.org/TR/rdf-syntax-grammar/#emptyPropertyElt + */ + + /* The presence of the rdf:resource or rdf:nodeID + * attributes is checked at element close time + */ + + /* + * Assign reified URI here so we don't reify property attributes + * using this id + */ + if(element->reified_id && !element->reified) { + raptor_uri* reified_uri; + reified_uri = raptor_new_uri_from_id(rdf_parser->world, base_uri, + element->reified_id); + if(!reified_uri) + goto oom; + element->reified = raptor_new_term_from_uri(rdf_parser->world, + reified_uri); + raptor_free_uri(reified_uri); + + if(!element->reified) + goto oom; + } + + if(element->rdf_attr[RDF_NS_resource] || + element->rdf_attr[RDF_NS_nodeID]) { + /* Done - wait for end of this element to end in order to + * check the element was empty as expected */ + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + } else { + /* Otherwise process content in obj (value) state */ + element->child_state = RAPTOR_STATE_NODE_ELEMENT_LIST; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTY_CONTENT; + } + } + + finished = 1; + + break; + + + case RAPTOR_STATE_INVALID: + default: + raptor_parser_fatal_error(rdf_parser, + "%s Internal error - unexpected parser state %u - %s", + __FUNCTION__, + state, raptor_rdfxml_state_as_string(state)); + finished = 1; + + } /* end switch */ + + if(state != element->state) { + element->state = state; +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG3("Moved to state %d - %s\n", state, + raptor_rdfxml_state_as_string(state)); +#endif + } + + } /* end while */ + +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Ending in state %s\n", raptor_rdfxml_state_as_string(state)); +#endif + + return; + + oom: + raptor_parser_fatal_error(rdf_parser, "Out of memory, skipping"); + element->state = RAPTOR_STATE_SKIPPING; +} + + +static void +raptor_rdfxml_end_element_grammar(raptor_parser *rdf_parser, + raptor_rdfxml_element *element) +{ + raptor_rdfxml_parser *rdf_xml_parser; + raptor_state state; + int finished; + raptor_xml_element* xml_element = element->xml_element; + raptor_qname* el_qname; + const unsigned char *el_name; + int element_in_rdf_ns; + raptor_uri* element_name_uri; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + el_qname = raptor_xml_element_get_name(xml_element); + el_name = el_qname->local_name; + element_in_rdf_ns= (el_qname->nspace && el_qname->nspace->is_rdf_ms); + element_name_uri = el_qname->uri; + + + state = element->state; +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Starting in state %s\n", raptor_rdfxml_state_as_string(state)); +#endif + + finished= 0; + while(!finished) { + switch(state) { + case RAPTOR_STATE_SKIPPING: + finished = 1; + break; + + case RAPTOR_STATE_UNKNOWN: + finished = 1; + break; + + case RAPTOR_STATE_NODE_ELEMENT_LIST: + if(element_in_rdf_ns && + raptor_uri_equals(element_name_uri, + RAPTOR_RDF_RDF_URI(rdf_parser->world))) { + /* end of RDF - boo hoo */ + state = RAPTOR_STATE_UNKNOWN; + finished = 1; + break; + } + /* When scanning, another element ending is outside the RDF + * world so this can happen without further work + */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_SCANNING)) { + state = RAPTOR_STATE_UNKNOWN; + finished = 1; + break; + } + /* otherwise found some junk after RDF content in an RDF-only + * document (probably never get here since this would be + * a mismatched XML tag and cause an error earlier) + */ + raptor_rdfxml_update_document_locator(rdf_parser); + raptor_parser_warning(rdf_parser, + "Element '%s' ended, expected end of RDF element", + el_name); + state = RAPTOR_STATE_UNKNOWN; + finished = 1; + break; + + + case RAPTOR_STATE_DESCRIPTION: + case RAPTOR_STATE_NODE_ELEMENT: + case RAPTOR_STATE_PARSETYPE_RESOURCE: + + /* If there is a parent element containing this element and + * the parent isn't a description, has an identifier, + * create the statement between this node using parent property + * (Need to check for identifier so that top-level typed nodes + * don't get connect to <rdf:RDF> parent element) + */ + if(state == RAPTOR_STATE_NODE_ELEMENT && + element->parent && element->parent->subject) { + raptor_rdfxml_generate_statement(rdf_parser, + element->parent->subject, + element_name_uri, + element->subject, + NULL, + element); + } else if(state == RAPTOR_STATE_PARSETYPE_RESOURCE && + element->parent && element->parent->subject) { + /* Handle rdf:li as the rdf:parseType="resource" property */ + if(element_in_rdf_ns && + raptor_uri_equals(element_name_uri, + RAPTOR_RDF_li_URI(rdf_parser->world))) { + raptor_uri* ordinal_predicate_uri; + + element->parent->last_ordinal++; + ordinal_predicate_uri = raptor_new_uri_from_rdf_ordinal(rdf_parser->world, element->parent->last_ordinal); + + raptor_rdfxml_generate_statement(rdf_parser, + element->parent->subject, + ordinal_predicate_uri, + element->subject, + element->reified, + element->parent); + raptor_free_uri(ordinal_predicate_uri); + } else { + raptor_rdfxml_generate_statement(rdf_parser, + element->parent->subject, + element_name_uri, + element->subject, + element->reified, + element->parent); + } + } + finished = 1; + break; + + case RAPTOR_STATE_PARSETYPE_COLLECTION: + + finished = 1; + break; + + case RAPTOR_STATE_PARSETYPE_OTHER: + /* FALLTHROUGH */ + + case RAPTOR_STATE_PARSETYPE_LITERAL: + element->parent->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL; + + raptor_xml_writer_end_element(rdf_xml_parser->xml_writer, xml_element); + + finished = 1; + break; + + + case RAPTOR_STATE_PROPERTYELT: + case RAPTOR_STATE_MEMBER_PROPERTYELT: + /* A property element + * http://www.w3.org/TR/rdf-syntax-grammar/#propertyElt + * + * Literal content part is handled here. + * The element content is handled in the internal states + * Empty content is checked here. + */ + + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTY_CONTENT) { + if(xml_element->content_cdata_seen) + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL; + else if(xml_element->content_element_seen) + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES; + else { + /* Empty Literal */ + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL; + } + + } + + + /* Handle terminating a rdf:parseType="Collection" list */ + if(element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION || + element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) { + raptor_term* nil_term; + + if(element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) { + raptor_uri* nil_uri = RAPTOR_DAML_nil_URI(rdf_xml_parser); + nil_term = raptor_new_term_from_uri(rdf_parser->world, nil_uri); + } else { + nil_term = raptor_term_copy(RAPTOR_RDF_nil_term(rdf_parser->world)); + } + + if(!element->tail_id) { + /* If No List: set object of statement to rdf:nil */ + element->object = raptor_term_copy(nil_term); + } else { + raptor_uri* rest_uri = NULL; + raptor_term* tail_id_term; + + if(element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION) + rest_uri = RAPTOR_DAML_rest_URI(rdf_xml_parser); + else + rest_uri = RAPTOR_RDF_rest_URI(rdf_parser->world); + + tail_id_term = raptor_new_term_from_blank(rdf_parser->world, + element->tail_id); + + /* terminate the list */ + raptor_rdfxml_generate_statement(rdf_parser, + tail_id_term, + rest_uri, + nil_term, + NULL, + NULL); + + raptor_free_term(tail_id_term); + } + + raptor_free_term(nil_term); + + } /* end rdf:parseType="Collection" termination */ + + +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG3("Content type %s (%d)\n", + raptor_rdfxml_element_content_type_as_string(element->content_type), + element->content_type); +#endif + + switch(element->content_type) { + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE: + + if(raptor_rdfxml_element_has_property_attributes(element) && + element->child_state == RAPTOR_STATE_DESCRIPTION) { + raptor_parser_error(rdf_parser, + "Property element '%s' has both property attributes and a node element content", + el_name); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + break; + } + + if(!element->object) { + if(element->rdf_attr[RDF_NS_resource]) { + raptor_uri* resource_uri; + resource_uri = raptor_new_uri_relative_to_base(rdf_parser->world, + raptor_rdfxml_inscope_base_uri(rdf_parser), + (const unsigned char*)element->rdf_attr[RDF_NS_resource]); + if(!resource_uri) + goto oom; + + element->object = raptor_new_term_from_uri(rdf_parser->world, + resource_uri); + raptor_free_uri(resource_uri); + + RAPTOR_FREE(char*, element->rdf_attr[RDF_NS_resource]); + element->rdf_attr[RDF_NS_resource] = NULL; + if(!element->object) + goto oom; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + } else if(element->rdf_attr[RDF_NS_nodeID]) { + unsigned char* resource_id; + resource_id = raptor_world_internal_generate_id(rdf_parser->world, + (unsigned char*)element->rdf_attr[RDF_NS_nodeID]); + if(!resource_id) + goto oom; + + element->object = raptor_new_term_from_blank(rdf_parser->world, + resource_id); + RAPTOR_FREE(char*, resource_id); + element->rdf_attr[RDF_NS_nodeID] = NULL; + if(!element->object) + goto oom; + + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + if(!raptor_valid_xml_ID(rdf_parser, + element->object->value.blank.string)) { + raptor_parser_error(rdf_parser, "Illegal rdf:nodeID value '%s'", (const char*)element->object->value.blank.string); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + break; + } + } else { + unsigned char* resource_id; + resource_id = raptor_world_generate_bnodeid(rdf_parser->world); + if(!resource_id) + goto oom; + + element->object = raptor_new_term_from_blank(rdf_parser->world, + resource_id); + RAPTOR_FREE(char*, resource_id); + + if(!element->object) + goto oom; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + } + + if(raptor_rdfxml_process_property_attributes(rdf_parser, element, + element->parent, + element->object)) + goto oom; + + } + + /* We know object is a resource, so delete any unsignficant + * whitespace so that FALLTHROUGH code below finds the object. + */ + if(xml_element->content_cdata_length) { + raptor_free_stringbuffer(xml_element->content_cdata_sb); + xml_element->content_cdata_sb = NULL; + xml_element->content_cdata_length = 0; + } + + /* FALLTHROUGH */ + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL: + + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL) { + + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_ALLOW_BAGID)) { + /* Only an empty literal can have a rdf:bagID */ + if(element->bag) { + if(xml_element->content_cdata_length > 0) { + raptor_parser_error(rdf_parser, + "rdf:bagID is forbidden on a literal property element '%s'.", + el_name); + + /* prevent this being used later either */ + element->rdf_attr[RDF_NS_bagID] = NULL; + } else { + raptor_rdfxml_generate_statement(rdf_parser, + element->bag, + RAPTOR_RDF_type_URI(rdf_parser->world), + RAPTOR_RDF_Bag_term(rdf_parser->world), + NULL, + NULL); + } + } + } /* if rdf:bagID */ + + /* If there is empty literal content with properties + * generate a node to hang properties off + */ + if(raptor_rdfxml_element_has_property_attributes(element) && + xml_element->content_cdata_length > 0) { + raptor_parser_error(rdf_parser, + "Literal property element '%s' has property attributes", + el_name); + state = RAPTOR_STATE_SKIPPING; + element->child_state = RAPTOR_STATE_SKIPPING; + break; + } + + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL && + raptor_rdfxml_element_has_property_attributes(element) && + !element->object) { + unsigned char* object_id; + object_id = raptor_world_generate_bnodeid(rdf_parser->world); + if(!object_id) + goto oom; + + element->object = raptor_new_term_from_blank(rdf_parser->world, + object_id); + RAPTOR_FREE(char*, object_id); + + if(!element->object) + goto oom; + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_RESOURCE; + } + + if(raptor_rdfxml_process_property_attributes(rdf_parser, element, + element, + element->object)) + goto oom; + } + + + /* just be friendly to older compilers and don't declare + * variables in the middle of a block + */ + if(1) { + raptor_uri *predicate_uri = NULL; + int predicate_ordinal = -1; + raptor_term* object_term = NULL; + + if(state == RAPTOR_STATE_MEMBER_PROPERTYELT) { + predicate_ordinal = ++element->parent->last_ordinal; + predicate_uri = raptor_new_uri_from_rdf_ordinal(rdf_parser->world, + predicate_ordinal); + + } else { + predicate_uri = element_name_uri; + } + + + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL) { + unsigned char* literal = NULL; + raptor_uri* literal_datatype; + unsigned char* literal_language = NULL; + + /* an empty stringbuffer - empty CDATA - is OK */ + if(raptor_stringbuffer_length(xml_element->content_cdata_sb)) { + literal = raptor_stringbuffer_as_string(xml_element->content_cdata_sb); + if(!literal) + goto oom; + } + + literal_datatype = element->object_literal_datatype; + if(!literal_datatype) + literal_language = (unsigned char*)raptor_sax2_inscope_xml_language(rdf_xml_parser->sax2); + + if(!literal_datatype && literal && + !raptor_unicode_check_utf8_nfc_string(literal, + xml_element->content_cdata_length)) { + raptor_log_level l; + + raptor_rdfxml_update_document_locator(rdf_parser); + l = (RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NON_NFC_FATAL)) ? RAPTOR_LOG_LEVEL_ERROR : + RAPTOR_LOG_LEVEL_WARN; + + raptor_parser_log_error(rdf_parser, l, + "Property element '%s' has a string not in Unicode Normal Form C: %s", + el_name, literal); + } + + object_term = raptor_new_term_from_literal(rdf_parser->world, + literal, + literal_datatype, + literal_language); + } else { + object_term = raptor_term_copy(element->object); + } + + raptor_rdfxml_generate_statement(rdf_parser, + element->parent->subject, + predicate_uri, + object_term, + element->reified, + element->parent); + + if(predicate_ordinal >= 0) + raptor_free_uri(predicate_uri); + + raptor_free_term(object_term); + } + + break; + + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PRESERVED: + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL: + { + unsigned char *buffer; + size_t length; + raptor_term* xmlliteral_term = NULL; + + if(rdf_xml_parser->xml_writer) { + raptor_xml_writer_flush(rdf_xml_parser->xml_writer); + + raptor_free_iostream(rdf_xml_parser->iostream); + rdf_xml_parser->iostream = NULL; + + buffer = (unsigned char*)rdf_xml_parser->xml_content; + length = rdf_xml_parser->xml_content_length; + } else { + buffer = raptor_stringbuffer_as_string(xml_element->content_cdata_sb); + length = xml_element->content_cdata_length; + } + + if(!raptor_unicode_check_utf8_nfc_string(buffer, length)) { + raptor_log_level l; + + raptor_rdfxml_update_document_locator(rdf_parser); + l = (RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NON_NFC_FATAL)) ? RAPTOR_LOG_LEVEL_ERROR : + RAPTOR_LOG_LEVEL_WARN; + + raptor_parser_log_error(rdf_parser, l, + "Property element '%s' has XML literal content not in Unicode Normal Form C: %s", + el_name, buffer); + } + + xmlliteral_term = raptor_new_term_from_literal(rdf_parser->world, + buffer, + RAPTOR_RDF_XMLLiteral_URI(rdf_parser->world), + NULL); + + if(state == RAPTOR_STATE_MEMBER_PROPERTYELT) { + raptor_uri* predicate_uri; + + element->parent->last_ordinal++; + predicate_uri = raptor_new_uri_from_rdf_ordinal(rdf_parser->world, element->parent->last_ordinal); + + raptor_rdfxml_generate_statement(rdf_parser, + element->parent->subject, + predicate_uri, + xmlliteral_term, + element->reified, + element->parent); + + raptor_free_uri(predicate_uri); + } else { + raptor_rdfxml_generate_statement(rdf_parser, + element->parent->subject, + element_name_uri, + xmlliteral_term, + element->reified, + element->parent); + } + + raptor_free_term(xmlliteral_term); + + /* Finish the xml writer iostream for parseType="Literal" */ + if(rdf_xml_parser->xml_writer) { + raptor_free_xml_writer(rdf_xml_parser->xml_writer); + rdf_xml_parser->xml_writer = NULL; + RAPTOR_FREE(char*, rdf_xml_parser->xml_content); + rdf_xml_parser->xml_content = NULL; + rdf_xml_parser->xml_content_length = 0; + } + } + + break; + + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_COLLECTION: + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_DAML_COLLECTION: + + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_NODES: + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES: + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTY_CONTENT: + + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_UNKNOWN: + case RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LAST: + default: + raptor_parser_fatal_error(rdf_parser, + "%s: Internal error in state RAPTOR_STATE_PROPERTYELT - got unexpected content type %s (%u)", + __FUNCTION__, + raptor_rdfxml_element_content_type_as_string(element->content_type), + element->content_type); + } /* end switch */ + + finished = 1; + break; + + case RAPTOR_STATE_INVALID: + default: + raptor_parser_fatal_error(rdf_parser, + "%s: Internal error - unexpected parser state %u - %s", + __FUNCTION__, + state, + raptor_rdfxml_state_as_string(state)); + finished = 1; + + } /* end switch */ + + if(state != element->state) { + element->state = state; +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG3("Moved to state %d - %s\n", state, + raptor_rdfxml_state_as_string(state)); +#endif + } + + } /* end while */ + +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Ending in state %s\n", raptor_rdfxml_state_as_string(state)); +#endif + + return; + + oom: + raptor_parser_fatal_error(rdf_parser, "Out of memory, skipping"); + element->state = RAPTOR_STATE_SKIPPING; +} + + + +static void +raptor_rdfxml_cdata_grammar(raptor_parser *rdf_parser, + const unsigned char *s, int len, + int is_cdata) +{ + raptor_rdfxml_parser* rdf_xml_parser; + raptor_rdfxml_element* element; + raptor_xml_element* xml_element; + raptor_state state; + int all_whitespace = 1; + int i; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + if(rdf_parser->failed) + return; + +#ifdef RAPTOR_DEBUG_CDATA + RAPTOR_DEBUG2("Adding characters (is_cdata=%d): '", is_cdata); + (void)fwrite(s, 1, len, stderr); + fprintf(stderr, "' (%d bytes)\n", len); +#endif + + for(i = 0; i < len; i++) + if(!isspace(s[i])) { + all_whitespace = 0; + break; + } + + element = rdf_xml_parser->current_element; + + /* this file is very broke - probably not XML, whatever */ + if(!element) + return; + + xml_element = element->xml_element; + + raptor_rdfxml_update_document_locator(rdf_parser); + + /* cdata never changes the parser state + * and the containing element state always determines what to do. + * Use the child_state first if there is one, since that applies + */ + state = element->child_state; +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Working in state %s\n", raptor_rdfxml_state_as_string(state)); +#endif + + +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG3("Content type %s (%d)\n", + raptor_rdfxml_element_content_type_as_string(element->content_type), + element->content_type); +#endif + + + + if(state == RAPTOR_STATE_SKIPPING) + return; + + if(state == RAPTOR_STATE_UNKNOWN) { + /* Ignore all cdata if still looking for RDF */ + if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_SCANNING)) + return; + + /* Ignore all whitespace cdata before first element */ + if(all_whitespace) + return; + + /* This probably will never happen since that would make the + * XML not be well-formed + */ + raptor_parser_warning(rdf_parser, "Character data before RDF element."); + } + + + if(element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTIES) { + /* If found non-whitespace content, move to literal content */ + if(!all_whitespace) + element->child_content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL; + } + + + if(!rdf_content_type_info[element->child_content_type].whitespace_significant) { + + /* Whitespace is ignored except for literal or preserved content types */ + if(all_whitespace) { +#ifdef RAPTOR_DEBUG_CDATA + RAPTOR_DEBUG2("Ignoring whitespace cdata inside element '%s'\n", + raptor_xml_element_get_name(element->parent->xml_element)->local_name); +#endif + return; + } + + if(xml_element->content_cdata_seen && xml_element->content_element_seen) { + raptor_qname* parent_el_name; + + parent_el_name = raptor_xml_element_get_name(element->parent->xml_element); + /* Uh oh - mixed content, this element has elements too */ + raptor_parser_warning(rdf_parser, "element '%s' has mixed content.", + parent_el_name->local_name); + } + } + + + if(element->content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_PROPERTY_CONTENT) { + element->content_type = RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_LITERAL; +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG3("Content type changed to %s (%d)\n", + raptor_rdfxml_element_content_type_as_string(element->content_type), + element->content_type); +#endif + } + + if(element->child_content_type == RAPTOR_RDFXML_ELEMENT_CONTENT_TYPE_XML_LITERAL) + raptor_xml_writer_cdata_counted(rdf_xml_parser->xml_writer, s, len); + else { + raptor_stringbuffer_append_counted_string(xml_element->content_cdata_sb, + s, len, 1); + element->content_cdata_all_whitespace &= all_whitespace; + + /* adjust stored length */ + xml_element->content_cdata_length += len; + } + + +#ifdef RAPTOR_DEBUG_CDATA + RAPTOR_DEBUG3("Content cdata now: %d bytes\n", + xml_element->content_cdata_length); +#endif +#ifdef RAPTOR_DEBUG_VERBOSE + RAPTOR_DEBUG2("Ending in state %s\n", raptor_rdfxml_state_as_string(state)); +#endif +} + + + +/** + * raptor_rdfxml_inscope_base_uri: + * @rdf_parser: Raptor parser object + * + * Return the in-scope base URI. + * + * Looks for the innermost xml:base on an element or document URI + * + * Return value: The URI string value or NULL on failure. + **/ +static raptor_uri* +raptor_rdfxml_inscope_base_uri(raptor_parser *rdf_parser) +{ + raptor_rdfxml_parser* rdf_xml_parser; + raptor_uri* base_uri; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + base_uri = raptor_sax2_inscope_base_uri(rdf_xml_parser->sax2); + if(!base_uri) + base_uri = rdf_parser->base_uri; + + return base_uri; +} + + +/** + * raptor_rdfxml_record_ID: + * @rdf_parser: Raptor parser object + * @element: Current element + * @id: ID string + * + * Record an rdf:ID / rdf:bagID value (with xml base) and check it hasn't been seen already. + * + * Record and check the ID values, if they have been seen already. + * per in-scope-base URI. + * + * Return value: non-zero if already seen, or failure + **/ +static int +raptor_rdfxml_record_ID(raptor_parser *rdf_parser, + raptor_rdfxml_element *element, + const unsigned char *id) +{ + raptor_rdfxml_parser *rdf_xml_parser; + raptor_uri* base_uri; + size_t id_len; + int rc; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + if(!RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_CHECK_RDF_ID)) + return 0; + + base_uri = raptor_rdfxml_inscope_base_uri(rdf_parser); + + id_len = strlen((const char*)id); + + rc = raptor_id_set_add(rdf_xml_parser->id_set, base_uri, id, id_len); + + return (rc != 0); +} + + + +static void +raptor_rdfxml_update_document_locator(raptor_parser *rdf_parser) +{ + raptor_rdfxml_parser *rdf_xml_parser; + + rdf_xml_parser = (raptor_rdfxml_parser*)rdf_parser->context; + + raptor_sax2_update_document_locator(rdf_xml_parser->sax2, + &rdf_parser->locator); +} + + + +static void +raptor_rdfxml_parse_finish_factory(raptor_parser_factory* factory) +{ +} + + +static const char* const rdfxml_names[3] = { "rdfxml", "raptor", NULL}; + +static const char* const rdfxml_uri_strings[3] = { + "http://www.w3.org/ns/formats/RDF_XML", + "http://www.w3.org/TR/rdf-syntax-grammar", + NULL +}; + +#define RDFXML_TYPES_COUNT 2 +static const raptor_type_q rdfxml_types[RDFXML_TYPES_COUNT + 1] = { + { "application/rdf+xml", 19, 10}, + { "text/rdf", 8, 6}, + { NULL, 0, 0} +}; + +static int +raptor_rdfxml_parser_register_factory(raptor_parser_factory *factory) +{ + int rc = 0; + + factory->desc.names = rdfxml_names; + + factory->desc.mime_types = rdfxml_types; + + factory->desc.label = "RDF/XML"; + factory->desc.uri_strings = rdfxml_uri_strings; + + factory->desc.flags = RAPTOR_SYNTAX_NEED_BASE_URI; + + factory->context_length = sizeof(raptor_rdfxml_parser); + + factory->init = raptor_rdfxml_parse_init; + factory->terminate = raptor_rdfxml_parse_terminate; + factory->start = raptor_rdfxml_parse_start; + factory->chunk = raptor_rdfxml_parse_chunk; + factory->finish_factory = raptor_rdfxml_parse_finish_factory; + factory->recognise_syntax = raptor_rdfxml_parse_recognise_syntax; + + return rc; +} + + +int +raptor_init_parser_rdfxml(raptor_world* world) +{ + return !raptor_world_register_parser_factory(world, + &raptor_rdfxml_parser_register_factory); +} + + +#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1 +void +raptor_rdfxml_parser_stats_print(raptor_rdfxml_parser* rdf_xml_parser, + FILE *stream) +{ + fputs("rdf:ID set ", stream); + raptor_id_set_stats_print(rdf_xml_parser->id_set, stream); +} +#endif |