diff options
Diffstat (limited to 'src/raptor_rss.c')
-rw-r--r-- | src/raptor_rss.c | 1733 |
1 files changed, 1733 insertions, 0 deletions
diff --git a/src/raptor_rss.c b/src/raptor_rss.c new file mode 100644 index 0000000..ebbbbbb --- /dev/null +++ b/src/raptor_rss.c @@ -0,0 +1,1733 @@ +/* -*- Mode: c; c-basic-offset: 2 -*- + * + * raptor_rss.c - Raptor Feeds (RSS and Atom) tag soup parser + * + * Copyright (C) 2003-2010, David Beckett http://www.dajobe.org/ + * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/ + * + * This package is Free Software and part of Redland http://librdf.org/ + * + * It is licensed under the following three licenses as alternatives: + * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version + * 2. GNU General Public License (GPL) V2 or any newer version + * 3. Apache License, V2.0 or any newer version + * + * You may not use this file except in compliance with at least one of + * the above three licenses. + * + * See LICENSE.html or LICENSE.txt at the top of this package for the + * complete terms and further detail along with the license texts for + * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. + * + * + */ + +#ifdef HAVE_CONFIG_H +#include <raptor_config.h> +#endif + +#include <stdio.h> +#include <string.h> +#ifdef HAVE_STRINGS_H +#include <strings.h> +#endif +#include <ctype.h> +#include <stdarg.h> +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif + + +/* Raptor includes */ +#include "raptor2.h" +#include "raptor_internal.h" +#include "raptor_rss.h" + + +/* local prototypes */ + +static void raptor_rss_uplift_items(raptor_parser* rdf_parser); +static int raptor_rss_emit(raptor_parser* rdf_parser); + +static void raptor_rss_start_element_handler(void *user_data, raptor_xml_element* xml_element); +static void raptor_rss_end_element_handler(void *user_data, raptor_xml_element* xml_element); +static void raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s, int len); +static void raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s); +static void raptor_rss_sax2_new_namespace_handler(void *user_data, raptor_namespace* nspace); + +/* + * RSS parser object + */ +struct raptor_rss_parser_s { + /* static model */ + raptor_rss_model model; + + /* current line */ + char *line; + /* current line length */ + int line_length; + /* current char in line buffer */ + int offset; + + /* static statement for use in passing to user code */ + raptor_statement statement; + + raptor_sax2 *sax2; + + /* rss node type of current CONTAINER item */ + raptor_rss_type current_type; + + /* one place stack */ + raptor_rss_type prev_type; + raptor_rss_fields_type current_field; + + /* emptyness of current element */ + int element_is_empty; + + /* stack of namespaces */ + raptor_namespace_stack *nstack; + + /* non-0 if this is an atom 1.0 parser */ + int is_atom; + + /* namespaces declared here */ + raptor_namespace* nspaces[RAPTOR_RSS_NAMESPACES_SIZE]; + + /* namespaces seen during parsing or creating output model */ + char nspaces_seen[RAPTOR_RSS_NAMESPACES_SIZE]; + + /* current BLOCK pointer (inside CONTAINER of type current_type) */ + raptor_rss_block *current_block; +}; + +typedef struct raptor_rss_parser_s raptor_rss_parser; + + +typedef enum { + RAPTOR_RSS_CONTENT_TYPE_NONE, + RAPTOR_RSS_CONTENT_TYPE_XML, + RAPTOR_RSS_CONTENT_TYPE_TEXT +} raptor_rss_content_type; + + +struct raptor_rss_element_s +{ + raptor_world* world; + + raptor_uri* uri; + + /* Two types of content */ + raptor_rss_content_type type; + + /* 1) XML */ + raptor_xml_writer* xml_writer; + /* XML written to this iostream to the xml_content string */ + raptor_iostream* iostream; + /* ends up here */ + void *xml_content; + size_t xml_content_length; + + /* 2) cdata */ + raptor_stringbuffer* sb; +}; + +typedef struct raptor_rss_element_s raptor_rss_element; + + +static void +raptor_free_rss_element(raptor_rss_element *rss_element) +{ + if(rss_element->uri) + raptor_free_uri(rss_element->uri); + if(rss_element->type == RAPTOR_RSS_CONTENT_TYPE_XML) { + if(rss_element->xml_writer) + raptor_free_xml_writer(rss_element->xml_writer); + if(rss_element->iostream) + raptor_free_iostream(rss_element->iostream); + if(rss_element->xml_content) + raptor_free_memory(rss_element->xml_content); + } + if(rss_element->sb) + raptor_free_stringbuffer(rss_element->sb); + + RAPTOR_FREE(raptor_rss_element, rss_element); +} + + +static int +raptor_rss_parse_init(raptor_parser* rdf_parser, const char *name) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + raptor_sax2* sax2; + int n; + + raptor_rss_common_init(rdf_parser->world); + + raptor_rss_model_init(rdf_parser->world, &rss_parser->model); + + rss_parser->prev_type = RAPTOR_RSS_NONE; + rss_parser->current_field = RAPTOR_RSS_FIELD_NONE; + rss_parser->current_type = RAPTOR_RSS_NONE; + rss_parser->current_block = NULL; + + if(rss_parser->sax2) { + raptor_free_sax2(rss_parser->sax2); + rss_parser->sax2 = NULL; + } + + rss_parser->nstack = raptor_new_namespaces(rdf_parser->world, 1); + + /* Initialise the namespaces */ + for(n = 0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) { + unsigned const char* prefix; + raptor_uri* uri; + raptor_namespace* nspace = NULL; + + prefix = (unsigned const char*)raptor_rss_namespaces_info[n].prefix; + uri = rdf_parser->world->rss_namespaces_info_uris[n]; + if(prefix && uri) + nspace = raptor_new_namespace_from_uri(rss_parser->nstack, + prefix, uri, 0); + rss_parser->nspaces[n] = nspace; + } + + sax2 = raptor_new_sax2(rdf_parser->world, &rdf_parser->locator, rdf_parser); + rss_parser->sax2 = sax2; + + raptor_sax2_set_start_element_handler(sax2, raptor_rss_start_element_handler); + raptor_sax2_set_end_element_handler(sax2, raptor_rss_end_element_handler); + raptor_sax2_set_characters_handler(sax2, raptor_rss_cdata_handler); + raptor_sax2_set_cdata_handler(sax2, raptor_rss_cdata_handler); + raptor_sax2_set_comment_handler(sax2, raptor_rss_comment_handler); + raptor_sax2_set_namespace_handler(sax2, raptor_rss_sax2_new_namespace_handler); + + raptor_statement_init(&rss_parser->statement, rdf_parser->world); + + return 0; +} + + +static void +raptor_rss_parse_terminate(raptor_parser *rdf_parser) +{ + raptor_rss_parser *rss_parser = (raptor_rss_parser*)rdf_parser->context; + int n; + + if(rss_parser->sax2) + raptor_free_sax2(rss_parser->sax2); + + raptor_rss_model_clear(&rss_parser->model); + + for(n = 0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) { + if(rss_parser->nspaces[n]) + raptor_free_namespace(rss_parser->nspaces[n]); + } + + if(rss_parser->nstack) + raptor_free_namespaces(rss_parser->nstack); + + raptor_rss_common_terminate(rdf_parser->world); +} + + +static int +raptor_rss_parse_start(raptor_parser *rdf_parser) +{ + raptor_uri *uri = rdf_parser->base_uri; + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + int n; + + /* base URI required for RSS */ + if(!uri) + return 1; + + for(n = 0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) + rss_parser->nspaces_seen[n] = 'N'; + + /* Optionally forbid internal network and file requests in the XML parser */ + raptor_sax2_set_option(rss_parser->sax2, + RAPTOR_OPTION_NO_NET, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET)); + raptor_sax2_set_option(rss_parser->sax2, + RAPTOR_OPTION_NO_FILE, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_FILE)); + raptor_sax2_set_option(rss_parser->sax2, + RAPTOR_OPTION_LOAD_EXTERNAL_ENTITIES, NULL, + RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_EXTERNAL_ENTITIES)); + if(rdf_parser->uri_filter) + raptor_sax2_set_uri_filter(rss_parser->sax2, rdf_parser->uri_filter, + rdf_parser->uri_filter_user_data); + + raptor_sax2_parse_start(rss_parser->sax2, uri); + + return 0; +} + + + +static int +raptor_rss_add_container(raptor_rss_parser *rss_parser, const char *name) +{ + raptor_rss_type type = RAPTOR_RSS_NONE; + + if(!strcmp(name, "rss") || !strcmp(name, "rdf") || !strcmp(name, "RDF")) { + /* rss */ + } else if(!raptor_strcasecmp(name, "channel")) { + /* rss or atom 0.3 channel */ + type = RAPTOR_RSS_CHANNEL; + } else if(!strcmp(name, "feed")) { + /* atom 1.0 feed */ + type = RAPTOR_RSS_CHANNEL; + rss_parser->is_atom = 1; + } else if(!strcmp(name, "item")) { + type = RAPTOR_RSS_ITEM; + } else if(!strcmp(name, "entry")) { + type = RAPTOR_RSS_ITEM; + rss_parser->is_atom = 1; + } else { + int i; + for(i = 0; i < RAPTOR_RSS_COMMON_SIZE; i++) { + if(!(raptor_rss_items_info[i].flags & RAPTOR_RSS_ITEM_CONTAINER)) + continue; + + if(!strcmp(name, raptor_rss_items_info[i].name)) { + /* rss and atom clash on the author name field (rss) or type (atom) */ + if(i != RAPTOR_ATOM_AUTHOR || + (i == RAPTOR_ATOM_AUTHOR && rss_parser->is_atom)) { + type = (raptor_rss_type)i; + break; + } + } + } + } + + if(type != RAPTOR_RSS_NONE) { + if(type == RAPTOR_RSS_ITEM) + raptor_rss_model_add_item(&rss_parser->model); + else + raptor_rss_model_add_common(&rss_parser->model, type); + + /* Inner container - push the current type onto a 1-place stack */ + if(rss_parser->current_type != RAPTOR_RSS_NONE) + rss_parser->prev_type = rss_parser->current_type; + + rss_parser->current_type = type; + } + + return (type == RAPTOR_RSS_NONE); +} + + +static raptor_uri* +raptor_rss_promote_namespace_uri(raptor_world *world, raptor_uri* nspace_URI) +{ + /* RSS 0.9 and RSS 1.1 namespaces => RSS 1.0 namespace */ + if((raptor_uri_equals(nspace_URI, + world->rss_namespaces_info_uris[RSS0_9_NS]) || + raptor_uri_equals(nspace_URI, + world->rss_namespaces_info_uris[RSS1_1_NS]))) { + nspace_URI = world->rss_namespaces_info_uris[RSS1_0_NS]; + } + + /* Atom 0.3 namespace => Atom 1.0 namespace */ + if(raptor_uri_equals(nspace_URI, + world->rss_namespaces_info_uris[ATOM0_3_NS])) { + nspace_URI = world->rss_namespaces_info_uris[ATOM1_0_NS]; + } + + return nspace_URI; +} + + + +static raptor_rss_item* +raptor_rss_get_current_item(raptor_rss_parser *rss_parser) +{ + raptor_rss_item* item; + + if(rss_parser->current_type == RAPTOR_RSS_ITEM) + item = rss_parser->model.last; + else + item = raptor_rss_model_get_common(&rss_parser->model, + rss_parser->current_type); + return item; +} + + +static int +raptor_rss_block_set_field(raptor_world *world, raptor_uri *base_uri, + raptor_rss_block *block, + const raptor_rss_block_field_info *bfi, + const char *string) +{ + int attribute_type = bfi->attribute_type; + int offset = bfi->offset; + if(attribute_type == RSS_BLOCK_FIELD_TYPE_URL) { + raptor_uri* uri; + uri = raptor_new_uri_relative_to_base(world, base_uri, + (const unsigned char*)string); + if(!uri) + return 1; + + block->urls[offset] = uri; + } else if(attribute_type == RSS_BLOCK_FIELD_TYPE_STRING) { + size_t len = strlen(string); + block->strings[offset] = RAPTOR_MALLOC(char*, len + 1); + if(!block->strings[offset]) + return 1; + + memcpy(block->strings[offset], string, len+1); + } else { +#ifdef RAPTOR_DEBUG + RAPTOR_FATAL2("Found unknown attribute_type %d\n", attribute_type); +#else + return 1; +#endif + } + + return 0; +} + + +static void +raptor_rss_start_element_handler(void *user_data, + raptor_xml_element* xml_element) +{ + raptor_parser *rdf_parser; + raptor_rss_parser *rss_parser; + raptor_rss_block *block = NULL; + raptor_uri* base_uri; + raptor_qname *el_qname; + const unsigned char *name; + int ns_attributes_count; + raptor_qname** named_attrs; + const raptor_namespace* el_nspace; + raptor_rss_element* rss_element; + int i; + + rdf_parser = (raptor_parser*)user_data; + rss_parser = (raptor_rss_parser*)rdf_parser->context; + + rss_element = RAPTOR_CALLOC(raptor_rss_element*, 1, sizeof(*rss_element)); + if(!rss_element) { + rdf_parser->failed = 1; + return; + } + + rss_element->world = rdf_parser->world; + rss_element->sb = raptor_new_stringbuffer(); + + xml_element->user_data = rss_element; + + if(xml_element->parent) { + raptor_rss_element* parent_rss_element; + parent_rss_element = (raptor_rss_element*)(xml_element->parent->user_data); + if(parent_rss_element->xml_writer) + rss_element->xml_writer = parent_rss_element->xml_writer; + } + + if(rss_element->xml_writer) { + raptor_xml_writer_start_element(rss_element->xml_writer, xml_element); + return; + } + + el_qname = raptor_xml_element_get_name(xml_element); + name = el_qname->local_name; + el_nspace = el_qname->nspace; + + named_attrs = raptor_xml_element_get_attributes(xml_element); + ns_attributes_count = raptor_xml_element_get_attributes_count(xml_element); + + base_uri = raptor_sax2_inscope_base_uri(rss_parser->sax2); + + + /* No container type - identify and record in rss_parser->current_type + * either as a top-level container or an inner-container */ + if(!raptor_rss_add_container(rss_parser, (const char*)name)) { +#ifdef RAPTOR_DEBUG + if(1) { + raptor_rss_type old_type = rss_parser->prev_type; + + if(old_type != rss_parser->current_type && old_type != RAPTOR_RSS_NONE) + RAPTOR_DEBUG5("FOUND inner container type %u - %s INSIDE current container type %u - %s\n", + rss_parser->current_type, + raptor_rss_items_info[rss_parser->current_type].name, + old_type, raptor_rss_items_info[old_type].name); + else + RAPTOR_DEBUG3("FOUND container type %u - %s\n", + rss_parser->current_type, + raptor_rss_items_info[rss_parser->current_type].name); + } +#endif + + /* check a few container attributes */ + if(named_attrs) { + raptor_rss_item* update_item = raptor_rss_get_current_item(rss_parser); + + for(i = 0; i < ns_attributes_count; i++) { + raptor_qname* attr = named_attrs[i]; + const char* attrName = (const char*)attr->local_name; + const unsigned char* attrValue = attr->value; + + RAPTOR_DEBUG3(" container attribute %s=%s\n", attrName, attrValue); + if(!strcmp(attrName, "about")) { + if(update_item) { + update_item->uri = raptor_new_uri(rdf_parser->world, attrValue); + update_item->term = raptor_new_term_from_uri(rdf_parser->world, + update_item->uri); + } + } + } + } + return; + } else if(rss_parser->current_type == RAPTOR_RSS_NONE) { + RAPTOR_DEBUG2("Unknown container element named %s\n", name); + /* Nothing more that can be done with unknown element - skip it */ + return; + } + + + /* have container (current_type) so this element is inside it is either: + * 1. a metadata block element (such as rss:enclosure) + * 2. a field (such as atom:title) + */ + + /* Find field ID */ + rss_parser->current_field = RAPTOR_RSS_FIELD_UNKNOWN; + for(i = 0; i < RAPTOR_RSS_FIELDS_SIZE; i++) { + raptor_uri* nspace_URI; + raptor_uri* field_nspace_URI; + rss_info_namespace nsid = raptor_rss_fields_info[i].nspace; + + if(strcmp((const char*)name, raptor_rss_fields_info[i].name)) + continue; + + if(!el_nspace) { + if(nsid != RSS_NO_NS && nsid != RSS1_0_NS && nsid != RSS0_91_NS && + nsid != RSS0_9_NS && nsid != RSS1_1_NS) + continue; + + /* Matches if the element has no namespace and field is not atom */ + rss_parser->current_field = (raptor_rss_fields_type)i; + break; + } + + /* Promote element namespaces */ + nspace_URI = raptor_rss_promote_namespace_uri(rdf_parser->world, + raptor_namespace_get_uri(el_nspace)); + field_nspace_URI = rdf_parser->world->rss_namespaces_info_uris[raptor_rss_fields_info[i].nspace]; + + if(raptor_uri_equals(nspace_URI, + field_nspace_URI)) { + rss_parser->current_field = (raptor_rss_fields_type)i; + break; + } + } + + if(rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN) { + RAPTOR_DEBUG3("Unknown field element named %s inside type %s\n", name, + raptor_rss_items_info[rss_parser->current_type].name); + return; + } + + + /* Found a block element to process */ + if(raptor_rss_fields_info[rss_parser->current_field].flags & + RAPTOR_RSS_INFO_FLAG_BLOCK_VALUE) { + raptor_rss_type block_type; + raptor_rss_item* update_item; + const unsigned char *id; + raptor_term* block_term; + + block_type = raptor_rss_fields_info[rss_parser->current_field].block_type; + + RAPTOR_DEBUG3("FOUND new block type %u - %s\n", block_type, + raptor_rss_items_info[block_type].name); + + update_item = raptor_rss_get_current_item(rss_parser); + + id = raptor_world_generate_bnodeid(rdf_parser->world); + block_term = raptor_new_term_from_blank(rdf_parser->world, id); + RAPTOR_FREE(char*, id); + + block = raptor_new_rss_block(rdf_parser->world, block_type, block_term); + raptor_free_term(block_term); + + raptor_rss_item_add_block(update_item, block); + rss_parser->current_block = block; + + rss_parser->nspaces_seen[raptor_rss_items_info[block_type].nspace] = 'Y'; + + /* Now check block attributes */ + if(named_attrs) { + for(i = 0; i < ns_attributes_count; i++) { + raptor_qname* attr = named_attrs[i]; + const char* attrName = (const char*)attr->local_name; + const unsigned char* attrValue = attr->value; + const raptor_rss_block_field_info *bfi; + int offset = -1; + + for(bfi = &raptor_rss_block_fields_info[0]; + bfi->type != RAPTOR_RSS_NONE; + bfi++) { + if(!bfi->attribute) + continue; + + if(bfi->type == block_type && !strcmp(attrName, bfi->attribute)) { + offset = bfi->offset; + break; + } + } + + if(offset < 0) + continue; + + /* Found attribute for this block type */ + RAPTOR_DEBUG3(" found block attribute %s=%s\n", attrName, attrValue); + if(raptor_rss_block_set_field(rdf_parser->world, base_uri, + block, bfi, (const char*)attrValue)) { + rdf_parser->failed = 1; + return; + } + + } + + } + + return; + } + + + /* Process field */ + RAPTOR_DEBUG4("FOUND field %u - %s inside type %s\n", + rss_parser->current_field, + raptor_rss_fields_info[rss_parser->current_field].name, + raptor_rss_items_info[rss_parser->current_type].name); + + /* Mark namespace seen in new field */ + if(1) { + rss_info_namespace ns_index; + ns_index = raptor_rss_fields_info[rss_parser->current_field].nspace; + rss_parser->nspaces_seen[ns_index] = 'Y'; + } + + + /* Now check for field attributes */ + if(named_attrs) { + for(i = 0; i < ns_attributes_count; i++) { + raptor_qname* attr = named_attrs[i]; + const unsigned char* attrName = attr->local_name; + const unsigned char* attrValue = attr->value; + + RAPTOR_DEBUG3(" attribute %s=%s\n", attrName, attrValue); + + /* Pick a few attributes to care about */ + if(!strcmp((const char*)attrName, "isPermaLink")) { + raptor_rss_item* update_item = rss_parser->model.last; + if(!strcmp((const char*)name, "guid")) { + /* <guid isPermaLink="..."> */ + if(update_item) { + raptor_rss_field* field = raptor_rss_new_field(rdf_parser->world); + RAPTOR_DEBUG1("fa1 - "); + raptor_rss_item_add_field(update_item, RAPTOR_RSS_FIELD_GUID, field); + if(!strcmp((const char*)attrValue, "true")) { + RAPTOR_DEBUG2(" setting guid to URI '%s'\n", attrValue); + field->uri = raptor_new_uri_relative_to_base(rdf_parser->world, base_uri, + (const unsigned char*)attrValue); + } else { + size_t len = strlen((const char*)attrValue); + RAPTOR_DEBUG2(" setting guid to string '%s'\n", attrValue); + field->value = RAPTOR_MALLOC(unsigned char*, len + 1); + if(!field->value) { + rdf_parser->failed = 1; + return; + } + memcpy(field->value, attrValue, len + 1); + } + } + } + } else if(!strcmp((const char*)attrName, "href")) { + if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK || + rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) { + RAPTOR_DEBUG2(" setting href as URI string for type %s\n", raptor_rss_items_info[rss_parser->current_type].name); + if(rss_element->uri) + raptor_free_uri(rss_element->uri); + rss_element->uri = raptor_new_uri_relative_to_base(rdf_parser->world, base_uri, + (const unsigned char*)attrValue); + } + } else if(!strcmp((const char*)attrName, "type")) { + if(rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) { + /* do nothing with atom link attribute type */ + } else if(rss_parser->is_atom) { + /* Atom only typing */ + if(!strcmp((const char*)attrValue, "xhtml") || + !strcmp((const char*)attrValue, "xml") || + strstr((const char*)attrValue, "+xml")) { + + RAPTOR_DEBUG2(" found type '%s', making an XML writer\n", + attrValue); + + rss_element->type = RAPTOR_RSS_CONTENT_TYPE_XML; + rss_element->iostream = raptor_new_iostream_to_string(rdf_parser->world, + &rss_element->xml_content, + &rss_element->xml_content_length, + raptor_alloc_memory); + rss_element->xml_writer = raptor_new_xml_writer(rdf_parser->world, + NULL, + rss_element->iostream); + raptor_xml_writer_set_option(rss_element->xml_writer, + RAPTOR_OPTION_WRITER_XML_DECLARATION, + NULL, 0); + + raptor_free_stringbuffer(rss_element->sb); + rss_element->sb = NULL; + + } + } + } else if(!strcmp((const char*)attrName, "version")) { + if(!raptor_strcasecmp((const char*)name, "feed")) { + if(!strcmp((const char*)attrValue, "0.3")) + rss_parser->is_atom = 1; + } + } + } + } /* if have field attributes */ + +} + + +static void +raptor_rss_end_element_handler(void *user_data, + raptor_xml_element* xml_element) +{ + raptor_parser* rdf_parser; + raptor_rss_parser* rss_parser; +#ifdef RAPTOR_DEBUG + const unsigned char* name = raptor_xml_element_get_name(xml_element)->local_name; +#endif + raptor_rss_element* rss_element; + size_t cdata_len = 0; + unsigned char* cdata = NULL; + + rss_element = (raptor_rss_element*)xml_element->user_data; + + rdf_parser = (raptor_parser*)user_data; + rss_parser = (raptor_rss_parser*)rdf_parser->context; + + if(rss_element->xml_writer) { + if(rss_element->type != RAPTOR_RSS_CONTENT_TYPE_XML) { + raptor_xml_writer_end_element(rss_element->xml_writer, xml_element); + goto tidy_end_element; + } + + /* otherwise we are done making XML */ + raptor_free_iostream(rss_element->iostream); + rss_element->iostream = NULL; + cdata = (unsigned char*)rss_element->xml_content; + cdata_len = rss_element->xml_content_length; + } + + if(rss_element->sb) { + cdata_len = raptor_stringbuffer_length(rss_element->sb); + cdata = raptor_stringbuffer_as_string(rss_element->sb); + } + + if(cdata) { + raptor_uri* base_uri = NULL; + + base_uri = raptor_sax2_inscope_base_uri(rss_parser->sax2); + + if(rss_parser->current_block) { + const raptor_rss_block_field_info *bfi; +#ifdef RAPTOR_DEBUG + int handled = 0; +#endif + /* in a block, maybe store the CDATA there */ + + for(bfi = &raptor_rss_block_fields_info[0]; + bfi->type != RAPTOR_RSS_NONE; + bfi++) { + + if(bfi->type != rss_parser->current_block->rss_type || + bfi->attribute != NULL) + continue; + + /* Set author name from element */ + if(raptor_rss_block_set_field(rdf_parser->world, base_uri, + rss_parser->current_block, + bfi, (const char*)cdata)) { + rdf_parser->failed = 1; + return; + } + +#ifdef RAPTOR_DEBUG + handled = 1; +#endif + break; + } + +#ifdef RAPTOR_DEBUG + if(!handled) { + raptor_rss_type block_type = rss_parser->current_block->rss_type; + RAPTOR_DEBUG3("Ignoring cdata for block %u - %s\n", + block_type, raptor_rss_items_info[block_type].name); + } +#endif + rss_parser->current_block = NULL; + goto do_end_element; + } + + if(rss_parser->current_type == RAPTOR_RSS_NONE || + (rss_parser->current_field == RAPTOR_RSS_FIELD_NONE || + rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN)) { + unsigned char *p = cdata; + size_t i; + for(i = cdata_len; i > 0 && *p; i--) { + if(!isspace(*p)) + break; + p++; + } + if(i > 0 && *p) { + RAPTOR_DEBUG4("IGNORING non-whitespace text '%s' inside type %s, field %s\n", cdata, + raptor_rss_items_info[rss_parser->current_type].name, + raptor_rss_fields_info[rss_parser->current_field].name); + } + + goto do_end_element; + } + + if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) { + /* skipHours, skipDays common but IGNORED */ + RAPTOR_DEBUG2("Ignoring fields for type %s\n", raptor_rss_items_info[rss_parser->current_type].name); + } else { + raptor_rss_item* update_item = raptor_rss_get_current_item(rss_parser); + raptor_rss_field* field = raptor_rss_new_field(rdf_parser->world); + + /* if value is always an uri, make it so */ + if(raptor_rss_fields_info[rss_parser->current_field].flags & + RAPTOR_RSS_INFO_FLAG_URI_VALUE) { + RAPTOR_DEBUG4("Added URI %s to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_items_info[rss_parser->current_type].name); + field->uri = raptor_new_uri_relative_to_base(rdf_parser->world, base_uri, cdata); + } else { + RAPTOR_DEBUG4("Added text '%s' to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_items_info[rss_parser->current_type].name); + field->uri = NULL; + field->value = RAPTOR_MALLOC(unsigned char*, cdata_len + 1); + if(!field->value) { + rdf_parser->failed = 1; + raptor_rss_field_free(field); + return; + } + + memcpy(field->value, cdata, cdata_len); + field->value[cdata_len] = '\0'; + } + + RAPTOR_DEBUG1("fa3 - "); + raptor_rss_item_add_field(update_item, rss_parser->current_field, field); + } + } /* end if contained cdata */ + + if(raptor_xml_element_is_empty(xml_element)) { + /* Empty element, so consider adding one of the attributes as + * literal or URI content + */ + if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) { + /* skipHours, skipDays common but IGNORED */ + RAPTOR_DEBUG3("Ignoring empty element %s for type %s\n", name, raptor_rss_items_info[rss_parser->current_type].name); + } else if(rss_element->uri) { + raptor_rss_item* update_item = raptor_rss_get_current_item(rss_parser); + raptor_rss_field* field = raptor_rss_new_field(rdf_parser->world); + + if(rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN) { + RAPTOR_DEBUG2("Cannot add URI from alternate attribute to type %s unknown field\n", raptor_rss_items_info[rss_parser->current_type].name); + raptor_rss_field_free(field); + } else { + RAPTOR_DEBUG3("Added URI to field %s of type %s\n", raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_items_info[rss_parser->current_type].name); + field->uri = rss_element->uri; + rss_element->uri = NULL; + RAPTOR_DEBUG1("fa2 - "); + raptor_rss_item_add_field(update_item, rss_parser->current_field, field); + } + } + + } + + do_end_element: + if(rss_parser->current_type != RAPTOR_RSS_NONE) { + if(rss_parser->current_field != RAPTOR_RSS_FIELD_NONE) { + RAPTOR_DEBUG3("Ending element %s field %s\n", name, raptor_rss_fields_info[rss_parser->current_field].name); + rss_parser->current_field = RAPTOR_RSS_FIELD_NONE; + } else { + RAPTOR_DEBUG3("Ending element %s type %s\n", name, raptor_rss_items_info[rss_parser->current_type].name); + if(rss_parser->prev_type != RAPTOR_RSS_NONE) { + rss_parser->current_type = rss_parser->prev_type; + rss_parser->prev_type = RAPTOR_RSS_NONE; + RAPTOR_DEBUG3("Returning to type %u - %s\n", rss_parser->current_type, raptor_rss_items_info[rss_parser->current_type].name); + } else + rss_parser->current_type = RAPTOR_RSS_NONE; + } + } + + if(rss_parser->current_block) { +#ifdef RAPTOR_DEBUG + raptor_rss_type block_type = rss_parser->current_block->rss_type; + RAPTOR_DEBUG3("Ending current block %u - %s\n", + block_type, raptor_rss_items_info[block_type].name); +#endif + rss_parser->current_block = NULL; + } + + + tidy_end_element: + + raptor_free_rss_element(rss_element); + +} + + + +static void +raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, + const unsigned char *s, int len) +{ + raptor_rss_element* rss_element; + + rss_element = (raptor_rss_element*)xml_element->user_data; + + if(rss_element->xml_writer) { + raptor_xml_writer_cdata_counted(rss_element->xml_writer, s, len); + return; + } + + raptor_stringbuffer_append_counted_string(rss_element->sb, s, len, 1); +} + + +static void +raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, + const unsigned char *s) +{ + raptor_rss_element* rss_element; + + if(!xml_element) + return; + + rss_element = (raptor_rss_element*)xml_element->user_data; + + if(rss_element->xml_writer) { + raptor_xml_writer_comment(rss_element->xml_writer, s); + return; + } +} + + +static void +raptor_rss_sax2_new_namespace_handler(void *user_data, + raptor_namespace* nspace) +{ + raptor_parser* rdf_parser = (raptor_parser*)user_data; + raptor_rss_parser* rss_parser; + int n; + + rss_parser = (raptor_rss_parser*)rdf_parser->context; + for(n = 0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) { + raptor_uri* ns_uri = rdf_parser->world->rss_namespaces_info_uris[n]; + if(!ns_uri) + continue; + + if(!raptor_uri_equals(ns_uri, nspace->uri)) { + rss_parser->nspaces_seen[n] = 'Y'; + break; + } + } + +} + + +/* Add an rss:link from string contents of either: + * atom:id + * atom:link[@rel="self"]/@href + */ +static int +raptor_rss_insert_rss_link(raptor_parser* rdf_parser, + raptor_rss_item* item) +{ + raptor_rss_block *block; + raptor_rss_field* id_field; + raptor_rss_field* field = NULL; + + /* Try atom:id first */ + id_field = item->fields[RAPTOR_RSS_FIELD_ATOM_ID]; + if(id_field && id_field->value) { + const char *value = (const char*)id_field->value; + size_t len = strlen(value); + + field = raptor_rss_new_field(item->world); + if(!field) + return 1; + + field->value = RAPTOR_MALLOC(unsigned char*, len + 1); + if(!field->value) { + raptor_rss_field_free(field); + return 1; + } + + memcpy(field->value, value, len + 1); + raptor_rss_item_add_field(item, RAPTOR_RSS_FIELD_LINK, field); + + return 0; + } + + + for(block = item->blocks; block; block = block->next) { + if(block->rss_type != RAPTOR_ATOM_LINK) + continue; + + /* <link @href> is url at offset RAPTOR_RSS_LINK_HREF_URL_OFFSET + * <link @rel> is string at offset RAPTOR_RSS_LINK_REL_STRING_OFFSET + * The raptor_rss_block_fields_info structure records this + */ + if(!block->urls[RAPTOR_RSS_LINK_HREF_URL_OFFSET] || + (block->strings[RAPTOR_RSS_LINK_REL_STRING_OFFSET] && + strcmp(block->strings[RAPTOR_RSS_LINK_REL_STRING_OFFSET], "self")) + ) + continue; + + /* set the field rss:link to the string value of the @href */ + field = raptor_rss_new_field(item->world); + field->value = raptor_uri_to_string(block->urls[0]); + + raptor_rss_item_add_field(item, RAPTOR_RSS_FIELD_LINK, field); + return 0; + } + + return 0; +} + + +static int +raptor_rss_insert_identifiers(raptor_parser* rdf_parser) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + int i; + raptor_rss_item* item; + + for(i = 0; i< RAPTOR_RSS_COMMON_SIZE; i++) { + for(item = rss_parser->model.common[i]; item; item = item->next) { + if(!item->fields_count) + continue; + + RAPTOR_DEBUG3("Inserting identifiers in common type %d - %s\n", i, raptor_rss_items_info[i].name); + + if(item->uri) { + item->term = raptor_new_term_from_uri(rdf_parser->world, item->uri); + } else { + int url_fields[2]; + int url_fields_count = 1; + int f; + + url_fields[0] = (i== RAPTOR_RSS_IMAGE) ? RAPTOR_RSS_FIELD_URL : + RAPTOR_RSS_FIELD_LINK; + if(i == RAPTOR_RSS_CHANNEL) { + url_fields[1] = RAPTOR_RSS_FIELD_ATOM_ID; + url_fields_count++; + } + + for(f = 0; f < url_fields_count; f++) { + raptor_rss_field* field; + + for(field = item->fields[url_fields[f]]; field; field = field->next) { + raptor_uri *new_uri = NULL; + if(field->value) + new_uri = raptor_new_uri(rdf_parser->world, + (const unsigned char*)field->value); + else if(field->uri) + new_uri = raptor_uri_copy(field->uri); + + if(new_uri) { + item->term = raptor_new_term_from_uri(rdf_parser->world, new_uri); + raptor_free_uri(new_uri); + if(!item->term) + return 1; + break; + } + } + } + + if(!item->term) { + const unsigned char *id; + + /* need to make bnode */ + id = raptor_world_generate_bnodeid(rdf_parser->world); + item->term = raptor_new_term_from_blank(rdf_parser->world, id); + RAPTOR_FREE(char*, id); + } + } + + /* Try to add an rss:link if missing */ + if(i == RAPTOR_RSS_CHANNEL && !item->fields[RAPTOR_RSS_FIELD_LINK]) { + if(raptor_rss_insert_rss_link(rdf_parser, item)) + return 1; + } + + item->node_type = &raptor_rss_items_info[i]; + item->node_typei = i; + } + } + /* sequence of rss:item */ + for(item = rss_parser->model.items; item; item = item->next) { + raptor_rss_block *block; + raptor_uri* uri = NULL; + + if(!item->fields[RAPTOR_RSS_FIELD_LINK]) { + if(raptor_rss_insert_rss_link(rdf_parser, item)) + return 1; + } + + + if(item->uri) { + uri = raptor_uri_copy(item->uri); + } else { + if(item->fields[RAPTOR_RSS_FIELD_LINK]) { + if(item->fields[RAPTOR_RSS_FIELD_LINK]->value) + uri = raptor_new_uri(rdf_parser->world, + (const unsigned char*)item->fields[RAPTOR_RSS_FIELD_LINK]->value); + else if(item->fields[RAPTOR_RSS_FIELD_LINK]->uri) + uri = raptor_uri_copy(item->fields[RAPTOR_RSS_FIELD_LINK]->uri); + } else if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]) { + if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->value) + uri = raptor_new_uri(rdf_parser->world, + (const unsigned char*)item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->value); + else if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->uri) + uri = raptor_uri_copy(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->uri); + } + } + + if(!uri) + continue; + + item->term = raptor_new_term_from_uri(rdf_parser->world, uri); + raptor_free_uri(uri); + uri = NULL; + + for(block = item->blocks; block; block = block->next) { + if(!block->identifier) { + const unsigned char *id; + /* need to make bnode */ + id = raptor_world_generate_bnodeid(rdf_parser->world); + item->term = raptor_new_term_from_blank(rdf_parser->world, id); + RAPTOR_FREE(char*, id); + } + } + + item->node_type = &raptor_rss_items_info[RAPTOR_RSS_ITEM]; + item->node_typei = RAPTOR_RSS_ITEM; + } + + return 0; +} + + +static int +raptor_rss_emit_type_triple(raptor_parser* rdf_parser, + raptor_term *resource, + raptor_uri *type_uri) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + raptor_term *predicate_term; + raptor_term *object_term; + + if(!resource) { + raptor_parser_error(rdf_parser, "RSS node has no identifier"); + return 1; + } + + rss_parser->statement.subject = resource; + + predicate_term = raptor_new_term_from_uri(rdf_parser->world, + RAPTOR_RDF_type_URI(rdf_parser->world)); + rss_parser->statement.predicate = predicate_term; + + object_term = raptor_new_term_from_uri(rdf_parser->world, type_uri); + rss_parser->statement.object = object_term; + + /* Generate the statement */ + (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); + + raptor_free_term(predicate_term); + raptor_free_term(object_term); + + return 0; +} + + +static int +raptor_rss_emit_block(raptor_parser* rdf_parser, + raptor_term *resource, + raptor_rss_block *block) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + raptor_rss_type block_type = block->rss_type; + raptor_uri *predicate_uri; + raptor_term *predicate_term = NULL; + const raptor_rss_block_field_info *bfi; + raptor_rss_fields_type predicate_field; + + if(!block->identifier) { + raptor_parser_error(rdf_parser, "Block has no identifier"); + return 1; + } + + predicate_field = raptor_rss_items_info[block_type].predicate; + predicate_uri = rdf_parser->world->rss_fields_info_uris[predicate_field]; + predicate_term = raptor_new_term_from_uri(rdf_parser->world, + predicate_uri); + + rss_parser->statement.subject = resource; + rss_parser->statement.predicate = predicate_term; + rss_parser->statement.object = block->identifier; + (*rdf_parser->statement_handler)(rdf_parser->user_data, + &rss_parser->statement); + + raptor_free_term(predicate_term); predicate_term = NULL; + + if(raptor_rss_emit_type_triple(rdf_parser, block->identifier, + block->node_type)) + return 1; + + + for(bfi = &raptor_rss_block_fields_info[0]; + bfi->type != RAPTOR_RSS_NONE; + bfi++) { + int attribute_type; + int offset; + + if(bfi->type != block_type || !bfi->attribute) + continue; + + attribute_type = bfi->attribute_type; + offset = bfi->offset; + predicate_uri = rdf_parser->world->rss_fields_info_uris[bfi->field]; + + predicate_term = raptor_new_term_from_uri(rdf_parser->world, + predicate_uri); + rss_parser->statement.predicate = predicate_term; + + if(attribute_type == RSS_BLOCK_FIELD_TYPE_URL) { + raptor_uri *uri = block->urls[offset]; + if(uri) { + raptor_term* object_term; + + object_term = raptor_new_term_from_uri(rdf_parser->world, uri); + rss_parser->statement.object = object_term; + (*rdf_parser->statement_handler)(rdf_parser->user_data, + &rss_parser->statement); + raptor_free_term(object_term); + } + } else if(attribute_type == RSS_BLOCK_FIELD_TYPE_STRING) { + const char *str = block->strings[offset]; + if(str) { + raptor_term* object_term; + + object_term = raptor_new_term_from_literal(rdf_parser->world, + (const unsigned char*)str, + NULL, NULL); + rss_parser->statement.object = object_term; + (*rdf_parser->statement_handler)(rdf_parser->user_data, + &rss_parser->statement); + raptor_free_term(object_term); + } + } else { +#ifdef RAPTOR_DEBUG + RAPTOR_FATAL2("Found unknown attribute_type %d\n", attribute_type); +#endif + } + + raptor_free_term(predicate_term); predicate_term = NULL; + } + + return 0; +} + + +static int +raptor_rss_emit_item(raptor_parser* rdf_parser, raptor_rss_item *item) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + int f; + raptor_rss_block *block; + raptor_uri *type_uri; + + if(!item->fields_count) + return 0; + + /* HACK - FIXME - set correct atom output class type */ + if(item->node_typei == RAPTOR_ATOM_AUTHOR) + type_uri = rdf_parser->world->rss_fields_info_uris[RAPTOR_RSS_RDF_ATOM_AUTHOR_CLASS]; + else + type_uri = rdf_parser->world->rss_types_info_uris[item->node_typei]; + + if(raptor_rss_emit_type_triple(rdf_parser, item->term, type_uri)) + return 1; + + for(f = 0; f< RAPTOR_RSS_FIELDS_SIZE; f++) { + raptor_rss_field* field; + raptor_uri* predicate_uri = NULL; + raptor_term* predicate_term = NULL; + + /* This is only made by a connection */ + if(f == RAPTOR_RSS_FIELD_ITEMS) + continue; + + /* skip predicates with no URI (no namespace e.g. RSS 2) */ + predicate_uri = rdf_parser->world->rss_fields_info_uris[f]; + if(!predicate_uri) + continue; + + predicate_term = raptor_new_term_from_uri(rdf_parser->world, + predicate_uri); + if(!predicate_term) + continue; + + rss_parser->statement.predicate = predicate_term; + + for(field = item->fields[f]; field; field = field->next) { + raptor_term* object_term; + + if(field->value) { + /* FIXME - should store and emit languages */ + object_term = raptor_new_term_from_literal(rdf_parser->world, + field->value, + NULL, NULL); + } else { + object_term = raptor_new_term_from_uri(rdf_parser->world, + field->uri); + } + rss_parser->statement.object = object_term; + + /* Generate the statement */ + (*rdf_parser->statement_handler)(rdf_parser->user_data, + &rss_parser->statement); + + raptor_free_term(object_term); + } + + raptor_free_term(predicate_term); + } + + for(block = item->blocks; block; block = block->next) { + raptor_rss_emit_block(rdf_parser, item->term, block); + } + + return 0; +} + + +static int +raptor_rss_emit_connection(raptor_parser* rdf_parser, + raptor_term *subject_identifier, + raptor_uri* predicate_uri, int predicate_ordinal, + raptor_term *object_identifier) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + raptor_uri *puri = NULL; + raptor_term *predicate_term = NULL; + + if(!subject_identifier) { + raptor_parser_error(rdf_parser, "Connection subject has no identifier"); + return 1; + } + + rss_parser->statement.subject = subject_identifier; + + if(!predicate_uri) { + /* new URI object */ + puri = raptor_new_uri_from_rdf_ordinal(rdf_parser->world, predicate_ordinal); + predicate_uri = puri; + } + predicate_term = raptor_new_term_from_uri(rdf_parser->world, + predicate_uri); + rss_parser->statement.predicate = predicate_term; + rss_parser->statement.object = object_identifier; + + /* Generate the statement */ + (*rdf_parser->statement_handler)(rdf_parser->user_data, + &rss_parser->statement); + + raptor_free_term(predicate_term); + + if(puri) + raptor_free_uri(puri); + + return 0; +} + + +static int +raptor_rss_emit(raptor_parser* rdf_parser) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + int i; + raptor_rss_item* item; + int rc = 0; + + if(!rss_parser->model.common[RAPTOR_RSS_CHANNEL]) { + raptor_parser_error(rdf_parser, "No RSS channel item present"); + return 1; + } + + if(!rss_parser->model.common[RAPTOR_RSS_CHANNEL]->term) { + raptor_parser_error(rdf_parser, "RSS channel has no identifier"); + return 1; + } + + /* Emit start default graph mark */ + raptor_parser_start_graph(rdf_parser, NULL, 0); + rdf_parser->emitted_default_graph++; + + + /* Emit all the common type blocks (channel, author, ...) */ + for(i = 0; i< RAPTOR_RSS_COMMON_SIZE; i++) { + for(item = rss_parser->model.common[i]; item; item = item->next) { + if(!item->fields_count) + continue; + + RAPTOR_DEBUG3("Emitting type %i - %s\n", i, raptor_rss_items_info[i].name); + + if(!item->term) { + raptor_parser_error(rdf_parser, "RSS %s has no identifier", + raptor_rss_items_info[i].name); + rc = 1; + goto tidy; + } + + if(raptor_rss_emit_item(rdf_parser, item)) { + rc = 1; + goto tidy; + } + + /* Add connections to channel */ + if(i != RAPTOR_RSS_CHANNEL) { + if(raptor_rss_emit_connection(rdf_parser, + rss_parser->model.common[RAPTOR_RSS_CHANNEL]->term, + rdf_parser->world->rss_types_info_uris[i], 0, + item->term)) { + rc = 1; + goto tidy; + } + } + } + } + + + /* Emit the feed item blocks */ + if(rss_parser->model.items_count) { + const unsigned char* id; + raptor_term *items; + + id = raptor_world_generate_bnodeid(rdf_parser->world); + + /* make a new genid for the <rdf:Seq> node */ + items = raptor_new_term_from_blank(rdf_parser->world, id); + RAPTOR_FREE(char*, id); + + /* _:genid1 rdf:type rdf:Seq . */ + if(raptor_rss_emit_type_triple(rdf_parser, items, + RAPTOR_RDF_Seq_URI(rdf_parser->world))) { + raptor_free_term(items); + rc = 1; + goto tidy; + } + + /* <channelURI> rss:items _:genid1 . */ + if(raptor_rss_emit_connection(rdf_parser, + rss_parser->model.common[RAPTOR_RSS_CHANNEL]->term, + rdf_parser->world->rss_fields_info_uris[RAPTOR_RSS_FIELD_ITEMS], 0, + items)) { + raptor_free_term(items); + rc= 1; + goto tidy; + } + + /* sequence of rss:item */ + for(i = 1, item = rss_parser->model.items; item; item = item->next, i++) { + + if(raptor_rss_emit_item(rdf_parser, item) || + raptor_rss_emit_connection(rdf_parser, items, NULL, i,item->term)) { + raptor_free_term(items); + rc = 1; + goto tidy; + } + } + + raptor_free_term(items); + } + + tidy: + if(rdf_parser->emitted_default_graph) { + raptor_parser_end_graph(rdf_parser, NULL, 0); + rdf_parser->emitted_default_graph--; + } + + return rc; +} + + +static int +raptor_rss_copy_field(raptor_rss_parser* rss_parser, + raptor_rss_item* item, + const raptor_field_pair* pair) +{ + raptor_rss_fields_type from_field = pair->from; + raptor_rss_fields_type to_field = pair->to; + raptor_rss_field* field = NULL; + + if(!(item->fields[from_field] && item->fields[from_field]->value)) + return 1; + + if(from_field == to_field) { + field = item->fields[from_field]; + } else { + if(item->fields[to_field] && item->fields[to_field]->value) + return 1; + + field = raptor_rss_new_field(item->world); + field->is_mapped = 1; + raptor_rss_item_add_field(item, to_field, field); + } + + /* Ensure output namespace is declared */ + rss_parser->nspaces_seen[raptor_rss_fields_info[to_field].nspace] = 'Y'; + + if(!field->value) { + if(pair->conversion) + pair->conversion(item->fields[from_field], field); + else { + size_t len; + + /* Otherwise default action is to copy from_field value */ + len = strlen((const char*)item->fields[from_field]->value); + + field->value = RAPTOR_MALLOC(unsigned char*, len + 1); + if(!field->value) + return 1; + + memcpy(field->value, item->fields[from_field]->value, len + 1); + } + } + + return 0; +} + + +static void +raptor_rss_uplift_fields(raptor_rss_parser* rss_parser, raptor_rss_item* item) +{ + int i; + + /* COPY some fields from atom to rss/dc */ + for(i = 0; raptor_atom_to_rss[i].from != RAPTOR_RSS_FIELD_UNKNOWN; i++) { +#ifdef RAPTOR_DEBUG + raptor_rss_fields_type from_field = raptor_atom_to_rss[i].from; + raptor_rss_fields_type to_field = raptor_atom_to_rss[i].to; +#endif + + if(raptor_rss_copy_field(rss_parser, item, &raptor_atom_to_rss[i])) + continue; + RAPTOR_DEBUG3("Copied field %s to rss field %s\n", + raptor_rss_fields_info[from_field].name, + raptor_rss_fields_info[to_field].name); + } +} + + +static void +raptor_rss_uplift_items(raptor_parser* rdf_parser) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + int i; + raptor_rss_item* item; + + for(i = 0; i< RAPTOR_RSS_COMMON_SIZE; i++) { + for(item = rss_parser->model.common[i]; item; item = item->next) { + raptor_rss_uplift_fields(rss_parser, item); + } + } + + for(item = rss_parser->model.items; item; item = item->next) { + raptor_rss_uplift_fields(rss_parser, item); + } + +} + + +static void +raptor_rss_start_namespaces(raptor_parser* rdf_parser) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + int i; + int n; + + /* for each item type (channel, item, ...) */ + for(i = 0; i< RAPTOR_RSS_COMMON_SIZE; i++) { + raptor_rss_item* item; + + /* for each item instance of a type */ + for(item = rss_parser->model.common[i]; item; item = item->next) { + int f; + if(!item->fields_count) + continue; + + /* for each field */ + for(f = 0; f< RAPTOR_RSS_FIELDS_SIZE; f++) { + raptor_rss_field* field = item->fields[f]; + if(field) { + /* knowing there is one value is enough */ + rss_info_namespace ns_index = raptor_rss_fields_info[f].nspace; + rss_parser->nspaces_seen[ns_index] = 'Y'; + } + } + } + } + + /* start the namespaces */ + for(n = 0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) { + if(rss_parser->nspaces[n] && rss_parser->nspaces_seen[n] == 'Y') + raptor_parser_start_namespace(rdf_parser, rss_parser->nspaces[n]); + } +} + + +static int +raptor_rss_parse_chunk(raptor_parser* rdf_parser, + const unsigned char *s, size_t len, + int is_end) +{ + raptor_rss_parser* rss_parser = (raptor_rss_parser*)rdf_parser->context; + + if(rdf_parser->failed) + return 1; + + raptor_sax2_parse_chunk(rss_parser->sax2, s, len, is_end); + + if(!is_end) + return 0; + + if(rdf_parser->failed) + return 1; + + /* turn strings into URIs, move things around if needed */ + if(raptor_rss_insert_identifiers(rdf_parser)) { + rdf_parser->failed = 1; + return 1; + } + + /* add some new fields */ + raptor_rss_uplift_items(rdf_parser); + + /* find out what namespaces to declare and start them */ + raptor_rss_start_namespaces(rdf_parser); + + /* generate the triples */ + raptor_rss_emit(rdf_parser); + + return 0; +} + + +static int +raptor_rss_parse_recognise_syntax(raptor_parser_factory* factory, + const unsigned char *buffer, size_t len, + const unsigned char *identifier, + const unsigned char *suffix, + const char *mime_type) +{ + int score = 0; + + if(suffix) { + if(!strcmp((const char*)suffix, "rss")) + score = 7; + if(!strcmp((const char*)suffix, "atom")) + score = 5; + if(!strcmp((const char*)suffix, "xml")) + score = 4; + } + + if(identifier) { + if(!strncmp((const char*)identifier, "http://feed", 11)) + score += 5; + else if(strstr((const char*)identifier, "feed")) + score += 3; + + if(strstr((const char*)identifier, "rss2")) + score += 5; + else if(!suffix && strstr((const char*)identifier, "rss")) + score += 4; + else if(!suffix && strstr((const char*)identifier, "atom")) + score += 4; + else if(strstr((const char*)identifier, "rss.xml")) + score += 4; + else if(strstr((const char*)identifier, "atom.xml")) + score += 4; + } + + if(mime_type) { + if(!strstr((const char*)mime_type, "html")) { + if(strstr((const char*)mime_type, "rss")) + score += 4; + else if(strstr((const char*)mime_type, "xml")) + score += 4; + else if(strstr((const char*)mime_type, "atom")) + score += 4; + } + } + + return score; +} + + +static const char* const rss_tag_soup_names[2] = { "rss-tag-soup", NULL }; + +#define RSS_TAG_SOUP_TYPES_COUNT 6 +static const raptor_type_q rss_tag_soup_types[RSS_TAG_SOUP_TYPES_COUNT + 1] = { + { "application/rss", 15, 8}, + { "application/rss+xml", 19, 8}, + { "text/rss", 8, 8}, + { "application/xml", 15, 3}, + { "text/xml", 8, 3}, + { "application/atom+xml", 20, 3}, + { NULL, 0, 0} +}; + +static int +raptor_rss_parser_register_factory(raptor_parser_factory *factory) +{ + int rc = 0; + + factory->desc.names = rss_tag_soup_names; + + factory->desc.mime_types = rss_tag_soup_types; + + factory->desc.label = "RSS Tag Soup"; + factory->desc.uri_strings = NULL; + + factory->desc.flags = RAPTOR_SYNTAX_NEED_BASE_URI; + + factory->context_length = sizeof(raptor_rss_parser); + + factory->init = raptor_rss_parse_init; + factory->terminate = raptor_rss_parse_terminate; + factory->start = raptor_rss_parse_start; + factory->chunk = raptor_rss_parse_chunk; + factory->recognise_syntax = raptor_rss_parse_recognise_syntax; + + return rc; +} + + +int +raptor_init_parser_rss(raptor_world* world) +{ + return !raptor_world_register_parser_factory(world, + &raptor_rss_parser_register_factory); +} |