summaryrefslogtreecommitdiffstats
path: root/ml/dlib/dlib/xml_parser
diff options
context:
space:
mode:
Diffstat (limited to 'ml/dlib/dlib/xml_parser')
-rw-r--r--ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h1532
-rw-r--r--ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h276
-rw-r--r--ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h244
3 files changed, 2052 insertions, 0 deletions
diff --git a/ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h b/ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h
new file mode 100644
index 000000000..e1854bc26
--- /dev/null
+++ b/ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h
@@ -0,0 +1,1532 @@
+// Copyright (C) 2003 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_XML_PARSER_KERNEl_1_
+#define DLIB_XML_PARSER_KERNEl_1_
+
+
+#include "xml_parser_kernel_abstract.h"
+
+#include <sstream>
+#include <string>
+#include <fstream>
+#include <iostream>
+#include "xml_parser_kernel_interfaces.h"
+#include "../algs.h"
+#include <cstdio>
+#include "../map.h"
+#include "../stack.h"
+#include "../sequence.h"
+#include "../memory_manager.h"
+
+namespace dlib
+{
+
+ class xml_parser
+ {
+ typedef dlib::map<std::string,std::string,memory_manager<char>::kernel_2a>::kernel_1b map;
+ typedef dlib::stack<std::string,memory_manager<char>::kernel_2a>::kernel_1a stack;
+ typedef sequence<document_handler*>::kernel_2a seq_dh;
+ typedef sequence<error_handler*>::kernel_2a seq_eh;
+
+ /*!
+ INITIAL VALUE
+ dh_list.size() == 0
+ eh_list.size() == 0
+
+ CONVENTION
+ dh_list == a sequence of pointers to all the document_handlers that
+ have been added to the xml_parser
+ eh_list == a sequence of pointers to all the error_handlers that
+ have been added to the xml_parser
+
+ map is used to implement the attribute_list interface
+ stack is used just inside the parse function
+ seq_dh is used to make the dh_list member variable
+ seq_eh is used to make the eh_list member variable
+ !*/
+
+
+
+ public:
+
+ // These typedefs are here for backwards compatibly with previous versions of
+ // dlib.
+ typedef xml_parser kernel_1a;
+ typedef xml_parser kernel_1a_c;
+
+ xml_parser(
+ ) {}
+
+ virtual ~xml_parser(
+ ){}
+
+ inline void clear(
+ );
+
+ inline void parse (
+ std::istream& in
+ );
+
+ inline void add_document_handler (
+ document_handler& item
+ );
+
+ inline void add_error_handler (
+ error_handler& item
+ );
+
+
+ inline void swap (
+ xml_parser& item
+ );
+
+
+ private:
+
+ // -----------------------------------
+
+ // attribute_list interface implementation
+ class attrib_list : public attribute_list
+ {
+ public:
+ // the list of attribute name/value pairs
+ map list;
+
+ bool is_in_list (
+ const std::string& key
+ ) const
+ {
+ return list.is_in_domain(key);
+ }
+
+ const std::string& operator[] (
+ const std::string& key
+ ) const
+ {
+ if (is_in_list(key))
+ return list[key];
+ else
+ throw xml_attribute_list_error("No XML attribute named " + key + " is present in tag.");
+ }
+
+ bool at_start (
+ ) const { return list.at_start(); }
+
+ void reset (
+ ) const { return list.reset(); }
+
+ bool current_element_valid (
+ ) const { return list.current_element_valid(); }
+
+ const type& element (
+ ) const { return list.element(); }
+
+ type& element (
+ ) { return list.element(); }
+
+ bool move_next (
+ ) const { return list.move_next(); }
+
+ size_t size (
+ ) const { return list.size(); }
+ };
+
+
+ // -----------------------------------
+
+ enum token_type
+ {
+ element_start, // the first tag of an element
+ element_end, // the last tag of an element
+ empty_element, // the singular tag of an empty element
+ pi, // processing instruction
+ chars, // the non-markup data between tags
+ chars_cdata, // the data from a CDATA section
+ eof, // this token is returned when we reach the end of input
+ error, // this token indicates that the tokenizer couldn't
+ // determine which category the next token fits into
+ dtd, // this token is for an entire dtd
+ comment // this is a token for comments
+ };
+ /*
+ notes about the tokens:
+ the tokenizer guarantees that the following tokens to not
+ contain the '<' character except as the first character of the token
+ element_start, element_end, empty_element, and pi. they also only
+ contain the '>' characer as their last character.
+
+ it is also guaranteed that pi is at least of the form <??>. that
+ is to say that it always always begins with <? and ends with ?>.
+
+ it is also guaranteed that all markup tokens will begin with the '<'
+ character and end with the '>'. there won't be any leading or
+ trailing whitespaces. this whitespace is considered a chars token.
+ */
+
+
+ // private member functions
+ inline void get_next_token(
+ std::istream& in,
+ std::string& token_text,
+ int& token_kind,
+ unsigned long& line_number
+ );
+ /*!
+ ensures
+ gets the next token from in and puts it in token_text and
+ token_kind == the kind of the token found and
+ line_number is incremented every time a '\n' is encountered and
+ entity references are translated into the characters they represent
+ only for chars tokens
+ !*/
+
+ inline int parse_element (
+ const std::string& token,
+ std::string& name,
+ attrib_list& atts
+ );
+ /*!
+ requires
+ token is a token of kind start_element or empty_element
+ ensures
+ gets the element name and puts it into the string name and
+ parses out the attributes and puts them into the attribute_list atts
+
+ return 0 upon success or
+ returns -1 if it failed to parse token
+ !*/
+
+ inline int parse_pi (
+ const std::string& token,
+ std::string& target,
+ std::string& data
+ );
+ /*!
+ requires
+ token is a token of kind pi
+ ensures
+ the target from the processing instruction is put into target and
+ the data from the processing instruction is put into data
+
+ return 0 upon success or
+ returns -1 if it failed to parse token
+ !*/
+
+ inline int parse_element_end (
+ const std::string& token,
+ std::string& name
+ );
+ /*!
+ requires
+ token is a token of kind element_end
+ ensures
+ the name from the ending element tag is put into the string name
+
+ return 0 upon success or
+ returns -1 if it failed to parse token
+ !*/
+
+ inline int change_entity (
+ std::istream& in
+ );
+ /*!
+ ensures
+ performs the following translations and returns the new character
+ amp; -> &
+ lt; -> <
+ gt; -> >
+ apos; -> '
+ quot; -> "
+
+ or returns -1 if we hit an undefined entity reference or EOF.
+ (i.e. it was not one of the entities listed above)
+
+ !*/
+
+ // -----------------------------------
+
+ // private member data
+ seq_dh dh_list;
+ seq_eh eh_list;
+
+ // -----------------------------------
+
+ // restricted functions: assignment and copy construction
+ xml_parser(xml_parser&);
+ xml_parser& operator= (
+ xml_parser&
+ );
+
+ };
+
+ inline void swap (
+ xml_parser& a,
+ xml_parser& b
+ ) { a.swap(b); }
+
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+ // member function definitions
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ void xml_parser::
+ clear(
+ )
+ {
+ // unregister all event handlers
+ eh_list.clear();
+ dh_list.clear();
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void xml_parser::
+ parse (
+ std::istream& in
+ )
+ {
+ DLIB_CASSERT ( in.fail() == false ,
+ "\tvoid xml_parser::parse"
+ << "\n\tthe input stream must not be in the fail state"
+ << "\n\tthis: " << this
+ );
+
+
+ // save which exceptions in will throw and make it so it won't throw any
+ // for the life of this function
+ std::ios::iostate old_exceptions = in.exceptions();
+ // set it to not throw anything
+ in.exceptions(std::ios::goodbit);
+
+
+ try
+ {
+ unsigned long line_number = 1;
+
+ // skip any whitespace before the start of the document
+ while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' )
+ {
+ if (in.peek() == '\n')
+ ++line_number;
+ in.get();
+ }
+
+
+
+ stack tags; // this stack contains the last start tag seen
+ bool seen_fatal_error = false;
+ bool seen_root_tag = false; // this is true after we have seen the root tag
+
+
+
+ // notify all the document_handlers that we are about to being parsing
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->start_document();
+ }
+
+
+ std::string chars_buf; // used to collect chars data between consecutive
+ // chars and chars_cdata tokens so that
+ // document_handlers receive all chars data between
+ // tags in one call
+
+ // variables to be used with the parsing functions
+ attrib_list atts;
+ std::string name;
+ std::string target;
+ std::string data;
+
+
+
+ // variables to use with the get_next_token() function
+ std::string token_text;
+ int token_kind;
+
+ get_next_token(in,token_text,token_kind,line_number);
+
+
+ while (token_kind != eof)
+ {
+ bool is_empty = false; // this becomes true when this token is an empty_element
+
+ switch (token_kind)
+ {
+
+
+ case empty_element: is_empty = true;
+ // fall through
+ case element_start:
+ {
+ seen_root_tag = true;
+
+ int status = parse_element(token_text,name,atts);
+ // if there was no error parsing the element
+ if (status == 0)
+ {
+ // notify all the document_handlers
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->start_element(line_number,name,atts);
+ if (is_empty)
+ dh_list[i]->end_element(line_number,name);
+ }
+ }
+ else
+ {
+ seen_fatal_error = true;
+ }
+
+ // if this is an element_start token then push the name of
+ // the element on to the stack
+ if (token_kind == element_start)
+ {
+ tags.push(name);
+ }
+
+ }break;
+
+ // ----------------------------------------
+
+ case element_end:
+ {
+
+ int status = parse_element_end (token_text,name);
+
+ // if there was no error parsing the element
+ if (status == 0)
+ {
+ // make sure this ending element tag matches the last start
+ // element tag we saw
+ if ( tags.size() == 0 || name != tags.current())
+ {
+ // they don't match so signal a fatal error
+ seen_fatal_error = true;
+ }
+ else
+ {
+ // notify all the document_handlers
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->end_element(line_number,name);
+ }
+
+ // they match so throw away this element name
+ tags.pop(name);
+ }
+ }
+ else
+ {
+ seen_fatal_error = true;
+ }
+
+
+ }break;
+
+ // ----------------------------------------
+
+ case pi:
+ {
+
+ int status = parse_pi (token_text,target,data);
+ // if there was no error parsing the element
+ if (status == 0)
+ {
+ // notify all the document_handlers
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->processing_instruction(line_number,target,data);
+ }
+ }
+ else
+ {
+ // notify all the error_handlers
+ for (unsigned long i = 0; i < eh_list.size(); ++i)
+ {
+ eh_list[i]->error(line_number);
+ }
+ }
+ while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' )
+ {
+ if (in.peek() == '\n')
+ ++line_number;
+ in.get();
+ }
+
+
+ }break;
+
+ // ----------------------------------------
+
+ case chars:
+ {
+ if (tags.size() != 0)
+ {
+ chars_buf += token_text;
+ }
+ else if (token_text.find_first_not_of(" \t\r\n") != std::string::npos)
+ {
+ // you can't have non whitespace chars data outside the root element
+ seen_fatal_error = true;
+ }
+ }break;
+
+ // ----------------------------------------
+
+ case chars_cdata:
+ {
+ if (tags.size() != 0)
+ {
+ chars_buf += token_text;
+ }
+ else
+ {
+ // you can't have chars_data outside the root element
+ seen_fatal_error = true;
+ }
+ }break;
+
+ // ----------------------------------------
+
+ case eof:
+ break;
+
+ // ----------------------------------------
+
+ case error:
+ {
+ seen_fatal_error = true;
+ }break;
+
+ // ----------------------------------------
+
+ case dtd: // fall though
+ case comment: // do nothing
+ break;
+
+ // ----------------------------------------
+
+
+ }
+
+ // if there was a fatal error then quit loop
+ if (seen_fatal_error)
+ break;
+
+ // if we have seen the last tag then quit the loop
+ if (tags.size() == 0 && seen_root_tag)
+ break;
+
+
+ get_next_token(in,token_text,token_kind,line_number);
+
+ // if the next token is not a chars or chars_cdata token then flush
+ // the chars_buf to the document_handlers
+ if ( (token_kind != chars) &&
+ (token_kind != chars_cdata) &&
+ (token_kind != dtd) &&
+ (token_kind != comment) &&
+ (chars_buf.size() != 0)
+ )
+ {
+ // notify all the document_handlers
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->characters(chars_buf);
+ }
+ chars_buf.erase();
+ }
+
+
+ } //while (token_kind != eof)
+
+
+
+
+ // you can't have any unmatched tags or any fatal erros
+ if (tags.size() != 0 || seen_fatal_error)
+ {
+ // notify all the error_handlers
+ for (unsigned long i = 0; i < eh_list.size(); ++i)
+ {
+ eh_list[i]->fatal_error(line_number);
+ }
+
+ }
+
+
+ // notify all the document_handlers that we have ended parsing
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->end_document();
+ }
+
+ }
+ catch (...)
+ {
+ // notify all the document_handlers that we have ended parsing
+ for (unsigned long i = 0; i < dh_list.size(); ++i)
+ {
+ dh_list[i]->end_document();
+ }
+
+ // restore the old exception settings to in
+ in.exceptions(old_exceptions);
+
+ // don't forget to rethrow the exception
+ throw;
+ }
+
+ // restore the old exception settings to in
+ in.exceptions(old_exceptions);
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void xml_parser::
+ add_document_handler (
+ document_handler& item
+ )
+ {
+ document_handler* temp = &item;
+ dh_list.add(dh_list.size(),temp);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void xml_parser::
+ add_error_handler (
+ error_handler& item
+ )
+ {
+ error_handler* temp = &item;
+ eh_list.add(eh_list.size(),temp);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void xml_parser::
+ swap (
+ xml_parser& item
+ )
+ {
+ dh_list.swap(item.dh_list);
+ eh_list.swap(item.eh_list);
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+ // private member function definitions
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ void xml_parser::
+ get_next_token(
+ std::istream& in,
+ std::string& token_text,
+ int& token_kind,
+ unsigned long& line_number
+ )
+ {
+
+ token_text.erase();
+
+ std::istream::int_type ch1 = in.get();
+ std::istream::int_type ch2;
+
+
+ switch (ch1)
+ {
+
+ // -----------------------------------------
+
+ // this is the start of some kind of a tag
+ case '<':
+ {
+ ch2 = in.get();
+ switch (ch2)
+ {
+
+ // ---------------------------------
+
+ // this is a dtd, comment, or chars_cdata token
+ case '!':
+ {
+ // if this is a CDATA section *******************************
+ if ( in.peek() == '[')
+ {
+ token_kind = chars_cdata;
+
+ // throw away the '['
+ in.get();
+
+ // make sure the next chars are CDATA[
+ std::istream::int_type ch = in.get();
+ if (ch != 'C')
+ token_kind = error;
+ ch = in.get();
+ if (ch != 'D')
+ token_kind = error;
+ ch = in.get();
+ if (ch != 'A')
+ token_kind = error;
+ ch = in.get();
+ if (ch != 'T')
+ token_kind = error;
+ ch = in.get();
+ if (ch != 'A')
+ token_kind = error;
+ ch = in.get();
+ if (ch != '[')
+ token_kind = error;
+ // if this is an error token then end
+ if (token_kind == error)
+ break;
+
+
+ // get the rest of the chars and put them into token_text
+ int brackets_seen = 0; // this is the number of ']' chars
+ // we have seen in a row
+ bool seen_closing = false; // true if we have seen ]]>
+ do
+ {
+ ch = in.get();
+
+ if (ch == '\n')
+ ++line_number;
+
+ token_text += ch;
+
+ // if this is the closing
+ if (brackets_seen == 2 && ch == '>')
+ seen_closing = true;
+ // if we are seeing a bracket
+ else if (ch == ']')
+ ++brackets_seen;
+ // if we didn't see a bracket
+ else
+ brackets_seen = 0;
+
+
+ } while ( (!seen_closing) && (ch != EOF) );
+
+ // check if this is an error token
+ if (ch == EOF)
+ {
+ token_kind = error;
+ }
+ else
+ {
+ token_text.erase(token_text.size()-3);
+ }
+
+
+
+ }
+ // this is a comment token ****************************
+ else if (in.peek() == '-')
+ {
+
+ token_text += ch1;
+ token_text += ch2;
+ token_text += '-';
+
+ token_kind = comment;
+
+ // throw away the '-' char
+ in.get();
+
+ // make sure the next char is another '-'
+ std::istream::int_type ch = in.get();
+ if (ch != '-')
+ {
+ token_kind = error;
+ break;
+ }
+
+ token_text += '-';
+
+
+ // get the rest of the chars and put them into token_text
+ int hyphens_seen = 0; // this is the number of '-' chars
+ // we have seen in a row
+ bool seen_closing = false; // true if we have seen ]]>
+ do
+ {
+ ch = in.get();
+
+ if (ch == '\n')
+ ++line_number;
+
+ token_text += ch;
+
+ // if this should be a closing block
+ if (hyphens_seen == 2)
+ {
+ if (ch == '>')
+ seen_closing = true;
+ else // this isn't a closing so make it signal error
+ ch = EOF;
+ }
+ // if we are seeing a hyphen
+ else if (ch == '-')
+ ++hyphens_seen;
+ // if we didn't see a hyphen
+ else
+ hyphens_seen = 0;
+
+
+ } while ( (!seen_closing) && (ch != EOF) );
+
+ // check if this is an error token
+ if (ch == EOF)
+ {
+ token_kind = error;
+ }
+
+
+
+
+
+ }
+ else // this is a dtd token *************************
+ {
+
+ token_text += ch1;
+ token_text += ch2;
+ int bracket_depth = 1; // this is the number of '<' chars seen
+ // minus the number of '>' chars seen
+
+ std::istream::int_type ch;
+ do
+ {
+ ch = in.get();
+ if (ch == '>')
+ --bracket_depth;
+ else if (ch == '<')
+ ++bracket_depth;
+ else if (ch == '\n')
+ ++line_number;
+
+ token_text += ch;
+
+ } while ( (bracket_depth > 0) && (ch != EOF) );
+
+ // make sure we didn't just hit EOF
+ if (bracket_depth == 0)
+ {
+ token_kind = dtd;
+ }
+ else
+ {
+ token_kind = error;
+ }
+ }
+ }
+ break;
+
+ // ---------------------------------
+
+ // this is a pi token
+ case '?':
+ {
+ token_text += ch1;
+ token_text += ch2;
+ std::istream::int_type ch;
+
+ do
+ {
+ ch = in.get();
+ token_text += ch;
+ if (ch == '\n')
+ ++line_number;
+ // else if we hit a < then thats an error
+ else if (ch == '<')
+ ch = EOF;
+ } while (ch != '>' && ch != EOF);
+ // if we hit the end of the pi
+ if (ch == '>')
+ {
+ // make sure there was a trailing '?'
+ if ( (token_text.size() > 3) &&
+ (token_text[token_text.size()-2] != '?')
+ )
+ {
+ token_kind = error;
+ }
+ else
+ {
+ token_kind = pi;
+ }
+ }
+ // if we hit EOF unexpectidely then error
+ else
+ {
+ token_kind = error;
+ }
+ }
+ break;
+
+ // ---------------------------------
+
+ // this is an error token
+ case EOF:
+ {
+ token_kind = error;
+ }
+ break;
+
+ // ---------------------------------
+ // this is an element_end token
+ case '/':
+ {
+ token_kind = element_end;
+ token_text += ch1;
+ token_text += ch2;
+ std::istream::int_type ch;
+ do
+ {
+ ch = in.get();
+ if (ch == '\n')
+ ++line_number;
+ // else if we hit a < then thats an error
+ else if (ch == '<')
+ ch = EOF;
+ token_text += ch;
+ } while ( (ch != '>') && (ch != EOF));
+
+ // check if this is an error token
+ if (ch == EOF)
+ {
+ token_kind = error;
+ }
+ }
+ break;
+
+
+ // ---------------------------------
+
+ // this is an element_start or empty_element token
+ default:
+ {
+
+ token_text += ch1;
+ token_text += ch2;
+ std::istream::int_type ch = '\0';
+ std::istream::int_type last;
+ do
+ {
+ last = ch;
+ ch = in.get();
+ if (ch == '\n')
+ ++line_number;
+ // else if we hit a < then thats an error
+ else if (ch == '<')
+ ch = EOF;
+ token_text += ch;
+ } while ( (ch != '>') && (ch != EOF));
+
+ // check if this is an error token
+ if (ch == EOF)
+ {
+ token_kind = error;
+ }
+ // if this is an empty_element
+ else if (last == '/')
+ {
+ token_kind = empty_element;
+ }
+ else
+ {
+ token_kind = element_start;
+ }
+
+
+ }
+ break;
+
+ // ---------------------------------
+
+ }
+
+ }
+ break;
+
+ // -----------------------------------------
+
+ // this is an eof token
+ case EOF:
+ {
+ token_kind = eof;
+ }
+ break;
+
+ // -----------------------------------------
+
+ // this is a chars token
+ default:
+ {
+ if (ch1 == '\n')
+ {
+ ++line_number;
+ token_text += ch1;
+ }
+ // if the first thing in this chars token is an entity reference
+ else if (ch1 == '&')
+ {
+
+ int temp = change_entity(in);
+ if (temp == -1)
+ {
+ token_kind = error;
+ break;
+ }
+ else
+ {
+ token_text += temp;
+ }
+ }
+ else
+ {
+ token_text += ch1;
+ }
+
+
+ token_kind = chars;
+
+ std::istream::int_type ch = 0;
+ while (in.peek() != '<' && in.peek() != EOF)
+ {
+ ch = in.get();
+
+ if (ch == '\n')
+ ++line_number;
+
+ // if this is one of the predefined entity references then change it
+ if (ch == '&')
+ {
+ int temp = change_entity(in);
+ if (temp == -1)
+ {
+ ch = EOF;
+ break;
+ }
+ else
+ token_text += temp;
+ }
+ else
+ {
+ token_text += ch;
+ }
+ }
+
+ // if this is an error token
+ if (ch == EOF)
+ {
+ token_kind = error;
+ }
+
+ }
+ break;
+
+ // -----------------------------------------
+
+ }
+
+
+ }
+
+
+
+// ----------------------------------------------------------------------------------------
+
+ int xml_parser::
+ parse_element (
+ const std::string& token,
+ std::string& name,
+ attrib_list& atts
+ )
+ {
+ name.erase();
+ atts.list.clear();
+
+ // there must be at least one character between the <>
+ if (token[1] == '>')
+ return -1;
+
+ std::string::size_type i;
+ std::istream::int_type ch = token[1];
+ i = 2;
+
+ // fill out name. the name can not contain any of the following characters
+ while ( (ch != '>') &&
+ (ch != ' ') &&
+ (ch != '=') &&
+ (ch != '/') &&
+ (ch != '\t') &&
+ (ch != '\r') &&
+ (ch != '\n')
+ )
+ {
+ name += ch;
+ ch = token[i];
+ ++i;
+ }
+
+ // skip any whitespaces
+ while ( ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' )
+ {
+ ch = token[i];
+ ++i;
+ }
+
+ // find any attributes
+ while (ch != '>' && ch != '/')
+ {
+ std::string attribute_name;
+ std::string attribute_value;
+
+ // fill out attribute_name
+ while ( (ch != '=') &&
+ (ch != ' ') &&
+ (ch != '\t') &&
+ (ch != '\r') &&
+ (ch != '\n') &&
+ (ch != '>')
+ )
+ {
+ attribute_name += ch;
+ ch = token[i];
+ ++i;
+ }
+
+ // you can't have empty attribute names
+ if (attribute_name.size() == 0)
+ return -1;
+
+ // if we hit > too early then return error
+ if (ch == '>')
+ return -1;
+
+ // skip any whitespaces
+ while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r')
+ {
+ ch = token[i];
+ ++i;
+ }
+
+ // the next char should be a '=', error if it's not
+ if (ch != '=')
+ return -1;
+
+ // get the next char
+ ch = token[i];
+ ++i;
+
+ // skip any whitespaces
+ while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r')
+ {
+ ch = token[i];
+ ++i;
+ }
+
+
+ // get the delimiter for the attribute value
+ std::istream::int_type delimiter = ch; // this should be either a ' or " character
+ ch = token[i]; // get the next char
+ ++i;
+ if (delimiter != '\'' && delimiter!='"')
+ return -1;
+
+
+ // fill out attribute_value
+ while ( (ch != delimiter) &&
+ (ch != '>')
+ )
+ {
+ attribute_value += ch;
+ ch = token[i];
+ ++i;
+ }
+
+
+ // if there was no delimiter then this is an error
+ if (ch == '>')
+ {
+ return -1;
+ }
+
+ // go to the next char
+ ch = token[i];
+ ++i;
+
+ // the next char must be either a '>' or '/' (denoting the end of the tag)
+ // or a white space character
+ if (ch != '>' && ch != ' ' && ch != '/' && ch != '\t' && ch !='\n' && ch !='\r')
+ return -1;
+
+ // skip any whitespaces
+ while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r')
+ {
+ ch = token[i];
+ ++i;
+ }
+
+
+ // add attribute_value and attribute_name to atts
+ if (atts.list.is_in_domain(attribute_name))
+ {
+ // attributes may not be multiply defined
+ return -1;
+ }
+ else
+ {
+ atts.list.add(attribute_name,attribute_value);
+ }
+
+
+ }
+
+ // you can't have an element with no name
+ if (name.size() == 0)
+ return -1;
+
+ return 0;
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ int xml_parser::
+ parse_pi (
+ const std::string& token,
+ std::string& target,
+ std::string& data
+ )
+ {
+ target.erase();
+ data.erase();
+
+ std::istream::int_type ch = token[2];
+ std::string::size_type i = 3;
+ while (ch != ' ' && ch != '?' && ch != '\t' && ch != '\n' && ch!='\r')
+ {
+ target += ch;
+ ch = token[i];
+ ++i;
+ }
+ if (target.size() == 0)
+ return -1;
+
+ // if we aren't at a ? character then go to the next character
+ if (ch != '?' )
+ {
+ ch = token[i];
+ ++i;
+ }
+
+ // if we still aren't at the end of the processing instruction then
+ // set this stuff in the data section
+ while (ch != '?')
+ {
+ data += ch;
+ ch = token[i];
+ ++i;
+ }
+
+ return 0;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ int xml_parser::
+ parse_element_end (
+ const std::string& token,
+ std::string& name
+ )
+ {
+ name.erase();
+ std::string::size_type end = token.size()-1;
+ for (std::string::size_type i = 2; i < end; ++i)
+ {
+ if (token[i] == ' ' || token[i] == '\t' || token[i] == '\n'|| token[i] == '\r')
+ break;
+ name += token[i];
+ }
+
+ if (name.size() == 0)
+ return -1;
+
+ return 0;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ int xml_parser::
+ change_entity (
+ std::istream& in
+ )
+ {
+
+ std::istream::int_type buf[6];
+
+
+ buf[1] = in.get();
+
+ // if this is an undefined entity reference then return error
+ if (buf[1] != 'a' &&
+ buf[1] != 'l' &&
+ buf[1] != 'g' &&
+ buf[1] != 'q'
+ )
+ return -1;
+
+
+ buf[2] = in.get();
+ // if this is an undefined entity reference then return error
+ if (buf[2] != 'm' &&
+ buf[2] != 't' &&
+ buf[2] != 'p' &&
+ buf[2] != 'u'
+ )
+ return -1;
+
+
+ buf[3] = in.get();
+ // if this is an undefined entity reference then return error
+ if (buf[3] != 'p' &&
+ buf[3] != ';' &&
+ buf[3] != 'o'
+ )
+ return -1;
+
+ // check if this is &lt; or &gt;
+ if (buf[3] == ';')
+ {
+ if (buf[2] != 't')
+ return -1;
+
+ // if this is &lt; then return '<'
+ if (buf[1] == 'l')
+ return '<';
+ // if this is &gt; then return '>'
+ if (buf[1] == 'g')
+ return '>';
+
+ // it is neither so it must be an undefined entity reference
+ return -1;
+ }
+
+
+ buf[4] = in.get();
+ // if this should be &amp;
+ if (buf[4] == ';')
+ {
+ // if this is not &amp; then return error
+ if (buf[1] != 'a' ||
+ buf[2] != 'm' ||
+ buf[3] != 'p'
+ )
+ return -1;
+
+ return '&';
+ }
+
+ buf[5] = in.get();
+
+ // if this should be &apos;
+ if (buf[1] == 'a' &&
+ buf[2] == 'p' &&
+ buf[3] == 'o' &&
+ buf[4] == 's' &&
+ buf[5] == ';'
+ )
+ return '\'';
+
+
+ // if this should be &quot;
+ if (buf[1] == 'q' &&
+ buf[2] == 'u' &&
+ buf[3] == 'o' &&
+ buf[4] == 't' &&
+ buf[5] == ';'
+ )
+ return '"';
+
+
+ // it was an undefined entity reference
+ return -1;
+
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class xml_parse_error : public error
+ {
+ public:
+ xml_parse_error(
+ const std::string& a
+ ): error(a) {}
+ };
+
+ namespace impl
+ {
+ class default_xml_error_handler : public error_handler
+ {
+ std::string filename;
+
+ public:
+
+ default_xml_error_handler (
+ ) {}
+
+ default_xml_error_handler (
+ const std::string& filename_
+ ) :filename(filename_) {}
+
+ virtual void error (
+ const unsigned long
+ )
+ {
+ // just ignore non-fatal errors
+ }
+
+ virtual void fatal_error (
+ const unsigned long line_number
+ )
+ {
+ std::ostringstream sout;
+ if (filename.size() != 0)
+ sout << "There is a fatal error on line " << line_number << " in the XML file '"<<filename<<"'.";
+ else
+ sout << "There is a fatal error on line " << line_number << " in the XML being processed.";
+
+ throw xml_parse_error(sout.str());
+ }
+ };
+ }
+
+ inline void parse_xml (
+ std::istream& in,
+ document_handler& dh,
+ error_handler& eh
+ )
+ {
+ if (!in)
+ throw xml_parse_error("Unexpected end of file during xml parsing.");
+ xml_parser parser;
+ parser.add_document_handler(dh);
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+ inline void parse_xml (
+ std::istream& in,
+ error_handler& eh,
+ document_handler& dh
+ )
+ {
+ if (!in)
+ throw xml_parse_error("Unexpected end of file during xml parsing.");
+ xml_parser parser;
+ parser.add_document_handler(dh);
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+ inline void parse_xml (
+ std::istream& in,
+ error_handler& eh
+ )
+ {
+ if (!in)
+ throw xml_parse_error("Unexpected end of file during xml parsing.");
+ xml_parser parser;
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+ inline void parse_xml (
+ std::istream& in,
+ document_handler& dh
+ )
+ {
+ if (!in)
+ throw xml_parse_error("Unexpected end of file during xml parsing.");
+ xml_parser parser;
+ parser.add_document_handler(dh);
+ impl::default_xml_error_handler eh;
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline void parse_xml (
+ const std::string& filename,
+ document_handler& dh,
+ error_handler& eh
+ )
+ {
+ std::ifstream in(filename.c_str());
+ if (!in)
+ throw xml_parse_error("Unable to open file '" + filename + "'.");
+ xml_parser parser;
+ parser.add_document_handler(dh);
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+ inline void parse_xml (
+ const std::string& filename,
+ error_handler& eh,
+ document_handler& dh
+ )
+ {
+ std::ifstream in(filename.c_str());
+ if (!in)
+ throw xml_parse_error("Unable to open file '" + filename + "'.");
+ xml_parser parser;
+ parser.add_document_handler(dh);
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+ inline void parse_xml (
+ const std::string& filename,
+ error_handler& eh
+ )
+ {
+ std::ifstream in(filename.c_str());
+ if (!in)
+ throw xml_parse_error("Unable to open file '" + filename + "'.");
+ xml_parser parser;
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+ inline void parse_xml (
+ const std::string& filename,
+ document_handler& dh
+ )
+ {
+ std::ifstream in(filename.c_str());
+ if (!in)
+ throw xml_parse_error("Unable to open file '" + filename + "'.");
+ xml_parser parser;
+ parser.add_document_handler(dh);
+ impl::default_xml_error_handler eh(filename);
+ parser.add_error_handler(eh);
+ parser.parse(in);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_XML_PARSER_KERNEl_1_
+
diff --git a/ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h b/ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h
new file mode 100644
index 000000000..45b513e55
--- /dev/null
+++ b/ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h
@@ -0,0 +1,276 @@
+// Copyright (C) 2003 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_XML_PARSER_KERNEl_ABSTRACT_
+#ifdef DLIB_XML_PARSER_KERNEl_ABSTRACT_
+
+#include <string>
+#include <iosfwd>
+#include "xml_parser_kernel_interfaces.h"
+
+namespace dlib
+{
+
+ class xml_parser
+ {
+
+ /*!
+ INITIAL VALUE
+ no objects are registered to receive events
+
+
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a simple SAX style event driven XML parser.
+ It takes its input from an input stream object and sends events to all
+ registered document_handler and error_handler objects.
+
+ note that this xml parser ignores all DTD related XML markup. It will
+ parse XML documents with DTD's but it just won't check if the document
+ is valid. This also means that entity references may not be used except
+ for the predefined ones which are as follows:
+ &amp;
+ &lt;
+ &gt;
+ &apos;
+ &quot;
+
+ also note that there is no interpreting of entity references inside
+ a CDATA section or inside of tags, they are only interpreted inside
+ normal non-markup data.
+
+ This parser considers the end of the xml document to be the closing
+ tag of the root tag (as opposed to using EOF as the end of the
+ document). This is a deviation from the xml standard.
+
+ Aside from ignoring DTD stuff and entity references everywhere but
+ data, and the above comment regarding EOF, this parser should conform
+ to the rest of the XML standard.
+ !*/
+
+ public:
+
+
+ xml_parser(
+ );
+ /*!
+ ensures
+ - #*this is properly initialized
+ throws
+ - std::bad_alloc
+ !*/
+
+ virtual ~xml_parser(
+ );
+ /*!
+ ensures
+ - all memory associated with *this has been released
+ !*/
+
+ void clear(
+ );
+ /*!
+ ensures
+ - #*this has its initial value
+ throws
+ - std::bad_alloc
+ if this exception is thrown then *this is unusable
+ until clear() is called and succeeds
+ !*/
+
+ void parse (
+ std::istream& in
+ );
+ /*!
+ requires
+ - in.fail() == false
+ ensures
+ - the data from the input stream in will be parsed and the appropriate
+ events will be generated
+ - parsing will stop when the parser has reached the closing tag
+ for the xml document or EOF (which ever comes first). Note that
+ hitting EOF first is a fatal error.
+ throws
+ - std::bad_alloc
+ if parse() throws then it will be unusable until clear() is
+ called and succeeds
+ - other exceptions
+ document_handlers and error_handlers my throw any exception. If
+ they throw while parse() is running then parse() will let the
+ exception propagate out and the xml_parser object will be unusable
+ until clear() is called and succeeds. note that end_document()
+ is still called.
+ !*/
+
+ void add_document_handler (
+ document_handler& item
+ );
+ /*!
+ ensures
+ - item will now receive document events from the parser
+ throws
+ - std::bad_alloc
+ if add_document_handler() throws then it has no effect
+ !*/
+
+ void add_error_handler (
+ error_handler& item
+ );
+ /*!
+ ensures
+ - item will now receive error events from the parser
+ throws
+ - std::bad_alloc
+ if add_error_handler() throws then it has no effect
+ !*/
+
+
+ void swap (
+ xml_parser& item
+ );
+ /*!
+ ensures
+ - swaps *this and item
+ !*/
+
+
+ private:
+
+ // restricted functions
+ xml_parser(xml_parser&); // copy constructor
+ xml_parser& operator=(xml_parser&); // assignment operator
+
+ };
+
+
+ inline void swap (
+ xml_parser& a,
+ xml_parser& b
+ ) { a.swap(b); }
+ /*!
+ provides a global swap function
+ !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class xml_parse_error : public error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is the exception object thrown by the parse_xml() routines defined
+ below.
+ !*/
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ void parse_xml (
+ std::istream& in,
+ document_handler& dh,
+ error_handler& eh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input stream using the
+ supplied document_handler and error_handler.
+ !*/
+
+ void parse_xml (
+ std::istream& in,
+ error_handler& eh,
+ document_handler& dh
+ )
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input stream using the
+ supplied document_handler and error_handler.
+ !*/
+
+ void parse_xml (
+ std::istream& in,
+ error_handler& eh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input stream using the
+ supplied error_handler.
+ !*/
+
+ void parse_xml (
+ std::istream& in,
+ document_handler& dh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input stream using the
+ supplied document_handler.
+ - Uses a default error handler that will throw an xml_parse_error exception
+ if a fatal parsing error is encountered.
+ throws
+ - xml_parse_error
+ Thrown if a fatal parsing error is encountered.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void parse_xml (
+ const std::string& filename,
+ document_handler& dh,
+ error_handler& eh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input file using the
+ supplied error_handler and document_handler.
+ throws
+ - xml_parse_error
+ Thrown if there is a problem parsing the input file.
+ !*/
+
+ void parse_xml (
+ const std::string& filename,
+ error_handler& eh,
+ document_handler& dh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input file using the
+ supplied error_handler and document_handler.
+ throws
+ - xml_parse_error
+ Thrown if there is a problem parsing the input file.
+ !*/
+
+ void parse_xml (
+ const std::string& filename,
+ error_handler& eh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input file using the
+ supplied error_handler.
+ throws
+ - xml_parse_error
+ Thrown if there is a problem parsing the input file.
+ !*/
+
+ void parse_xml (
+ const std::string& filename,
+ document_handler& dh
+ );
+ /*!
+ ensures
+ - makes an xml_parser and tells it to parse the given input file using the
+ supplied document_handler.
+ - Uses a default error handler that will throw an xml_parse_error exception
+ if a fatal parsing error is encountered.
+ throws
+ - xml_parse_error
+ Thrown if there is a problem parsing the input file.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_XML_PARSER_KERNEl_ABSTRACT_
+
diff --git a/ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h b/ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h
new file mode 100644
index 000000000..a0edf3317
--- /dev/null
+++ b/ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h
@@ -0,0 +1,244 @@
+// Copyright (C) 2003 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_XML_PARSER_KERNEl_INTERFACES_
+#define DLIB_XML_PARSER_KERNEl_INTERFACES_
+
+#include <string>
+#include "../interfaces/enumerable.h"
+#include "../interfaces/map_pair.h"
+#include "../error.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class xml_attribute_list_error : public dlib::error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an exception object thrown by attribute_list objects if you try to
+ access a non-existent attribute.
+ !*/
+ public:
+ xml_attribute_list_error(const std::string& msg) : dlib::error(msg){}
+ };
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class attribute_list : public enumerable<map_pair<std::string,std::string> >
+ {
+
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ this object represents a list of the attributes found in
+ an XML element. each attribute is associated with a value.
+ !*/
+
+
+ public:
+
+ inline virtual ~attribute_list (
+ ) =0;
+
+
+ virtual bool is_in_list (
+ const std::string& key
+ ) const =0;
+ /*!
+ ensures
+ - returns true if there is an attribute named key in the list
+ - returns false
+ !*/
+
+ virtual const std::string& operator[] (
+ const std::string& key
+ ) const =0;
+ /*!
+ ensures
+ if (is_in_list(key) == true) then
+ - returns a const reference to the value associated with the attribute
+ named key.
+ - else
+ - throws xml_attribute_list_error
+ !*/
+
+ protected:
+
+ // restricted functions
+ attribute_list& operator=(attribute_list&) {return *this;}
+ };
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class document_handler
+ {
+ /*!
+ EXCEPTIONS
+ a document_handler is allowed to throw any exception
+
+
+ WHAT THIS OBJECT REPRESENTS
+ this object is an interface for handling the basic events
+ generated by an XML parser
+ !*/
+
+
+ public:
+
+ inline virtual ~document_handler (
+ ) =0;
+
+ virtual void start_document (
+ )=0;
+ /*!
+ requires
+ - is called when the document parsing begins
+ !*/
+
+ virtual void end_document (
+ )=0;
+ /*!
+ requires
+ - is called after the document parsing has ended. note that this
+ is always called, even if an error occurs.
+ !*/
+
+ virtual void start_element (
+ const unsigned long line_number,
+ const std::string& name,
+ const dlib::attribute_list& atts
+ )=0;
+ /*!
+ requires
+ - is called when an opening element tag is encountered.
+ - line_number == the line number where the opening tag for this element
+ was encountered.
+ - name == the name of the element encountered
+ - atts == a list containing all the attributes in this element and their
+ associated values
+ !*/
+
+ virtual void end_element (
+ const unsigned long line_number,
+ const std::string& name
+ )=0;
+ /*!
+ requires
+ - is called when a closing element tag is encountered. (note that this
+ includes tags such as <example_tag/>. I.e. the previous tag would
+ trigger a start_element() callback as well as an end_element() callback)
+ - line_number == the line number where the closing tag for this
+ element was encountered and
+ - name == the name of the element encountered
+ !*/
+
+ virtual void characters (
+ const std::string& data
+ )=0;
+ /*!
+ requires
+ - is called just before we encounter a start_element, end_element, or
+ processing_instruction tag but only if there was data between the
+ last and next tag.
+ (i.e. data will never be "")
+ - data == all the normal non-markup data and CDATA between the next and
+ last tag in the document.
+ !*/
+
+ virtual void processing_instruction (
+ const unsigned long line_number,
+ const std::string& target,
+ const std::string& data
+ )=0;
+ /*!
+ requires
+ - is called when a processing instruction is encountered
+ - line_number == the line number where this processing instruction
+ was encountered
+ - target == the target value for this processing instruction
+ - data == the data value for this processing instruction
+ !*/
+
+ protected:
+
+ // restricted functions
+ document_handler& operator=(document_handler&) { return *this; }
+ };
+
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class error_handler
+ {
+ /*!
+ EXCEPTIONS
+ an error_handler is allowed to throw any exception
+
+
+ WHAT THIS OBJECT REPRESENTS
+ this object is an interface for handling the error/warning
+ events generated by an XML parser
+ !*/
+
+ public:
+
+ inline virtual ~error_handler (
+ ) =0;
+
+ virtual void error (
+ const unsigned long line_number
+ )=0;
+ /*!
+ requires
+ - is called when an error that does NOT require the parser to halt
+ is encountered. (i.e. somewhat minor errors in the input)
+ - line_number == the line number where this error was encountered
+
+ the following events trigger an error:
+ an invalid processing instruction
+ !*/
+
+ virtual void fatal_error (
+ const unsigned long line_number
+ )=0;
+ /*!
+ requires
+ - is called when an error that requires the parser to abort its parsing
+ is encountered (i.e. fatal errors in the input)
+ - line_number == the line number where this fatal error was encountered
+
+ the following events trigger a fatal_error:
+ Everything other than the events listed above for error.
+ Also note that encountering an entity reference other than the
+ predefined ones listed in xml_parser_kernel_abstract is a fatal_error.
+ Hitting EOF before the closing tag for the document is also a fatal_error.
+ !*/
+
+ protected:
+
+ // restricted functions
+ error_handler& operator=(error_handler&) { return *this;}
+ };
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ document_handler::~document_handler (
+ ){}
+ attribute_list::~attribute_list (
+ ){}
+ error_handler::~error_handler (
+ ){}
+
+}
+
+#endif // DLIB_XML_PARSER_KERNEl_INTERFACES_
+