diff options
Diffstat (limited to 'ml/dlib/dlib/xml_parser')
-rw-r--r-- | ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h | 1532 | ||||
-rw-r--r-- | ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h | 276 | ||||
-rw-r--r-- | ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h | 244 |
3 files changed, 2052 insertions, 0 deletions
diff --git a/ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h b/ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h new file mode 100644 index 000000000..e1854bc26 --- /dev/null +++ b/ml/dlib/dlib/xml_parser/xml_parser_kernel_1.h @@ -0,0 +1,1532 @@ +// Copyright (C) 2003 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_XML_PARSER_KERNEl_1_ +#define DLIB_XML_PARSER_KERNEl_1_ + + +#include "xml_parser_kernel_abstract.h" + +#include <sstream> +#include <string> +#include <fstream> +#include <iostream> +#include "xml_parser_kernel_interfaces.h" +#include "../algs.h" +#include <cstdio> +#include "../map.h" +#include "../stack.h" +#include "../sequence.h" +#include "../memory_manager.h" + +namespace dlib +{ + + class xml_parser + { + typedef dlib::map<std::string,std::string,memory_manager<char>::kernel_2a>::kernel_1b map; + typedef dlib::stack<std::string,memory_manager<char>::kernel_2a>::kernel_1a stack; + typedef sequence<document_handler*>::kernel_2a seq_dh; + typedef sequence<error_handler*>::kernel_2a seq_eh; + + /*! + INITIAL VALUE + dh_list.size() == 0 + eh_list.size() == 0 + + CONVENTION + dh_list == a sequence of pointers to all the document_handlers that + have been added to the xml_parser + eh_list == a sequence of pointers to all the error_handlers that + have been added to the xml_parser + + map is used to implement the attribute_list interface + stack is used just inside the parse function + seq_dh is used to make the dh_list member variable + seq_eh is used to make the eh_list member variable + !*/ + + + + public: + + // These typedefs are here for backwards compatibly with previous versions of + // dlib. 
+ typedef xml_parser kernel_1a; + typedef xml_parser kernel_1a_c; + + xml_parser( + ) {} + + virtual ~xml_parser( + ){} + + inline void clear( + ); + + inline void parse ( + std::istream& in + ); + + inline void add_document_handler ( + document_handler& item + ); + + inline void add_error_handler ( + error_handler& item + ); + + + inline void swap ( + xml_parser& item + ); + + + private: + + // ----------------------------------- + + // attribute_list interface implementation + class attrib_list : public attribute_list + { + public: + // the list of attribute name/value pairs + map list; + + bool is_in_list ( + const std::string& key + ) const + { + return list.is_in_domain(key); + } + + const std::string& operator[] ( + const std::string& key + ) const + { + if (is_in_list(key)) + return list[key]; + else + throw xml_attribute_list_error("No XML attribute named " + key + " is present in tag."); + } + + bool at_start ( + ) const { return list.at_start(); } + + void reset ( + ) const { return list.reset(); } + + bool current_element_valid ( + ) const { return list.current_element_valid(); } + + const type& element ( + ) const { return list.element(); } + + type& element ( + ) { return list.element(); } + + bool move_next ( + ) const { return list.move_next(); } + + size_t size ( + ) const { return list.size(); } + }; + + + // ----------------------------------- + + enum token_type + { + element_start, // the first tag of an element + element_end, // the last tag of an element + empty_element, // the singular tag of an empty element + pi, // processing instruction + chars, // the non-markup data between tags + chars_cdata, // the data from a CDATA section + eof, // this token is returned when we reach the end of input + error, // this token indicates that the tokenizer couldn't + // determine which category the next token fits into + dtd, // this token is for an entire dtd + comment // this is a token for comments + }; + /* + notes about the tokens: + the tokenizer 
guarantees that the following tokens do not + contain the '<' character except as the first character of the token: + element_start, element_end, empty_element, and pi. they also only + contain the '>' character as their last character. + + it is also guaranteed that pi is at least of the form <??>. that + is to say that it always begins with <? and ends with ?>. + + it is also guaranteed that all markup tokens will begin with the '<' + character and end with the '>'. there won't be any leading or + trailing whitespaces. this whitespace is considered a chars token. + */ + + + // private member functions + inline void get_next_token( + std::istream& in, + std::string& token_text, + int& token_kind, + unsigned long& line_number + ); + /*! + ensures + gets the next token from in and puts it in token_text and + token_kind == the kind of the token found and + line_number is incremented every time a '\n' is encountered and + entity references are translated into the characters they represent + only for chars tokens + !*/ + + inline int parse_element ( + const std::string& token, + std::string& name, + attrib_list& atts + ); + /*! + requires + token is a token of kind element_start or empty_element + ensures + gets the element name and puts it into the string name and + parses out the attributes and puts them into the attribute_list atts + + returns 0 upon success or + returns -1 if it failed to parse token + !*/ + + inline int parse_pi ( + const std::string& token, + std::string& target, + std::string& data + ); + /*! + requires + token is a token of kind pi + ensures + the target from the processing instruction is put into target and + the data from the processing instruction is put into data + + returns 0 upon success or + returns -1 if it failed to parse token + !*/ + + inline int parse_element_end ( + const std::string& token, + std::string& name + ); + /*!
+ requires + token is a token of kind element_end + ensures + the name from the ending element tag is put into the string name + + return 0 upon success or + returns -1 if it failed to parse token + !*/ + + inline int change_entity ( + std::istream& in + ); + /*! + ensures + performs the following translations and returns the new character + amp; -> & + lt; -> < + gt; -> > + apos; -> ' + quot; -> " + + or returns -1 if we hit an undefined entity reference or EOF. + (i.e. it was not one of the entities listed above) + + !*/ + + // ----------------------------------- + + // private member data + seq_dh dh_list; + seq_eh eh_list; + + // ----------------------------------- + + // restricted functions: assignment and copy construction + xml_parser(xml_parser&); + xml_parser& operator= ( + xml_parser& + ); + + }; + + inline void swap ( + xml_parser& a, + xml_parser& b + ) { a.swap(b); } + + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + // member function definitions +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + void xml_parser:: + clear( + ) + { + // unregister all event handlers + eh_list.clear(); + dh_list.clear(); + } + +// ---------------------------------------------------------------------------------------- + + void xml_parser:: + parse ( + std::istream& in + ) + { + DLIB_CASSERT ( in.fail() == false , + "\tvoid xml_parser::parse" + << "\n\tthe input stream must not be in the fail state" + << "\n\tthis: " << this + ); + + + // save which exceptions in will throw and make it so it won't throw any + // for the life of this function + std::ios::iostate old_exceptions = in.exceptions(); + // set it to not throw anything + in.exceptions(std::ios::goodbit); + + + try + { + unsigned long 
line_number = 1; + + // skip any whitespace before the start of the document + while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' ) + { + if (in.peek() == '\n') + ++line_number; + in.get(); + } + + + + stack tags; // this stack contains the last start tag seen + bool seen_fatal_error = false; + bool seen_root_tag = false; // this is true after we have seen the root tag + + + + // notify all the document_handlers that we are about to being parsing + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->start_document(); + } + + + std::string chars_buf; // used to collect chars data between consecutive + // chars and chars_cdata tokens so that + // document_handlers receive all chars data between + // tags in one call + + // variables to be used with the parsing functions + attrib_list atts; + std::string name; + std::string target; + std::string data; + + + + // variables to use with the get_next_token() function + std::string token_text; + int token_kind; + + get_next_token(in,token_text,token_kind,line_number); + + + while (token_kind != eof) + { + bool is_empty = false; // this becomes true when this token is an empty_element + + switch (token_kind) + { + + + case empty_element: is_empty = true; + // fall through + case element_start: + { + seen_root_tag = true; + + int status = parse_element(token_text,name,atts); + // if there was no error parsing the element + if (status == 0) + { + // notify all the document_handlers + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->start_element(line_number,name,atts); + if (is_empty) + dh_list[i]->end_element(line_number,name); + } + } + else + { + seen_fatal_error = true; + } + + // if this is an element_start token then push the name of + // the element on to the stack + if (token_kind == element_start) + { + tags.push(name); + } + + }break; + + // ---------------------------------------- + + case element_end: + { + + int status = parse_element_end 
(token_text,name); + + // if there was no error parsing the element + if (status == 0) + { + // make sure this ending element tag matches the last start + // element tag we saw + if ( tags.size() == 0 || name != tags.current()) + { + // they don't match so signal a fatal error + seen_fatal_error = true; + } + else + { + // notify all the document_handlers + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->end_element(line_number,name); + } + + // they match so throw away this element name + tags.pop(name); + } + } + else + { + seen_fatal_error = true; + } + + + }break; + + // ---------------------------------------- + + case pi: + { + + int status = parse_pi (token_text,target,data); + // if there was no error parsing the element + if (status == 0) + { + // notify all the document_handlers + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->processing_instruction(line_number,target,data); + } + } + else + { + // notify all the error_handlers + for (unsigned long i = 0; i < eh_list.size(); ++i) + { + eh_list[i]->error(line_number); + } + } + while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' ) + { + if (in.peek() == '\n') + ++line_number; + in.get(); + } + + + }break; + + // ---------------------------------------- + + case chars: + { + if (tags.size() != 0) + { + chars_buf += token_text; + } + else if (token_text.find_first_not_of(" \t\r\n") != std::string::npos) + { + // you can't have non whitespace chars data outside the root element + seen_fatal_error = true; + } + }break; + + // ---------------------------------------- + + case chars_cdata: + { + if (tags.size() != 0) + { + chars_buf += token_text; + } + else + { + // you can't have chars_data outside the root element + seen_fatal_error = true; + } + }break; + + // ---------------------------------------- + + case eof: + break; + + // ---------------------------------------- + + case error: + { + seen_fatal_error = true; + }break; + + 
// ---------------------------------------- + + case dtd: // fall though + case comment: // do nothing + break; + + // ---------------------------------------- + + + } + + // if there was a fatal error then quit loop + if (seen_fatal_error) + break; + + // if we have seen the last tag then quit the loop + if (tags.size() == 0 && seen_root_tag) + break; + + + get_next_token(in,token_text,token_kind,line_number); + + // if the next token is not a chars or chars_cdata token then flush + // the chars_buf to the document_handlers + if ( (token_kind != chars) && + (token_kind != chars_cdata) && + (token_kind != dtd) && + (token_kind != comment) && + (chars_buf.size() != 0) + ) + { + // notify all the document_handlers + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->characters(chars_buf); + } + chars_buf.erase(); + } + + + } //while (token_kind != eof) + + + + + // you can't have any unmatched tags or any fatal erros + if (tags.size() != 0 || seen_fatal_error) + { + // notify all the error_handlers + for (unsigned long i = 0; i < eh_list.size(); ++i) + { + eh_list[i]->fatal_error(line_number); + } + + } + + + // notify all the document_handlers that we have ended parsing + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->end_document(); + } + + } + catch (...) 
+ { + // notify all the document_handlers that we have ended parsing + for (unsigned long i = 0; i < dh_list.size(); ++i) + { + dh_list[i]->end_document(); + } + + // restore the old exception settings to in + in.exceptions(old_exceptions); + + // don't forget to rethrow the exception + throw; + } + + // restore the old exception settings to in + in.exceptions(old_exceptions); + + } + +// ---------------------------------------------------------------------------------------- + + void xml_parser:: + add_document_handler ( + document_handler& item + ) + { + document_handler* temp = &item; + dh_list.add(dh_list.size(),temp); + } + +// ---------------------------------------------------------------------------------------- + + void xml_parser:: + add_error_handler ( + error_handler& item + ) + { + error_handler* temp = &item; + eh_list.add(eh_list.size(),temp); + } + +// ---------------------------------------------------------------------------------------- + + void xml_parser:: + swap ( + xml_parser& item + ) + { + dh_list.swap(item.dh_list); + eh_list.swap(item.eh_list); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + // private member function definitions +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + void xml_parser:: + get_next_token( + std::istream& in, + std::string& token_text, + int& token_kind, + unsigned long& line_number + ) + { + + token_text.erase(); + + std::istream::int_type ch1 = in.get(); + std::istream::int_type ch2; + + + switch (ch1) + { + + // ----------------------------------------- + + // this is the start of some kind of a tag + case '<': + { + ch2 = in.get(); + switch (ch2) + { + + // --------------------------------- + + // this is a dtd, comment, or 
chars_cdata token + case '!': + { + // if this is a CDATA section ******************************* + if ( in.peek() == '[') + { + token_kind = chars_cdata; + + // throw away the '[' + in.get(); + + // make sure the next chars are CDATA[ + std::istream::int_type ch = in.get(); + if (ch != 'C') + token_kind = error; + ch = in.get(); + if (ch != 'D') + token_kind = error; + ch = in.get(); + if (ch != 'A') + token_kind = error; + ch = in.get(); + if (ch != 'T') + token_kind = error; + ch = in.get(); + if (ch != 'A') + token_kind = error; + ch = in.get(); + if (ch != '[') + token_kind = error; + // if this is an error token then end + if (token_kind == error) + break; + + + // get the rest of the chars and put them into token_text + int brackets_seen = 0; // this is the number of ']' chars + // we have seen in a row + bool seen_closing = false; // true if we have seen ]]> + do + { + ch = in.get(); + + if (ch == '\n') + ++line_number; + + token_text += ch; + + // if this is the closing + if (brackets_seen == 2 && ch == '>') + seen_closing = true; + // if we are seeing a bracket + else if (ch == ']') + ++brackets_seen; + // if we didn't see a bracket + else + brackets_seen = 0; + + + } while ( (!seen_closing) && (ch != EOF) ); + + // check if this is an error token + if (ch == EOF) + { + token_kind = error; + } + else + { + token_text.erase(token_text.size()-3); + } + + + + } + // this is a comment token **************************** + else if (in.peek() == '-') + { + + token_text += ch1; + token_text += ch2; + token_text += '-'; + + token_kind = comment; + + // throw away the '-' char + in.get(); + + // make sure the next char is another '-' + std::istream::int_type ch = in.get(); + if (ch != '-') + { + token_kind = error; + break; + } + + token_text += '-'; + + + // get the rest of the chars and put them into token_text + int hyphens_seen = 0; // this is the number of '-' chars + // we have seen in a row + bool seen_closing = false; // true if we have seen ]]> + do + { + 
ch = in.get(); + + if (ch == '\n') + ++line_number; + + token_text += ch; + + // if this should be a closing block + if (hyphens_seen == 2) + { + if (ch == '>') + seen_closing = true; + else // this isn't a closing so make it signal error + ch = EOF; + } + // if we are seeing a hyphen + else if (ch == '-') + ++hyphens_seen; + // if we didn't see a hyphen + else + hyphens_seen = 0; + + + } while ( (!seen_closing) && (ch != EOF) ); + + // check if this is an error token + if (ch == EOF) + { + token_kind = error; + } + + + + + + } + else // this is a dtd token ************************* + { + + token_text += ch1; + token_text += ch2; + int bracket_depth = 1; // this is the number of '<' chars seen + // minus the number of '>' chars seen + + std::istream::int_type ch; + do + { + ch = in.get(); + if (ch == '>') + --bracket_depth; + else if (ch == '<') + ++bracket_depth; + else if (ch == '\n') + ++line_number; + + token_text += ch; + + } while ( (bracket_depth > 0) && (ch != EOF) ); + + // make sure we didn't just hit EOF + if (bracket_depth == 0) + { + token_kind = dtd; + } + else + { + token_kind = error; + } + } + } + break; + + // --------------------------------- + + // this is a pi token + case '?': + { + token_text += ch1; + token_text += ch2; + std::istream::int_type ch; + + do + { + ch = in.get(); + token_text += ch; + if (ch == '\n') + ++line_number; + // else if we hit a < then thats an error + else if (ch == '<') + ch = EOF; + } while (ch != '>' && ch != EOF); + // if we hit the end of the pi + if (ch == '>') + { + // make sure there was a trailing '?' 
+ if ( (token_text.size() > 3) && + (token_text[token_text.size()-2] != '?') + ) + { + token_kind = error; + } + else + { + token_kind = pi; + } + } + // if we hit EOF unexpectidely then error + else + { + token_kind = error; + } + } + break; + + // --------------------------------- + + // this is an error token + case EOF: + { + token_kind = error; + } + break; + + // --------------------------------- + // this is an element_end token + case '/': + { + token_kind = element_end; + token_text += ch1; + token_text += ch2; + std::istream::int_type ch; + do + { + ch = in.get(); + if (ch == '\n') + ++line_number; + // else if we hit a < then thats an error + else if (ch == '<') + ch = EOF; + token_text += ch; + } while ( (ch != '>') && (ch != EOF)); + + // check if this is an error token + if (ch == EOF) + { + token_kind = error; + } + } + break; + + + // --------------------------------- + + // this is an element_start or empty_element token + default: + { + + token_text += ch1; + token_text += ch2; + std::istream::int_type ch = '\0'; + std::istream::int_type last; + do + { + last = ch; + ch = in.get(); + if (ch == '\n') + ++line_number; + // else if we hit a < then thats an error + else if (ch == '<') + ch = EOF; + token_text += ch; + } while ( (ch != '>') && (ch != EOF)); + + // check if this is an error token + if (ch == EOF) + { + token_kind = error; + } + // if this is an empty_element + else if (last == '/') + { + token_kind = empty_element; + } + else + { + token_kind = element_start; + } + + + } + break; + + // --------------------------------- + + } + + } + break; + + // ----------------------------------------- + + // this is an eof token + case EOF: + { + token_kind = eof; + } + break; + + // ----------------------------------------- + + // this is a chars token + default: + { + if (ch1 == '\n') + { + ++line_number; + token_text += ch1; + } + // if the first thing in this chars token is an entity reference + else if (ch1 == '&') + { + + int temp = 
change_entity(in); + if (temp == -1) + { + token_kind = error; + break; + } + else + { + token_text += temp; + } + } + else + { + token_text += ch1; + } + + + token_kind = chars; + + std::istream::int_type ch = 0; + while (in.peek() != '<' && in.peek() != EOF) + { + ch = in.get(); + + if (ch == '\n') + ++line_number; + + // if this is one of the predefined entity references then change it + if (ch == '&') + { + int temp = change_entity(in); + if (temp == -1) + { + ch = EOF; + break; + } + else + token_text += temp; + } + else + { + token_text += ch; + } + } + + // if this is an error token + if (ch == EOF) + { + token_kind = error; + } + + } + break; + + // ----------------------------------------- + + } + + + } + + + +// ---------------------------------------------------------------------------------------- + + int xml_parser:: + parse_element ( + const std::string& token, + std::string& name, + attrib_list& atts + ) + { + name.erase(); + atts.list.clear(); + + // there must be at least one character between the <> + if (token[1] == '>') + return -1; + + std::string::size_type i; + std::istream::int_type ch = token[1]; + i = 2; + + // fill out name. 
the name can not contain any of the following characters + while ( (ch != '>') && + (ch != ' ') && + (ch != '=') && + (ch != '/') && + (ch != '\t') && + (ch != '\r') && + (ch != '\n') + ) + { + name += ch; + ch = token[i]; + ++i; + } + + // skip any whitespaces + while ( ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ) + { + ch = token[i]; + ++i; + } + + // find any attributes + while (ch != '>' && ch != '/') + { + std::string attribute_name; + std::string attribute_value; + + // fill out attribute_name + while ( (ch != '=') && + (ch != ' ') && + (ch != '\t') && + (ch != '\r') && + (ch != '\n') && + (ch != '>') + ) + { + attribute_name += ch; + ch = token[i]; + ++i; + } + + // you can't have empty attribute names + if (attribute_name.size() == 0) + return -1; + + // if we hit > too early then return error + if (ch == '>') + return -1; + + // skip any whitespaces + while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r') + { + ch = token[i]; + ++i; + } + + // the next char should be a '=', error if it's not + if (ch != '=') + return -1; + + // get the next char + ch = token[i]; + ++i; + + // skip any whitespaces + while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r') + { + ch = token[i]; + ++i; + } + + + // get the delimiter for the attribute value + std::istream::int_type delimiter = ch; // this should be either a ' or " character + ch = token[i]; // get the next char + ++i; + if (delimiter != '\'' && delimiter!='"') + return -1; + + + // fill out attribute_value + while ( (ch != delimiter) && + (ch != '>') + ) + { + attribute_value += ch; + ch = token[i]; + ++i; + } + + + // if there was no delimiter then this is an error + if (ch == '>') + { + return -1; + } + + // go to the next char + ch = token[i]; + ++i; + + // the next char must be either a '>' or '/' (denoting the end of the tag) + // or a white space character + if (ch != '>' && ch != ' ' && ch != '/' && ch != '\t' && ch !='\n' && ch !='\r') + return -1; + + // skip any whitespaces + while (ch == 
' ' || ch == '\t' || ch =='\n' || ch =='\r') + { + ch = token[i]; + ++i; + } + + + // add attribute_value and attribute_name to atts + if (atts.list.is_in_domain(attribute_name)) + { + // attributes may not be multiply defined + return -1; + } + else + { + atts.list.add(attribute_name,attribute_value); + } + + + } + + // you can't have an element with no name + if (name.size() == 0) + return -1; + + return 0; + + } + +// ---------------------------------------------------------------------------------------- + + int xml_parser:: + parse_pi ( + const std::string& token, + std::string& target, + std::string& data + ) + { + target.erase(); + data.erase(); + + std::istream::int_type ch = token[2]; + std::string::size_type i = 3; + while (ch != ' ' && ch != '?' && ch != '\t' && ch != '\n' && ch!='\r') + { + target += ch; + ch = token[i]; + ++i; + } + if (target.size() == 0) + return -1; + + // if we aren't at a ? character then go to the next character + if (ch != '?' ) + { + ch = token[i]; + ++i; + } + + // if we still aren't at the end of the processing instruction then + // set this stuff in the data section + while (ch != '?') + { + data += ch; + ch = token[i]; + ++i; + } + + return 0; + } + +// ---------------------------------------------------------------------------------------- + + int xml_parser:: + parse_element_end ( + const std::string& token, + std::string& name + ) + { + name.erase(); + std::string::size_type end = token.size()-1; + for (std::string::size_type i = 2; i < end; ++i) + { + if (token[i] == ' ' || token[i] == '\t' || token[i] == '\n'|| token[i] == '\r') + break; + name += token[i]; + } + + if (name.size() == 0) + return -1; + + return 0; + } + +// ---------------------------------------------------------------------------------------- + + int xml_parser:: + change_entity ( + std::istream& in + ) + { + + std::istream::int_type buf[6]; + + + buf[1] = in.get(); + + // if this is an undefined entity reference then return error + if (buf[1] != 'a' 
&& + buf[1] != 'l' && + buf[1] != 'g' && + buf[1] != 'q' + ) + return -1; + + + buf[2] = in.get(); + // if this is an undefined entity reference then return error + if (buf[2] != 'm' && + buf[2] != 't' && + buf[2] != 'p' && + buf[2] != 'u' + ) + return -1; + + + buf[3] = in.get(); + // if this is an undefined entity reference then return error + if (buf[3] != 'p' && + buf[3] != ';' && + buf[3] != 'o' + ) + return -1; + + // check if this is < or > + if (buf[3] == ';') + { + if (buf[2] != 't') + return -1; + + // if this is < then return '<' + if (buf[1] == 'l') + return '<'; + // if this is > then return '>' + if (buf[1] == 'g') + return '>'; + + // it is neither so it must be an undefined entity reference + return -1; + } + + + buf[4] = in.get(); + // if this should be & + if (buf[4] == ';') + { + // if this is not & then return error + if (buf[1] != 'a' || + buf[2] != 'm' || + buf[3] != 'p' + ) + return -1; + + return '&'; + } + + buf[5] = in.get(); + + // if this should be ' + if (buf[1] == 'a' && + buf[2] == 'p' && + buf[3] == 'o' && + buf[4] == 's' && + buf[5] == ';' + ) + return '\''; + + + // if this should be " + if (buf[1] == 'q' && + buf[2] == 'u' && + buf[3] == 'o' && + buf[4] == 't' && + buf[5] == ';' + ) + return '"'; + + + // it was an undefined entity reference + return -1; + + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class xml_parse_error : public error + { + public: + xml_parse_error( + const std::string& a + ): error(a) {} + }; + + namespace impl + { + class default_xml_error_handler : public error_handler + { + std::string filename; + + public: + + default_xml_error_handler ( + ) {} + + default_xml_error_handler ( + const std::string& filename_ + ) :filename(filename_) {} + + virtual void error ( + const unsigned long + ) + { + // just ignore non-fatal errors + } + + virtual void fatal_error ( + 
const unsigned long line_number + ) + { + std::ostringstream sout; + if (filename.size() != 0) + sout << "There is a fatal error on line " << line_number << " in the XML file '"<<filename<<"'."; + else + sout << "There is a fatal error on line " << line_number << " in the XML being processed."; + + throw xml_parse_error(sout.str()); + } + }; + } + + inline void parse_xml ( + std::istream& in, + document_handler& dh, + error_handler& eh + ) + { + if (!in) + throw xml_parse_error("Unexpected end of file during xml parsing."); + xml_parser parser; + parser.add_document_handler(dh); + parser.add_error_handler(eh); + parser.parse(in); + } + + inline void parse_xml ( + std::istream& in, + error_handler& eh, + document_handler& dh + ) + { + if (!in) + throw xml_parse_error("Unexpected end of file during xml parsing."); + xml_parser parser; + parser.add_document_handler(dh); + parser.add_error_handler(eh); + parser.parse(in); + } + + inline void parse_xml ( + std::istream& in, + error_handler& eh + ) + { + if (!in) + throw xml_parse_error("Unexpected end of file during xml parsing."); + xml_parser parser; + parser.add_error_handler(eh); + parser.parse(in); + } + + inline void parse_xml ( + std::istream& in, + document_handler& dh + ) + { + if (!in) + throw xml_parse_error("Unexpected end of file during xml parsing."); + xml_parser parser; + parser.add_document_handler(dh); + impl::default_xml_error_handler eh; + parser.add_error_handler(eh); + parser.parse(in); + } + +// ---------------------------------------------------------------------------------------- + + inline void parse_xml ( + const std::string& filename, + document_handler& dh, + error_handler& eh + ) + { + std::ifstream in(filename.c_str()); + if (!in) + throw xml_parse_error("Unable to open file '" + filename + "'."); + xml_parser parser; + parser.add_document_handler(dh); + parser.add_error_handler(eh); + parser.parse(in); + } + + inline void parse_xml ( + const std::string& filename, + error_handler& eh, + 
document_handler& dh + ) + { + std::ifstream in(filename.c_str()); + if (!in) + throw xml_parse_error("Unable to open file '" + filename + "'."); + xml_parser parser; + parser.add_document_handler(dh); + parser.add_error_handler(eh); + parser.parse(in); + } + + inline void parse_xml ( + const std::string& filename, + error_handler& eh + ) + { + std::ifstream in(filename.c_str()); + if (!in) + throw xml_parse_error("Unable to open file '" + filename + "'."); + xml_parser parser; + parser.add_error_handler(eh); + parser.parse(in); + } + + inline void parse_xml ( + const std::string& filename, + document_handler& dh + ) + { + std::ifstream in(filename.c_str()); + if (!in) + throw xml_parse_error("Unable to open file '" + filename + "'."); + xml_parser parser; + parser.add_document_handler(dh); + impl::default_xml_error_handler eh(filename); + parser.add_error_handler(eh); + parser.parse(in); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_XML_PARSER_KERNEl_1_ + diff --git a/ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h b/ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h new file mode 100644 index 000000000..45b513e55 --- /dev/null +++ b/ml/dlib/dlib/xml_parser/xml_parser_kernel_abstract.h @@ -0,0 +1,276 @@ +// Copyright (C) 2003 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_XML_PARSER_KERNEl_ABSTRACT_ +#ifdef DLIB_XML_PARSER_KERNEl_ABSTRACT_ + +#include <string> +#include <iosfwd> +#include "xml_parser_kernel_interfaces.h" + +namespace dlib +{ + + class xml_parser + { + + /*! + INITIAL VALUE + no objects are registered to receive events + + + WHAT THIS OBJECT REPRESENTS + This object represents a simple SAX style event driven XML parser. + It takes its input from an input stream object and sends events to all + registered document_handler and error_handler objects. 
+ + note that this xml parser ignores all DTD related XML markup. It will + parse XML documents with DTD's but it just won't check if the document + is valid. This also means that entity references may not be used except + for the predefined ones which are as follows: + &amp; + &lt; + &gt; + &apos; + &quot; + + also note that there is no interpreting of entity references inside + a CDATA section or inside of tags, they are only interpreted inside + normal non-markup data. + + This parser considers the end of the xml document to be the closing + tag of the root tag (as opposed to using EOF as the end of the + document). This is a deviation from the xml standard. + + Aside from ignoring DTD stuff and entity references everywhere but + data, and the above comment regarding EOF, this parser should conform + to the rest of the XML standard. + !*/ + + public: + + + xml_parser( + ); + /*! + ensures + - #*this is properly initialized + throws + - std::bad_alloc + !*/ + + virtual ~xml_parser( + ); + /*! + ensures + - all memory associated with *this has been released + !*/ + + void clear( + ); + /*! + ensures + - #*this has its initial value + throws + - std::bad_alloc + if this exception is thrown then *this is unusable + until clear() is called and succeeds + !*/ + + void parse ( + std::istream& in + ); + /*! + requires + - in.fail() == false + ensures + - the data from the input stream in will be parsed and the appropriate + events will be generated + - parsing will stop when the parser has reached the closing tag + for the xml document or EOF (whichever comes first). Note that + hitting EOF first is a fatal error. + throws + - std::bad_alloc + if parse() throws then it will be unusable until clear() is + called and succeeds + - other exceptions + document_handlers and error_handlers may throw any exception. If + they throw while parse() is running then parse() will let the + exception propagate out and the xml_parser object will be unusable + until clear() is called and succeeds. 
note that end_document() + is still called. + !*/ + + void add_document_handler ( + document_handler& item + ); + /*! + ensures + - item will now receive document events from the parser + throws + - std::bad_alloc + if add_document_handler() throws then it has no effect + !*/ + + void add_error_handler ( + error_handler& item + ); + /*! + ensures + - item will now receive error events from the parser + throws + - std::bad_alloc + if add_error_handler() throws then it has no effect + !*/ + + + void swap ( + xml_parser& item + ); + /*! + ensures + - swaps *this and item + !*/ + + + private: + + // restricted functions + xml_parser(xml_parser&); // copy constructor + xml_parser& operator=(xml_parser&); // assignment operator + + }; + + + inline void swap ( + xml_parser& a, + xml_parser& b + ) { a.swap(b); } + /*! + provides a global swap function + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class xml_parse_error : public error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception object thrown by the parse_xml() routines defined + below. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + void parse_xml ( + std::istream& in, + document_handler& dh, + error_handler& eh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input stream using the + supplied document_handler and error_handler. + !*/ + + void parse_xml ( + std::istream& in, + error_handler& eh, + document_handler& dh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input stream using the + supplied document_handler and error_handler. + !*/ + + void parse_xml ( + std::istream& in, + error_handler& eh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input stream using the + supplied error_handler. 
+ !*/ + + void parse_xml ( + std::istream& in, + document_handler& dh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input stream using the + supplied document_handler. + - Uses a default error handler that will throw an xml_parse_error exception + if a fatal parsing error is encountered. + throws + - xml_parse_error + Thrown if a fatal parsing error is encountered. + !*/ + +// ---------------------------------------------------------------------------------------- + + void parse_xml ( + const std::string& filename, + document_handler& dh, + error_handler& eh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input file using the + supplied error_handler and document_handler. + throws + - xml_parse_error + Thrown if there is a problem parsing the input file. + !*/ + + void parse_xml ( + const std::string& filename, + error_handler& eh, + document_handler& dh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input file using the + supplied error_handler and document_handler. + throws + - xml_parse_error + Thrown if there is a problem parsing the input file. + !*/ + + void parse_xml ( + const std::string& filename, + error_handler& eh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input file using the + supplied error_handler. + throws + - xml_parse_error + Thrown if there is a problem parsing the input file. + !*/ + + void parse_xml ( + const std::string& filename, + document_handler& dh + ); + /*! + ensures + - makes an xml_parser and tells it to parse the given input file using the + supplied document_handler. + - Uses a default error handler that will throw an xml_parse_error exception + if a fatal parsing error is encountered. + throws + - xml_parse_error + Thrown if there is a problem parsing the input file. 
+ !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_XML_PARSER_KERNEl_ABSTRACT_ + diff --git a/ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h b/ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h new file mode 100644 index 000000000..a0edf3317 --- /dev/null +++ b/ml/dlib/dlib/xml_parser/xml_parser_kernel_interfaces.h @@ -0,0 +1,244 @@ +// Copyright (C) 2003 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_XML_PARSER_KERNEl_INTERFACES_ +#define DLIB_XML_PARSER_KERNEl_INTERFACES_ + +#include <string> +#include "../interfaces/enumerable.h" +#include "../interfaces/map_pair.h" +#include "../error.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class xml_attribute_list_error : public dlib::error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an exception object thrown by attribute_list objects if you try to + access a non-existent attribute. + !*/ + public: + xml_attribute_list_error(const std::string& msg) : dlib::error(msg){} + }; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class attribute_list : public enumerable<map_pair<std::string,std::string> > + { + + /*! + WHAT THIS OBJECT REPRESENTS + this object represents a list of the attributes found in + an XML element. each attribute is associated with a value. + !*/ + + + public: + + inline virtual ~attribute_list ( + ) =0; + + + virtual bool is_in_list ( + const std::string& key + ) const =0; + /*! 
+ ensures + - returns true if there is an attribute named key in the list + - returns false otherwise + !*/ + + virtual const std::string& operator[] ( + const std::string& key + ) const =0; + /*! + ensures + - if (is_in_list(key) == true) then + - returns a const reference to the value associated with the attribute + named key. + - else + - throws xml_attribute_list_error + !*/ + + protected: + + // restricted functions + attribute_list& operator=(attribute_list&) {return *this;} + }; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class document_handler + { + /*! + EXCEPTIONS + a document_handler is allowed to throw any exception + + + WHAT THIS OBJECT REPRESENTS + this object is an interface for handling the basic events + generated by an XML parser + !*/ + + + public: + + inline virtual ~document_handler ( + ) =0; + + virtual void start_document ( + )=0; + /*! + requires + - is called when the document parsing begins + !*/ + + virtual void end_document ( + )=0; + /*! + requires + - is called after the document parsing has ended. note that this + is always called, even if an error occurs. + !*/ + + virtual void start_element ( + const unsigned long line_number, + const std::string& name, + const dlib::attribute_list& atts + )=0; + /*! + requires + - is called when an opening element tag is encountered. + - line_number == the line number where the opening tag for this element + was encountered. + - name == the name of the element encountered + - atts == a list containing all the attributes in this element and their + associated values + !*/ + + virtual void end_element ( + const unsigned long line_number, + const std::string& name + )=0; + /*! + requires + - is called when a closing element tag is encountered. 
(note that this + includes tags such as <example_tag/>. I.e. the previous tag would + trigger a start_element() callback as well as an end_element() callback) + - line_number == the line number where the closing tag for this + element was encountered + - name == the name of the element encountered + !*/ + + virtual void characters ( + const std::string& data + )=0; + /*! + requires + - is called just before we encounter a start_element, end_element, or + processing_instruction tag but only if there was data between the + last and next tag. + (i.e. data will never be "") + - data == all the normal non-markup data and CDATA between the next and + last tag in the document. + !*/ + + virtual void processing_instruction ( + const unsigned long line_number, + const std::string& target, + const std::string& data + )=0; + /*! + requires + - is called when a processing instruction is encountered + - line_number == the line number where this processing instruction + was encountered + - target == the target value for this processing instruction + - data == the data value for this processing instruction + !*/ + + protected: + + // restricted functions + document_handler& operator=(document_handler&) { return *this; } + }; + + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class error_handler + { + /*! + EXCEPTIONS + an error_handler is allowed to throw any exception + + + WHAT THIS OBJECT REPRESENTS + this object is an interface for handling the error/warning + events generated by an XML parser + !*/ + + public: + + inline virtual ~error_handler ( + ) =0; + + virtual void error ( + const unsigned long line_number + )=0; + /*! + requires + - is called when an error that does NOT require the parser to halt + is encountered. (i.e. 
somewhat minor errors in the input) + - line_number == the line number where this error was encountered + + the following events trigger an error: + an invalid processing instruction + !*/ + + virtual void fatal_error ( + const unsigned long line_number + )=0; + /*! + requires + - is called when an error that requires the parser to abort its parsing + is encountered (i.e. fatal errors in the input) + - line_number == the line number where this fatal error was encountered + + the following events trigger a fatal_error: + Everything other than the events listed above for error. + Also note that encountering an entity reference other than the + predefined ones listed in xml_parser_kernel_abstract is a fatal_error. + Hitting EOF before the closing tag for the document is also a fatal_error. + !*/ + + protected: + + // restricted functions + error_handler& operator=(error_handler&) { return *this;} + }; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + document_handler::~document_handler ( + ){} + attribute_list::~attribute_list ( + ){} + error_handler::~error_handler ( + ){} + +} + +#endif // DLIB_XML_PARSER_KERNEl_INTERFACES_ + |