/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifndef INCLUDED_ORCUS_SAX_PARSER_HPP #define INCLUDED_ORCUS_SAX_PARSER_HPP #include "sax_parser_base.hpp" #include namespace orcus { struct sax_parser_default_config { /** * An integer value representing a baseline XML version. A value of 10 * corresponds with version 1.0 whereas a value of 11 corresponds with * version 1.1. */ static constexpr uint8_t baseline_version = 10; }; class sax_handler { public: /** * Called when a doctype declaration <!DOCTYPE ... > is encountered. * * @param dtd struct containing doctype declaration data. */ void doctype(const orcus::sax::doctype_declaration& dtd) { (void)dtd; } /** * Called when <?... is encountered, where the '...' may be an * arbitraray dentifier. One common declaration is <?xml which is * typically given at the start of an XML stream. * * @param decl name of the identifier. */ void start_declaration(std::string_view decl) { (void)decl; } /** * Called when the closing tag (>) of a <?... ?> is encountered. * * @param decl name of the identifier. */ void end_declaration(std::string_view decl) { (void)decl; } /** * Called at the start of each element. * * @param elem information of the element being parsed. */ void start_element(const orcus::sax::parser_element& elem) { (void)elem; } /** * Called at the end of each element. * * @param elem information of the element being parsed. */ void end_element(const orcus::sax::parser_element& elem) { (void)elem; } /** * Called when a segment of a text content is parsed. Each text content * is a direct child of an element, which may have multiple child contents * when the element also has a child element that are direct sibling to * the text contents or the text contents are splitted by a comment. * * @param val value of the text content. * @param transient when true, the text content has been converted and is * stored in a temporary buffer due to presence of one or * more encoded characters, in which case the passed * text value needs to be either immediately converted to * a non-text value or be interned within the scope of * the callback. */ void characters(std::string_view val, bool transient) { (void)val; (void)transient; } /** * Called upon parsing of an attribute of an element. Note that when * the attribute's transient flag is set, the attribute value is stored in * a temporary buffer due to presence of one or more encoded characters, * and must be processed within the scope of the callback. * * @param attr struct containing attribute information. */ void attribute(const orcus::sax::parser_attribute& attr) { (void)attr; } }; /** * SAX parser for XML documents. * * This parser is barebone in that it only parses the document and picks up * all encountered elements and attributes without checking proper element * pairs. The user is responsible for checking whether or not the document is * well-formed in terms of element scopes. * * This parser additionally records the begin and end offset positions of each * element. * * @tparam HandlerT Handler type with member functions for event callbacks. * Refer to @ref sax_handler. * @tparam ConfigT Parser configuration. */ template class sax_parser : public sax::parser_base { public: typedef HandlerT handler_type; typedef ConfigT config_type; sax_parser(std::string_view content, handler_type& handler); ~sax_parser() = default; void parse(); private: /** * Parse XML header that occurs at the beginning of every XML stream i.e. * */ void header(); void body(); void element(); void element_open(std::ptrdiff_t begin_pos); void element_close(std::ptrdiff_t begin_pos); void special_tag(); void declaration(const char* name_check); void cdata(); void doctype(); void characters(); void attribute(); private: handler_type& m_handler; }; template sax_parser::sax_parser(std::string_view content, handler_type& handler) : sax::parser_base(content.data(), content.size()), m_handler(handler) { } template void sax_parser::parse() { m_nest_level = 0; mp_char = mp_begin; header(); skip_space_and_control(); body(); assert(m_buffer_pos == 0); } template void sax_parser::header() { // we don't handle multi byte encodings so we can just skip bom entry if exists. skip_bom(); // Allow leading whitespace in the XML stream. // TODO : Make this configurable since strictly speaking such an XML // sttream is invalid. skip_space_and_control(); if (!has_char() || cur_char() != '<') throw malformed_xml_error("xml file must begin with '<'.", offset()); if (config_type::baseline_version >= 11) { // XML version 1.1 requires a header declaration whereas in 1.0 it's // optional. if (next_char_checked() != '?') throw malformed_xml_error("xml file must begin with ' void sax_parser::body() { while (has_char()) { if (cur_char() == '<') { element(); if (!m_root_elem_open) // Root element closed. Stop parsing. return; } else if (m_nest_level) // Call characters only when in xml hierarchy. characters(); else next(); } } template void sax_parser::element() { assert(cur_char() == '<'); std::ptrdiff_t pos = offset(); char c = next_char_checked(); switch (c) { case '/': element_close(pos); return; case '!': special_tag(); return; case '?': declaration(nullptr); return; } element_open(pos); } template void sax_parser::element_open(std::ptrdiff_t begin_pos) { sax::parser_element elem; element_name(elem, begin_pos); while (true) { skip_space_and_control(); char c = cur_char_checked(); if (c == '/') { // Self-closing element: if (next_and_char() != '>') throw malformed_xml_error("expected '/>' to self-close the element.", offset()); next(); elem.end_pos = offset(); m_handler.start_element(elem); reset_buffer_pos(); m_handler.end_element(elem); if (!m_nest_level) m_root_elem_open = false; #if ORCUS_DEBUG_SAX_PARSER cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl; #endif return; } else if (c == '>') { // End of opening element: next(); elem.end_pos = offset(); nest_up(); m_handler.start_element(elem); reset_buffer_pos(); #if ORCUS_DEBUG_SAX_PARSER cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl; #endif return; } else attribute(); } } template void sax_parser::element_close(std::ptrdiff_t begin_pos) { assert(cur_char() == '/'); nest_down(); next_check(); sax::parser_element elem; element_name(elem, begin_pos); if (cur_char() != '>') throw malformed_xml_error("expected '>' to close the element.", offset()); next(); elem.end_pos = offset(); m_handler.end_element(elem); #if ORCUS_DEBUG_SAX_PARSER cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl; #endif if (!m_nest_level) m_root_elem_open = false; } template void sax_parser::special_tag() { assert(cur_char() == '!'); // This can be either void sax_parser::declaration(const char* name_check) { assert(cur_char() == '?'); next_check(); // Get the declaration name first. std::string_view decl_name; name(decl_name); #if ORCUS_DEBUG_SAX_PARSER cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl; #endif if (name_check && decl_name != name_check) { std::ostringstream os; os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead."; throw malformed_xml_error(os.str(), offset()); } m_handler.start_declaration(decl_name); skip_space_and_control(); // Parse the attributes. while (cur_char_checked() != '?') { attribute(); skip_space_and_control(); } if (next_char_checked() != '>') throw malformed_xml_error("declaration must end with '?>'.", offset()); m_handler.end_declaration(decl_name); reset_buffer_pos(); next(); #if ORCUS_DEBUG_SAX_PARSER cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl; #endif } template void sax_parser::cdata() { size_t len = available_size(); assert(len > 3); // Parse until we reach ']]>'. const char* p0 = mp_char; size_t i = 0, match = 0; for (char c = cur_char(); i < len; ++i, c = next_and_char()) { if (c == ']') { // Be aware that we may encounter a series of more than two ']' // characters, in which case we'll only count the last two. if (match == 0) // First ']' ++match; else if (match == 1) // Second ']' ++match; } else if (c == '>' && match == 2) { // Found ']]>'. size_t cdata_len = i - 2; m_handler.characters(std::string_view(p0, cdata_len), false); next(); return; } else match = 0; } throw malformed_xml_error("malformed CDATA section.", offset()); } template void sax_parser::doctype() { // Parse the root element first. sax::doctype_declaration param; name(param.root_element); skip_space_and_control(); // Either PUBLIC or SYSTEM. size_t len = available_size(); if (len < 6) throw malformed_xml_error("DOCTYPE section too short.", offset()); param.keyword = sax::doctype_declaration::keyword_type::dtd_private; char c = cur_char(); if (c == 'P') { if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C') throw malformed_xml_error("malformed DOCTYPE section.", offset()); param.keyword = sax::doctype_declaration::keyword_type::dtd_public; } else if (c == 'S') { if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M') throw malformed_xml_error("malformed DOCTYPE section.", offset()); } next_check(); skip_space_and_control(); // Parse FPI. value(param.fpi, false); has_char_throw("DOCTYPE section too short."); skip_space_and_control(); has_char_throw("DOCTYPE section too short."); if (cur_char() == '>') { // Optional URI not given. Exit. #if ORCUS_DEBUG_SAX_PARSER cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl; #endif m_handler.doctype(param); next(); return; } // Parse optional URI. value(param.uri, false); has_char_throw("DOCTYPE section too short."); skip_space_and_control(); has_char_throw("DOCTYPE section too short."); if (cur_char() != '>') throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset()); #if ORCUS_DEBUG_SAX_PARSER cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl; #endif m_handler.doctype(param); next(); } template void sax_parser::characters() { const char* p0 = mp_char; for (; has_char(); next()) { if (cur_char() == '<') break; if (cur_char() == '&') { // Text span with one or more encoded characters. Parse using cell buffer. cell_buffer& buf = get_cell_buffer(); buf.reset(); buf.append(p0, mp_char-p0); characters_with_encoded_char(buf); if (buf.empty()) m_handler.characters(std::string_view{}, false); else m_handler.characters(buf.str(), true); return; } } if (mp_char > p0) { std::string_view val(p0, mp_char-p0); m_handler.characters(val, false); } } template void sax_parser::attribute() { sax::parser_attribute attr; attribute_name(attr.ns, attr.name); #if ORCUS_DEBUG_SAX_PARSER cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl; #endif skip_space_and_control(); char c = cur_char_checked(); if (c != '=') { std::ostringstream os; os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')"; throw malformed_xml_error(os.str(), offset()); } next_check(); // skip the '='. skip_space_and_control(); attr.transient = value(attr.value, true); if (attr.transient) // Value is stored in a temporary buffer. Push a new buffer. inc_buffer_pos(); #if ORCUS_DEBUG_SAX_PARSER cout << "sax_parser::attribute: value='" << attr.value << "'" << endl; #endif m_handler.attribute(attr); } } #endif /* vim:set shiftwidth=4 softtabstop=4 expandtab: */