diff options
Diffstat (limited to 'include/orcus/sax_parser.hpp')
-rw-r--r-- | include/orcus/sax_parser.hpp | 576 |
1 files changed, 576 insertions, 0 deletions
diff --git a/include/orcus/sax_parser.hpp b/include/orcus/sax_parser.hpp new file mode 100644 index 0000000..f7283d2 --- /dev/null +++ b/include/orcus/sax_parser.hpp @@ -0,0 +1,576 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP +#define INCLUDED_ORCUS_SAX_PARSER_HPP + +#include "sax_parser_base.hpp" + +#include <string_view> + +namespace orcus { + +struct sax_parser_default_config +{ + /** + * An integer value representing a baseline XML version. A value of 10 + * corresponds with version 1.0 whereas a value of 11 corresponds with + * version 1.1. + */ + static constexpr uint8_t baseline_version = 10; +}; + +class sax_handler +{ +public: + /** + * Called when a doctype declaration <!DOCTYPE ... > is encountered. + * + * @param dtd struct containing doctype declaration data. + */ + void doctype(const orcus::sax::doctype_declaration& dtd) + { + (void)dtd; + } + + /** + * Called when <?... is encountered, where the '...' may be an + * arbitraray dentifier. One common declaration is <?xml which is + * typically given at the start of an XML stream. + * + * @param decl name of the identifier. + */ + void start_declaration(std::string_view decl) + { + (void)decl; + } + + /** + * Called when the closing tag (>) of a <?... ?> is encountered. + * + * @param decl name of the identifier. + */ + void end_declaration(std::string_view decl) + { + (void)decl; + } + + /** + * Called at the start of each element. + * + * @param elem information of the element being parsed. + */ + void start_element(const orcus::sax::parser_element& elem) + { + (void)elem; + } + + /** + * Called at the end of each element. + * + * @param elem information of the element being parsed. + */ + void end_element(const orcus::sax::parser_element& elem) + { + (void)elem; + } + + /** + * Called when a segment of a text content is parsed. Each text content + * is a direct child of an element, which may have multiple child contents + * when the element also has a child element that are direct sibling to + * the text contents or the text contents are splitted by a comment. + * + * @param val value of the text content. + * @param transient when true, the text content has been converted and is + * stored in a temporary buffer due to presence of one or + * more encoded characters, in which case <em>the passed + * text value needs to be either immediately converted to + * a non-text value or be interned within the scope of + * the callback</em>. + */ + void characters(std::string_view val, bool transient) + { + (void)val; (void)transient; + } + + /** + * Called upon parsing of an attribute of an element. Note that <em>when + * the attribute's transient flag is set, the attribute value is stored in + * a temporary buffer due to presence of one or more encoded characters, + * and must be processed within the scope of the callback</em>. + * + * @param attr struct containing attribute information. + */ + void attribute(const orcus::sax::parser_attribute& attr) + { + (void)attr; + } +}; + +/** + * SAX parser for XML documents. + * + * This parser is barebone in that it only parses the document and picks up + * all encountered elements and attributes without checking proper element + * pairs. The user is responsible for checking whether or not the document is + * well-formed in terms of element scopes. + * + * This parser additionally records the begin and end offset positions of each + * element. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to @ref sax_handler. + * @tparam ConfigT Parser configuration. + */ +template<typename HandlerT, typename ConfigT = sax_parser_default_config> +class sax_parser : public sax::parser_base +{ +public: + typedef HandlerT handler_type; + typedef ConfigT config_type; + + sax_parser(std::string_view content, handler_type& handler); + ~sax_parser() = default; + + void parse(); + +private: + + /** + * Parse XML header that occurs at the beginning of every XML stream i.e. + * <?xml version="..." encoding="..." ?> + */ + void header(); + void body(); + void element(); + void element_open(std::ptrdiff_t begin_pos); + void element_close(std::ptrdiff_t begin_pos); + void special_tag(); + void declaration(const char* name_check); + void cdata(); + void doctype(); + void characters(); + void attribute(); + +private: + handler_type& m_handler; +}; + +template<typename HandlerT, typename ConfigT> +sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) : + sax::parser_base(content.data(), content.size()), + m_handler(handler) +{ +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::parse() +{ + m_nest_level = 0; + mp_char = mp_begin; + header(); + skip_space_and_control(); + body(); + + assert(m_buffer_pos == 0); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::header() +{ + // we don't handle multi byte encodings so we can just skip bom entry if exists. + skip_bom(); + + // Allow leading whitespace in the XML stream. + // TODO : Make this configurable since strictly speaking such an XML + // sttream is invalid. + skip_space_and_control(); + + if (!has_char() || cur_char() != '<') + throw malformed_xml_error("xml file must begin with '<'.", offset()); + + if (config_type::baseline_version >= 11) + { + // XML version 1.1 requires a header declaration whereas in 1.0 it's + // optional. + if (next_char_checked() != '?') + throw malformed_xml_error("xml file must begin with '<?'.", offset()); + + declaration("xml"); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::body() +{ + while (has_char()) + { + if (cur_char() == '<') + { + element(); + if (!m_root_elem_open) + // Root element closed. Stop parsing. + return; + } + else if (m_nest_level) + // Call characters only when in xml hierarchy. + characters(); + else + next(); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::element() +{ + assert(cur_char() == '<'); + std::ptrdiff_t pos = offset(); + char c = next_char_checked(); + switch (c) + { + case '/': + element_close(pos); + return; + case '!': + special_tag(); + return; + case '?': + declaration(nullptr); + return; + } + + element_open(pos); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos) +{ + sax::parser_element elem; + element_name(elem, begin_pos); + + while (true) + { + skip_space_and_control(); + char c = cur_char_checked(); + if (c == '/') + { + // Self-closing element: <element/> + if (next_and_char() != '>') + throw malformed_xml_error("expected '/>' to self-close the element.", offset()); + next(); + elem.end_pos = offset(); + m_handler.start_element(elem); + reset_buffer_pos(); + m_handler.end_element(elem); + if (!m_nest_level) + m_root_elem_open = false; +#if ORCUS_DEBUG_SAX_PARSER + cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl; +#endif + return; + } + else if (c == '>') + { + // End of opening element: <element> + next(); + elem.end_pos = offset(); + nest_up(); + m_handler.start_element(elem); + reset_buffer_pos(); +#if ORCUS_DEBUG_SAX_PARSER + cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl; +#endif + return; + } + else + attribute(); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos) +{ + assert(cur_char() == '/'); + nest_down(); + next_check(); + sax::parser_element elem; + element_name(elem, begin_pos); + + if (cur_char() != '>') + throw malformed_xml_error("expected '>' to close the element.", offset()); + next(); + elem.end_pos = offset(); + + m_handler.end_element(elem); +#if ORCUS_DEBUG_SAX_PARSER + cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl; +#endif + if (!m_nest_level) + m_root_elem_open = false; +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::special_tag() +{ + assert(cur_char() == '!'); + // This can be either <![CDATA, <!--, or <!DOCTYPE. + size_t len = available_size(); + if (len < 2) + throw malformed_xml_error("special tag too short.", offset()); + + switch (next_and_char()) + { + case '-': + { + // Possibly comment. + if (next_and_char() != '-') + throw malformed_xml_error("comment expected.", offset()); + + len -= 2; + if (len < 3) + throw malformed_xml_error("malformed comment.", offset()); + + next(); + comment(); + } + break; + case '[': + { + // Possibly a CDATA. + expects_next("CDATA[", 6); + if (has_char()) + cdata(); + } + break; + case 'D': + { + // check if this is a DOCTYPE. + expects_next("OCTYPE", 6); + skip_space_and_control(); + if (has_char()) + doctype(); + } + break; + default: + throw malformed_xml_error("failed to parse special tag.", offset()); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check) +{ + assert(cur_char() == '?'); + next_check(); + + // Get the declaration name first. + std::string_view decl_name; + name(decl_name); +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl; +#endif + + if (name_check && decl_name != name_check) + { + std::ostringstream os; + os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead."; + throw malformed_xml_error(os.str(), offset()); + } + + m_handler.start_declaration(decl_name); + skip_space_and_control(); + + // Parse the attributes. + while (cur_char_checked() != '?') + { + attribute(); + skip_space_and_control(); + } + if (next_char_checked() != '>') + throw malformed_xml_error("declaration must end with '?>'.", offset()); + + m_handler.end_declaration(decl_name); + reset_buffer_pos(); + next(); +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl; +#endif +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::cdata() +{ + size_t len = available_size(); + assert(len > 3); + + // Parse until we reach ']]>'. + const char* p0 = mp_char; + size_t i = 0, match = 0; + for (char c = cur_char(); i < len; ++i, c = next_and_char()) + { + if (c == ']') + { + // Be aware that we may encounter a series of more than two ']' + // characters, in which case we'll only count the last two. + + if (match == 0) + // First ']' + ++match; + else if (match == 1) + // Second ']' + ++match; + } + else if (c == '>' && match == 2) + { + // Found ']]>'. + size_t cdata_len = i - 2; + m_handler.characters(std::string_view(p0, cdata_len), false); + next(); + return; + } + else + match = 0; + } + throw malformed_xml_error("malformed CDATA section.", offset()); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::doctype() +{ + // Parse the root element first. + sax::doctype_declaration param; + name(param.root_element); + skip_space_and_control(); + + // Either PUBLIC or SYSTEM. + size_t len = available_size(); + if (len < 6) + throw malformed_xml_error("DOCTYPE section too short.", offset()); + + param.keyword = sax::doctype_declaration::keyword_type::dtd_private; + char c = cur_char(); + if (c == 'P') + { + if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C') + throw malformed_xml_error("malformed DOCTYPE section.", offset()); + + param.keyword = sax::doctype_declaration::keyword_type::dtd_public; + } + else if (c == 'S') + { + if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M') + throw malformed_xml_error("malformed DOCTYPE section.", offset()); + } + + next_check(); + skip_space_and_control(); + + // Parse FPI. + value(param.fpi, false); + + has_char_throw("DOCTYPE section too short."); + skip_space_and_control(); + has_char_throw("DOCTYPE section too short."); + + if (cur_char() == '>') + { + // Optional URI not given. Exit. +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl; +#endif + m_handler.doctype(param); + next(); + return; + } + + // Parse optional URI. + value(param.uri, false); + + has_char_throw("DOCTYPE section too short."); + skip_space_and_control(); + has_char_throw("DOCTYPE section too short."); + + if (cur_char() != '>') + throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset()); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl; +#endif + m_handler.doctype(param); + next(); +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::characters() +{ + const char* p0 = mp_char; + for (; has_char(); next()) + { + if (cur_char() == '<') + break; + + if (cur_char() == '&') + { + // Text span with one or more encoded characters. Parse using cell buffer. + cell_buffer& buf = get_cell_buffer(); + buf.reset(); + buf.append(p0, mp_char-p0); + characters_with_encoded_char(buf); + if (buf.empty()) + m_handler.characters(std::string_view{}, false); + else + m_handler.characters(buf.str(), true); + return; + } + } + + if (mp_char > p0) + { + std::string_view val(p0, mp_char-p0); + m_handler.characters(val, false); + } +} + +template<typename HandlerT, typename ConfigT> +void sax_parser<HandlerT,ConfigT>::attribute() +{ + sax::parser_attribute attr; + attribute_name(attr.ns, attr.name); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl; +#endif + + skip_space_and_control(); + + char c = cur_char_checked(); + if (c != '=') + { + std::ostringstream os; + os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')"; + throw malformed_xml_error(os.str(), offset()); + } + + next_check(); // skip the '='. + skip_space_and_control(); + + attr.transient = value(attr.value, true); + if (attr.transient) + // Value is stored in a temporary buffer. Push a new buffer. + inc_buffer_pos(); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::attribute: value='" << attr.value << "'" << endl; +#endif + + m_handler.attribute(attr); +} + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |