diff options
Diffstat (limited to 'include/orcus/sax_ns_parser.hpp')
-rw-r--r-- | include/orcus/sax_ns_parser.hpp | 374 |
1 files changed, 374 insertions, 0 deletions
diff --git a/include/orcus/sax_ns_parser.hpp b/include/orcus/sax_ns_parser.hpp new file mode 100644 index 0000000..f888fa2 --- /dev/null +++ b/include/orcus/sax_ns_parser.hpp @@ -0,0 +1,374 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_ORCUS_SAX_NS_PARSER_HPP +#define INCLUDED_ORCUS_SAX_NS_PARSER_HPP + +#include "sax_parser.hpp" +#include "xml_namespace.hpp" + +#include <unordered_set> +#include <vector> +#include <algorithm> + +namespace orcus { + +struct sax_ns_parser_element +{ + /** Element namespace identifier. */ + xmlns_id_t ns; + /** Element namespace alias. */ + std::string_view ns_alias; + /** Element name. */ + std::string_view name; + /** Position of the opening brace '<'. */ + std::ptrdiff_t begin_pos; + /** Position immediately after the closing brace '>'. */ + std::ptrdiff_t end_pos; +}; + +struct sax_ns_parser_attribute +{ + /** Attribute namespace identifier. */ + xmlns_id_t ns; + /** Attribute namespace alias. */ + std::string_view ns_alias; + /** Attribute name. */ + std::string_view name; + /** Attribute value. */ + std::string_view value; + /** Whether or not the attribute value is transient. */ + bool transient; +}; + +namespace sax { namespace detail { + +struct entity_name +{ + std::string_view ns; + std::string_view name; + + entity_name(std::string_view _ns, std::string_view _name) : + ns(_ns), name(_name) {} + + bool operator== (const entity_name& other) const + { + return other.ns == ns && other.name == name; + } + + struct hash + { + size_t operator() (const entity_name& v) const + { + std::hash<std::string_view> hasher; + return hasher(v.ns) + hasher(v.name); + } + }; +}; + +typedef std::unordered_set<std::string_view> ns_keys_type; +typedef std::unordered_set<entity_name, entity_name::hash> entity_names_type; + +struct elem_scope +{ + xmlns_id_t ns; + std::string_view name; + ns_keys_type ns_keys; + + elem_scope() {} + elem_scope(const elem_scope&) = delete; + elem_scope(elem_scope&& other) = default; +}; + +using elem_scopes_type = std::vector<elem_scope>; + +}} // namespace sax::detail + +class sax_ns_handler +{ +public: + /** + * Called when a doctype declaration <!DOCTYPE ... > is encountered. + * + * @param dtd struct containing doctype declaration data. + */ + void doctype(const orcus::sax::doctype_declaration& dtd) + { + (void)dtd; + } + + /** + * Called when <?... is encountered, where the '...' may be an + * arbitraray dentifier. One common declaration is <?xml which is + * typically given at the start of an XML stream. + * + * @param decl name of the identifier. + */ + void start_declaration(std::string_view decl) + { + (void)decl; + } + + /** + * Called when the closing tag (>) of a <?... ?> is encountered. + * + * @param decl name of the identifier. + */ + void end_declaration(std::string_view decl) + { + (void)decl; + } + + /** + * Called at the start of each element. + * + * @param elem information of the element being parsed. + */ + void start_element(const orcus::sax_ns_parser_element& elem) + { + (void)elem; + } + + /** + * Called at the end of each element. + * + * @param elem information of the element being parsed. + */ + void end_element(const orcus::sax_ns_parser_element& elem) + { + (void)elem; + } + + /** + * Called when a segment of a text content is parsed. Each text content + * is a direct child of an element, which may have multiple child contents + * when the element also has a child element that are direct sibling to + * the text contents or the text contents are splitted by a comment. + * + * @param val value of the text content. + * @param transient when true, the text content has been converted and is + * stored in a temporary buffer due to presence of one or + * more encoded characters, in which case <em>the passed + * text value needs to be either immediately converted to + * a non-text value or be interned within the scope of + * the callback</em>. + */ + void characters(std::string_view val, bool transient) + { + (void)val; + (void)transient; + } + + /** + * Called upon parsing of an attribute of a declaration. The value of an + * attribute is assumed to be transient thus should be consumed within the + * scope of this callback. + * + * @param name name of an attribute. + * @param val value of an attribute. + * + * @todo Perhaps we should pass the transient flag here as well like all the + * other places. + */ + void attribute(std::string_view name, std::string_view val) + { + (void)name; + (void)val; + } + + /** + * Called upon parsing of an attribute of an element. Note that <em>when + * the attribute's transient flag is set, the attribute value is stored in + * a temporary buffer due to a presence of encoded characters, and must be + * processed within the scope of the callback</em>. + * + * @param attr struct containing attribute information. + */ + void attribute(const orcus::sax_ns_parser_attribute& attr) + { + (void)attr; + } +}; + +/** + * SAX based XML parser with extra namespace handling. + * + * It uses an instance of xmlns_context passed by the caller to validate and + * convert namespace values into identifiers. The namespace identifier of + * each encountered element is always given even if one is not explicitly + * given. + * + * This parser keeps track of element scopes and detects non-matching element + * pairs. + * + * @tparam HandlerT Handler type with member functions for event callbacks. + * Refer to @ref sax_ns_handler. + */ +template<typename HandlerT> +class sax_ns_parser +{ +public: + typedef HandlerT handler_type; + + sax_ns_parser(std::string_view content, xmlns_context& ns_cxt, handler_type& handler); + ~sax_ns_parser() = default; + + /** + * Start parsing the document. + * + * @exception orcus::malformed_xml_error when it encounters a + * non-matching closing element. + */ + void parse(); + +private: + /** + * Re-route callbacks from the internal sax_parser into sax_ns_parser + * callbacks. + */ + class handler_wrapper + { + sax::detail::elem_scopes_type m_scopes; + sax::detail::ns_keys_type m_ns_keys; + sax::detail::entity_names_type m_attrs; + + sax_ns_parser_element m_elem; + sax_ns_parser_attribute m_attr; + + xmlns_context& m_ns_cxt; + handler_type& m_handler; + + bool m_declaration; + + public: + handler_wrapper(xmlns_context& ns_cxt, handler_type& handler) : m_ns_cxt(ns_cxt), m_handler(handler), m_declaration(false) {} + + void doctype(const sax::doctype_declaration& dtd) + { + m_handler.doctype(dtd); + } + + void start_declaration(std::string_view name) + { + m_declaration = true; + m_handler.start_declaration(name); + } + + void end_declaration(std::string_view name) + { + m_declaration = false; + m_handler.end_declaration(name); + } + + void start_element(const sax::parser_element& elem) + { + m_scopes.emplace_back(); + sax::detail::elem_scope& scope = m_scopes.back(); + scope.ns = m_ns_cxt.get(elem.ns); + scope.name = elem.name; + scope.ns_keys.swap(m_ns_keys); + + m_elem.ns = scope.ns; + m_elem.ns_alias = elem.ns; + m_elem.name = scope.name; + m_elem.begin_pos = elem.begin_pos; + m_elem.end_pos = elem.end_pos; + m_handler.start_element(m_elem); + + m_attrs.clear(); + } + + void end_element(const sax::parser_element& elem) + { + sax::detail::elem_scope& scope = m_scopes.back(); + if (scope.ns != m_ns_cxt.get(elem.ns) || scope.name != elem.name) + throw malformed_xml_error("mis-matching closing element.", -1); + + m_elem.ns = scope.ns; + m_elem.ns_alias = elem.ns; + m_elem.name = scope.name; + m_elem.begin_pos = elem.begin_pos; + m_elem.end_pos = elem.end_pos; + m_handler.end_element(m_elem); + + // Pop all namespaces declared in this scope. + for (const std::string_view& key : scope.ns_keys) + m_ns_cxt.pop(key); + + m_scopes.pop_back(); + } + + void characters(std::string_view val, bool transient) + { + m_handler.characters(val, transient); + } + + void attribute(const sax::parser_attribute& attr) + { + if (m_declaration) + { + // XML declaration attribute. Pass it through to the handler without namespace. + m_handler.attribute(attr.name, attr.value); + return; + } + + if (m_attrs.count(sax::detail::entity_name(attr.ns, attr.name)) > 0) + throw malformed_xml_error( + "You can't define two attributes of the same name in the same element.", -1); + + m_attrs.insert(sax::detail::entity_name(attr.ns, attr.name)); + + if (attr.ns.empty() && attr.name == "xmlns") + { + // Default namespace + m_ns_cxt.push(std::string_view{}, attr.value); + m_ns_keys.insert(std::string_view{}); + return; + } + + if (attr.ns == "xmlns") + { + // Namespace alias + if (!attr.name.empty()) + { + m_ns_cxt.push(attr.name, attr.value); + m_ns_keys.insert(attr.name); + } + return; + } + + m_attr.ns = attr.ns.empty() ? XMLNS_UNKNOWN_ID : m_ns_cxt.get(attr.ns); + m_attr.ns_alias = attr.ns; + m_attr.name = attr.name; + m_attr.value = attr.value; + m_attr.transient = attr.transient; + m_handler.attribute(m_attr); + } + }; + +private: + handler_wrapper m_wrapper; + sax_parser<handler_wrapper> m_parser; +}; + +template<typename HandlerT> +sax_ns_parser<HandlerT>::sax_ns_parser( + std::string_view content, xmlns_context& ns_cxt, handler_type& handler) : + m_wrapper(ns_cxt, handler), m_parser(content, m_wrapper) +{ +} + +template<typename HandlerT> +void sax_ns_parser<HandlerT>::parse() +{ + m_parser.parse(); +} + +} + +#endif +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |