summaryrefslogtreecommitdiffstats
path: root/src/liborcus/dom_tree.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/liborcus/dom_tree.cpp')
-rw-r--r--src/liborcus/dom_tree.cpp741
1 files changed, 741 insertions, 0 deletions
diff --git a/src/liborcus/dom_tree.cpp b/src/liborcus/dom_tree.cpp
new file mode 100644
index 0000000..9e7a764
--- /dev/null
+++ b/src/liborcus/dom_tree.cpp
@@ -0,0 +1,741 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <orcus/dom_tree.hpp>
+#include <orcus/exception.hpp>
+#include <orcus/xml_namespace.hpp>
+#include <orcus/sax_ns_parser.hpp>
+#include <orcus/string_pool.hpp>
+
+#include <iostream>
+#include <sstream>
+#include <cassert>
+#include <unordered_map>
+#include <vector>
+#include <algorithm>
+#include <deque>
+
+namespace orcus {
+
+namespace dom {
+
+namespace {
+
+/**
+ * Escape certain characters with backslash (\).
+ */
+void escape(std::ostream& os, std::string_view val)
+{
+ if (val.empty())
+ return;
+
+ const char* p = val.data();
+ const char* p_end = p + val.size();
+ for (; p != p_end; ++p)
+ {
+ if (*p == '"')
+ os << "\\\"";
+ else if (*p == '\\')
+ os << "\\\\";
+ else
+ os << *p;
+ }
+}
+
+struct attr
+{
+ dom::entity_name name;
+ std::string_view value;
+
+ attr(xmlns_id_t _ns, std::string_view _name, std::string_view _value);
+};
+
+struct entity_name_hash
+{
+ std::size_t operator()(const entity_name& v) const
+ {
+ return std::hash<std::string_view>{}(v.name) ^ reinterpret_cast<std::size_t>(v.ns);
+ }
+};
+
+using attrs_type = std::vector<attr>;
+using attr_map_type = std::unordered_map<entity_name, size_t, entity_name_hash>;
+
+struct declaration
+{
+ attrs_type attrs;
+ attr_map_type attr_map;
+};
+
+enum class node_type { element, content };
+
+struct element;
+
+struct node
+{
+ const element* parent;
+ node_type type;
+
+ node(node_type _type) : parent(nullptr), type(_type) {}
+
+ virtual ~node() = 0;
+ virtual void print(std::ostream& os, const xmlns_context& cxt) const = 0;
+};
+
+typedef std::vector<std::unique_ptr<node>> nodes_type;
+
+struct element : public node
+{
+ entity_name name;
+ attrs_type attrs;
+ attr_map_type attr_map;
+ nodes_type child_nodes;
+ std::vector<size_t> child_elem_positions;
+
+ element() = delete;
+ element(xmlns_id_t _ns, std::string_view _name);
+ virtual void print(std::ostream& os, const xmlns_context& cxt) const;
+ virtual ~element();
+};
+
+struct content : public node
+{
+ std::string_view value;
+
+ content(std::string_view _value);
+ virtual void print(std::ostream& os, const xmlns_context& cxt) const;
+ virtual ~content();
+};
+
+void print(std::ostream& os, const entity_name& name, const xmlns_context& cxt)
+{
+ if (name.ns)
+ {
+ size_t index = cxt.get_index(name.ns);
+ if (index != INDEX_NOT_FOUND)
+ os << "ns" << index << ':';
+ }
+ os << name.name;
+}
+
+void print(std::ostream& os, const attr& at, const xmlns_context& cxt)
+{
+ dom::print(os, at.name, cxt);
+ os << "=\"";
+ escape(os, at.value);
+ os << '"';
+}
+
+attr::attr(xmlns_id_t _ns, std::string_view _name, std::string_view _value) :
+ name(_ns, _name), value(_value) {}
+
+node::~node() {}
+
+element::element(xmlns_id_t _ns, std::string_view _name) :
+ node(node_type::element), name(_ns, _name) {}
+
+void element::print(std::ostream& os, const xmlns_context& cxt) const
+{
+ dom::print(os, name, cxt);
+}
+
+element::~element() = default;
+
+content::content(std::string_view _value) : node(node_type::content), value(_value) {}
+
+void content::print(std::ostream& os, const xmlns_context& /*cxt*/) const
+{
+ os << '"';
+ escape(os, value);
+ os << '"';
+}
+
+content::~content() = default;
+
+} // anonymous namespace
+
+entity_name::entity_name() : ns(XMLNS_UNKNOWN_ID) {}
+
+entity_name::entity_name(std::string_view _name) :
+ ns(XMLNS_UNKNOWN_ID), name(_name) {}
+
+entity_name::entity_name(xmlns_id_t _ns, std::string_view _name) :
+ ns(_ns), name(_name) {}
+
+bool entity_name::operator== (const entity_name& other) const
+{
+ return ns == other.ns && name == other.name;
+}
+
+bool entity_name::operator!= (const entity_name& other) const
+{
+ return !operator==(other);
+}
+
+struct const_node::impl
+{
+ node_t type;
+
+ union
+ {
+ const dom::declaration* decl;
+ const dom::element* elem;
+
+ struct
+ {
+ const char* p;
+ size_t n;
+
+ } str;
+
+ } value;
+
+ impl() : type(node_t::unset) {}
+
+ impl(const impl& other) : type(other.type)
+ {
+ switch (type)
+ {
+ case node_t::declaration:
+ value.decl = other.value.decl;
+ break;
+ case node_t::element:
+ value.elem = other.value.elem;
+ break;
+ case node_t::unset:
+ default:
+ ;
+ }
+ }
+
+ impl(const dom::element* _elem) : type(node_t::element)
+ {
+ value.elem = _elem;
+ }
+
+ impl(const dom::declaration* _decl) : type(node_t::declaration)
+ {
+ value.decl = _decl;
+ }
+};
+
+const_node::const_node(std::unique_ptr<impl>&& _impl) : mp_impl(std::move(_impl)) {}
+const_node::const_node() : mp_impl(std::make_unique<impl>()) {}
+const_node::const_node(const const_node& other) : mp_impl(std::make_unique<impl>(*other.mp_impl)) {}
+const_node::const_node(const_node&& other) : mp_impl(std::move(other.mp_impl)) {}
+const_node::~const_node() {}
+
+node_t const_node::type() const
+{
+ return mp_impl->type;
+}
+
+size_t const_node::child_count() const
+{
+ switch (mp_impl->type)
+ {
+ case node_t::element:
+ {
+ const dom::element* p = mp_impl->value.elem;
+ return p->child_elem_positions.size();
+ }
+ default:
+ ;
+ }
+
+ return 0;
+}
+
+const_node const_node::child(size_t index) const
+{
+ switch (mp_impl->type)
+ {
+ case node_t::element:
+ {
+ const dom::element* p = mp_impl->value.elem;
+
+ size_t elem_pos = p->child_elem_positions.at(index);
+ assert(elem_pos < p->child_nodes.size());
+
+ const dom::node* child_node = p->child_nodes[elem_pos].get();
+ assert(child_node->type == node_type::element);
+
+ auto v = std::make_unique<impl>(static_cast<const dom::element*>(child_node));
+ return const_node(std::move(v));
+ }
+ default:
+ ;
+ }
+ return const_node();
+}
+
+entity_name const_node::name() const
+{
+ switch (mp_impl->type)
+ {
+ case node_t::element:
+ {
+ const dom::element* p = mp_impl->value.elem;
+ return p->name;
+ }
+ default:
+ ;
+ }
+
+ return entity_name();
+}
+
+std::string_view const_node::attribute(const entity_name& name) const
+{
+ switch (mp_impl->type)
+ {
+ case node_t::element:
+ {
+ const dom::element* p = mp_impl->value.elem;
+ auto it = p->attr_map.find(name);
+ if (it == p->attr_map.end())
+ break;
+
+ size_t pos = it->second;
+ assert(pos < p->attrs.size());
+ return p->attrs[pos].value;
+ }
+ default:
+ ;
+ }
+
+ return std::string_view();
+}
+
+std::string_view const_node::attribute(std::string_view name) const
+{
+ switch (mp_impl->type)
+ {
+ case node_t::declaration:
+ {
+ const dom::declaration* p = mp_impl->value.decl;
+ auto it = p->attr_map.find(name);
+ if (it == p->attr_map.end())
+ return std::string_view();
+
+ size_t pos = it->second;
+ assert(pos < p->attrs.size());
+ return p->attrs[pos].value;
+ }
+ default:
+ ;
+ }
+
+ return attribute(entity_name(name));
+}
+
+size_t const_node::attribute_count() const
+{
+ switch (mp_impl->type)
+ {
+ case node_t::declaration:
+ {
+ const dom::declaration* p = mp_impl->value.decl;
+ return p->attrs.size();
+ }
+ case node_t::element:
+ {
+ const dom::element* p = mp_impl->value.elem;
+ return p->attrs.size();
+ }
+ default:
+ ;
+ }
+ return 0;
+}
+
+const_node const_node::parent() const
+{
+ if (mp_impl->type != node_t::element)
+ return const_node();
+
+ const dom::element* p = mp_impl->value.elem->parent;
+ if (!p)
+ return const_node();
+
+ auto v = std::make_unique<impl>(p);
+ return const_node(std::move(v));
+}
+
+void const_node::swap(const_node& other)
+{
+ mp_impl.swap(other.mp_impl);
+}
+
+const_node& const_node::operator= (const const_node& other)
+{
+ const_node tmp(other);
+ swap(tmp);
+ return *this;
+}
+
+bool const_node::operator== (const const_node& other) const
+{
+ if (mp_impl->type != other.mp_impl->type)
+ return false;
+
+ switch (mp_impl->type)
+ {
+ case node_t::unset:
+ return true;
+ case node_t::declaration:
+ return mp_impl->value.decl == other.mp_impl->value.decl;
+ case node_t::element:
+ return mp_impl->value.elem == other.mp_impl->value.elem;
+ default:
+ ;
+ }
+
+ return false;
+}
+
+bool const_node::operator!= (const const_node& other) const
+{
+ return !operator==(other);
+}
+
+/**
+ * This impl class also serves as the handler for the sax_ns_parser.
+ */
+struct document_tree::impl
+{
+ typedef std::vector<dom::element*> element_stack_type;
+ typedef std::unordered_map<std::string_view, dom::declaration> declarations_type;
+
+ xmlns_context& m_ns_cxt;
+ string_pool m_pool;
+
+ std::unique_ptr<sax::doctype_declaration> m_doctype;
+
+ std::string_view m_cur_decl_name;
+ declarations_type m_decls;
+ dom::attrs_type m_doc_attrs;
+ dom::attrs_type m_cur_attrs;
+ dom::attr_map_type m_cur_attr_map;
+ element_stack_type m_elem_stack;
+ std::unique_ptr<dom::element> m_root;
+
+ impl(xmlns_context& cxt) : m_ns_cxt(cxt) {}
+
+ void start_declaration(std::string_view name)
+ {
+ m_cur_decl_name = name;
+ }
+
+ void end_declaration(std::string_view name);
+ void start_element(const sax_ns_parser_element& elem);
+ void end_element(const sax_ns_parser_element& elem);
+ void characters(std::string_view val, bool transient);
+ void doctype(const sax::doctype_declaration& dtd);
+
+ void attribute(std::string_view name, std::string_view val)
+ {
+ set_attribute(XMLNS_UNKNOWN_ID, name, val);
+ }
+
+ void attribute(const sax_ns_parser_attribute& attr)
+ {
+ set_attribute(attr.ns, attr.name, attr.value);
+ }
+
+ void set_attribute(xmlns_id_t ns, std::string_view name, std::string_view val);
+};
+
+void document_tree::impl::end_declaration(std::string_view name)
+{
+ assert(m_cur_decl_name == name);
+
+ dom::declaration decl;
+ decl.attrs.swap(m_cur_attrs);
+ decl.attr_map.swap(m_cur_attr_map);
+
+ declarations_type::iterator it = m_decls.find(name);
+ if (it == m_decls.end())
+ {
+ // Insert a new entry for this name.
+ std::pair<declarations_type::iterator,bool> r =
+ m_decls.insert(
+ declarations_type::value_type(
+ m_pool.intern(name).first, std::move(decl)));
+
+ if (!r.second)
+ // Insertion failed.
+ throw general_error("dom_tree::end_declaration: failed to insert a new declaration entry.");
+ }
+ else
+ it->second = std::move(decl);
+}
+
+void document_tree::impl::start_element(const sax_ns_parser_element& elem)
+{
+ xmlns_id_t ns = elem.ns;
+ std::string_view name = elem.name;
+
+ // These strings must be persistent.
+ std::string_view name_safe = m_pool.intern(name).first;
+
+ dom::element* p = nullptr;
+ if (!m_root)
+ {
+ // This must be the root element!
+ m_root = std::make_unique<dom::element>(ns, name_safe);
+ m_elem_stack.push_back(m_root.get());
+ p = m_elem_stack.back();
+ p->attrs.swap(m_cur_attrs);
+ p->attr_map.swap(m_cur_attr_map);
+ return;
+ }
+
+ // Append new element as a child element of the current element.
+ p = m_elem_stack.back();
+
+ size_t elem_pos = p->child_nodes.size();
+ p->child_elem_positions.push_back(elem_pos);
+
+ p->child_nodes.push_back(std::make_unique<dom::element>(ns, name_safe));
+ const dom::element* parent = p;
+ p = static_cast<dom::element*>(p->child_nodes.back().get());
+ p->parent = parent;
+ p->attrs.swap(m_cur_attrs);
+ p->attr_map.swap(m_cur_attr_map);
+
+ m_elem_stack.push_back(p);
+}
+
+void document_tree::impl::end_element(const sax_ns_parser_element& elem)
+{
+ xmlns_id_t ns = elem.ns;
+ std::string_view name = elem.name;
+
+ const dom::element* p = m_elem_stack.back();
+ if (p->name.ns != ns || p->name.name != name)
+ throw general_error("non-matching end element.");
+
+ m_elem_stack.pop_back();
+}
+
+void document_tree::impl::characters(std::string_view val, bool /*transient*/)
+{
+ if (m_elem_stack.empty())
+ // No root element has been encountered. Ignore this.
+ return;
+
+ std::string_view val2 = trim(val);
+ if (val2.empty())
+ return;
+
+ dom::element* p = m_elem_stack.back();
+ val2 = m_pool.intern(val2).first; // Make sure the string is persistent.
+ auto child = std::make_unique<dom::content>(val2);
+ child->parent = p;
+ p->child_nodes.push_back(std::move(child));
+}
+
+void document_tree::impl::set_attribute(xmlns_id_t ns, std::string_view name, std::string_view val)
+{
+ // These strings must be persistent.
+ std::string_view name2 = m_pool.intern(name).first;
+ std::string_view val2 = m_pool.intern(val).first;
+
+ size_t pos = m_cur_attrs.size();
+ m_cur_attrs.push_back(dom::attr(ns, name2, val2));
+ m_cur_attr_map.insert({dom::entity_name(ns, name2), pos});
+}
+
+void document_tree::impl::doctype(const sax::doctype_declaration& dtd)
+{
+ m_doctype = std::make_unique<sax::doctype_declaration>(dtd); // make a copy.
+
+ sax::doctype_declaration& this_dtd = *m_doctype;
+ string_pool& pool = m_pool;
+
+ // Intern the strings.
+ this_dtd.root_element = pool.intern(dtd.root_element).first;
+ this_dtd.fpi = pool.intern(dtd.fpi).first;
+ this_dtd.uri = pool.intern(dtd.uri).first;
+}
+
+document_tree::document_tree(xmlns_context& cxt) :
+ mp_impl(std::make_unique<impl>(cxt)) {}
+
+document_tree::document_tree(document_tree&& other) :
+ mp_impl(std::move(other.mp_impl))
+{
+ other.mp_impl = std::make_unique<impl>(mp_impl->m_ns_cxt);
+}
+
+document_tree::~document_tree() {}
+
+void document_tree::load(std::string_view strm)
+{
+ sax_ns_parser<impl> parser(strm, mp_impl->m_ns_cxt, *mp_impl);
+ parser.parse();
+}
+
+dom::const_node document_tree::root() const
+{
+ const dom::element* p = mp_impl->m_root.get();
+ auto v = std::make_unique<const_node::impl>(p);
+ return dom::const_node(std::move(v));
+}
+
+dom::const_node document_tree::declaration(std::string_view name) const
+{
+ impl::declarations_type::const_iterator it = mp_impl->m_decls.find(name);
+ if (it == mp_impl->m_decls.end())
+ return dom::const_node();
+
+ const dom::declaration* decl = &it->second;
+ auto v = std::make_unique<dom::const_node::impl>(decl);
+ return dom::const_node(std::move(v));
+}
+
+void document_tree::swap(document_tree& other)
+{
+ mp_impl.swap(other.mp_impl);
+}
+
+const sax::doctype_declaration* document_tree::get_doctype() const
+{
+ return mp_impl->m_doctype.get();
+}
+
+namespace {
+
+struct scope
+{
+ typedef std::vector<const dom::node*> nodes_type;
+ std::string name;
+ nodes_type nodes;
+ nodes_type::const_iterator current_pos;
+
+ scope(const scope&) = delete;
+ scope& operator=(const scope&) = delete;
+
+ scope(const std::string& _name, dom::node* _node) :
+ name(_name)
+ {
+ nodes.push_back(_node);
+ current_pos = nodes.begin();
+ }
+
+ scope(const std::string& _name) : name(_name) {}
+};
+
+typedef std::deque<scope> scopes_type;
+
+void print_scope(std::ostream& os, const scopes_type& scopes)
+{
+ if (scopes.empty())
+ throw general_error("scope stack shouldn't be empty while dumping tree.");
+
+ // Skip the first scope which is root.
+ scopes_type::const_iterator it = scopes.begin(), it_end = scopes.end();
+ for (++it; it != it_end; ++it)
+ os << "/" << it->name;
+}
+
+}
+
+void document_tree::dump_compact(std::ostream& os) const
+{
+ if (!mp_impl->m_root)
+ return;
+
+ // Dump namespaces first.
+ mp_impl->m_ns_cxt.dump(os);
+
+ scopes_type scopes;
+
+ scopes.emplace_back(std::string(), mp_impl->m_root.get());
+ while (!scopes.empty())
+ {
+ bool new_scope = false;
+
+ // Iterate through all elements in the current scope.
+ scope& cur_scope = scopes.back();
+ for (; cur_scope.current_pos != cur_scope.nodes.end(); ++cur_scope.current_pos)
+ {
+ const dom::node* this_node = *cur_scope.current_pos;
+ assert(this_node);
+ print_scope(os, scopes);
+ if (this_node->type == dom::node_type::content)
+ {
+ // This is a text content.
+ this_node->print(os, mp_impl->m_ns_cxt);
+ os << std::endl;
+ continue;
+ }
+
+ assert(this_node->type == dom::node_type::element);
+ const dom::element* elem = static_cast<const dom::element*>(this_node);
+ os << "/";
+ elem->print(os, mp_impl->m_ns_cxt);
+ os << std::endl;
+
+ {
+ // Dump attributes.
+ dom::attrs_type attrs = elem->attrs;
+ std::sort(attrs.begin(), attrs.end(),
+ [](const dom::attr& left, const dom::attr& right) -> bool
+ {
+ return left.name.name < right.name.name;
+ }
+ );
+
+ for (const dom::attr& a : attrs)
+ {
+ print_scope(os, scopes);
+ os << "/";
+ elem->print(os, mp_impl->m_ns_cxt);
+ os << "@";
+ dom::print(os, a, mp_impl->m_ns_cxt);
+ os << std::endl;
+ }
+ }
+
+ if (elem->child_nodes.empty())
+ continue;
+
+ // This element has child nodes. Push a new scope and populate it
+ // with all child elements, but skip content nodes.
+ scope::nodes_type nodes;
+ for (const std::unique_ptr<dom::node>& p : elem->child_nodes)
+ nodes.push_back(p.get());
+
+ assert(!nodes.empty());
+
+ // Push a new scope, and restart the loop with the new scope.
+ ++cur_scope.current_pos;
+ std::ostringstream elem_name;
+ elem->print(elem_name, mp_impl->m_ns_cxt);
+ scopes.emplace_back(elem_name.str());
+ scope& child_scope = scopes.back();
+ child_scope.nodes.swap(nodes);
+ child_scope.current_pos = child_scope.nodes.begin();
+
+ new_scope = true;
+ break;
+ }
+
+ if (new_scope)
+ continue;
+
+ scopes.pop_back();
+ }
+}
+
+} // namespace dom
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */