summary | refs | log | tree | commit | diff | stats
path: root/src/parser/xml_namespace.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/xml_namespace.cpp')
-rw-r--r-- src/parser/xml_namespace.cpp 490
1 files changed, 490 insertions, 0 deletions
diff --git a/src/parser/xml_namespace.cpp b/src/parser/xml_namespace.cpp
new file mode 100644
index 0000000..2aafea3
--- /dev/null
+++ b/src/parser/xml_namespace.cpp
@@ -0,0 +1,490 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <orcus/xml_namespace.hpp>
+#include <orcus/exception.hpp>
+#include <orcus/string_pool.hpp>
+
+#include <unordered_map>
+#include <vector>
+#include <limits>
+#include <sstream>
+#include <algorithm>
+#include <cassert>
+
+#define ORCUS_DEBUG_XML_NAMESPACE 0
+
+using namespace std;
+
+#if ORCUS_DEBUG_XML_NAMESPACE
+#include <cstdio>
+#include <iostream>
+#endif
+
+namespace orcus {
+
+namespace {
+
+#if ORCUS_DEBUG_XML_NAMESPACE
+/**
+ * Debug helper: write every key of a map-like container to stdout in the
+ * form "keys: ('k1' 'k2' ...)".  No trailing newline is written.
+ *
+ * @param map_store container whose elements expose a streamable `first`.
+ */
+template<typename MapType>
+void print_map_keys(const MapType& map_store)
+{
+    cout << "keys: (";
+    // The separator is empty before the first key and a single space after.
+    const char* sep = "";
+    for (const auto& entry : map_store)
+    {
+        cout << sep << "'" << entry.first << "'";
+        sep = " ";
+    }
+    cout << ")";
+}
+#endif
+
+}
+
+using strid_map_type = std::unordered_map<std::string_view, std::size_t>;
+
+/**
+ * Private state of xmlns_repository.
+ */
+struct xmlns_repository::impl
+{
+    /// number of entries registered through add_predefined_values().
+    size_t m_predefined_ns_size = 0;
+    /// storage of live string instances.
+    string_pool m_pool;
+    /// namespace strings ordered by their numerical identifier.
+    std::vector<std::string_view> m_identifiers;
+    /// string-to-numerical identifiers map for quick lookup.
+    strid_map_type m_strid_map;
+};
+
+/** Create an empty repository with freshly allocated private state. */
+xmlns_repository::xmlns_repository() :
+    mp_impl(std::make_unique<impl>())
+{
+}
+
+/**
+ * Take over the private state of another repository.  The moved-from
+ * repository is left without state (its mp_impl becomes null).
+ */
+xmlns_repository::xmlns_repository(xmlns_repository&& other) :
+    mp_impl(std::move(other.mp_impl))
+{
+}
+
+xmlns_repository::~xmlns_repository() = default;
+
+/**
+ * Move-assign: release the current private state and adopt the state of
+ * the other repository, which is left without state afterwards.
+ */
+xmlns_repository& xmlns_repository::operator= (xmlns_repository&& other)
+{
+    mp_impl = std::move(other.mp_impl);
+
+    return *this;
+}
+
+/**
+ * Register a namespace URI and return its identifier.
+ *
+ * A URI that is already known yields the identifier stored for it.  A new
+ * URI is copied into the string pool and assigned the next numerical index.
+ *
+ * @param uri namespace URI to register.
+ *
+ * @return pointer to the stored URI string, or XMLNS_UNKNOWN_ID when the
+ *         pooled string is empty or the pool raises general_error.
+ */
+xmlns_id_t xmlns_repository::intern(std::string_view uri)
+{
+    impl& state = *mp_impl;
+
+    // Fast path: the uri is already registered.
+    if (auto known = state.m_strid_map.find(uri); known != state.m_strid_map.end())
+        return known->first.data();
+
+    try
+    {
+        const auto pooled = state.m_pool.intern(uri);
+        const std::string_view uri_interned = pooled.first;
+
+        if (uri_interned.empty())
+            // Nothing usable came back from the pool.
+            return XMLNS_UNKNOWN_ID;
+
+        if (pooled.second)
+        {
+            // Brand-new pool entry: give it the next numerical identifier.
+            const std::size_t next_index = state.m_identifiers.size();
+            state.m_strid_map.insert(
+                strid_map_type::value_type(uri_interned, next_index));
+#if ORCUS_DEBUG_XML_NAMESPACE
+            cout << "xmlns_repository::intern: uri='" << uri_interned << "' (" << next_index << ")" << endl;
+#endif
+            state.m_identifiers.push_back(uri_interned);
+
+#if ORCUS_DEBUG_XML_NAMESPACE
+            cout << "pool size=" << state.m_pool.size() << ", predefined ns size=" << state.m_predefined_ns_size <<
+                ", identifiers size=" << state.m_identifiers.size() << ", map size=" << state.m_strid_map.size() << endl;
+#endif
+            // Pooled plus predefined entries must account for every identifier.
+            assert(state.m_pool.size()+state.m_predefined_ns_size == state.m_identifiers.size());
+            assert(state.m_pool.size()+state.m_predefined_ns_size == state.m_strid_map.size());
+        }
+
+        return uri_interned.data();
+    }
+    catch (const general_error&)
+    {
+        // Deliberately swallowed; reported to the caller as an unknown id.
+    }
+
+    return XMLNS_UNKNOWN_ID;
+}
+
+/**
+ * Register a set of predefined namespace values.
+ *
+ * The strings are not copied: the repository stores views over them, so
+ * they must stay alive for as long as the repository is in use.
+ *
+ * A value whose URI is already registered (either predefined earlier or
+ * interned) is skipped, so that the identifier list, the lookup map and the
+ * predefined counter stay in sync with each other.
+ *
+ * @param predefined_ns null-terminated array of namespace identifiers; a
+ *                      null array pointer is accepted and ignored.
+ */
+void xmlns_repository::add_predefined_values(const xmlns_id_t* predefined_ns)
+{
+    if (!predefined_ns)
+        return;
+
+    for (const xmlns_id_t* val = predefined_ns; *val; ++val)
+    {
+        std::string_view s(*val);
+        const bool inserted = mp_impl->m_strid_map.insert(
+            strid_map_type::value_type(s, mp_impl->m_identifiers.size())).second;
+
+        if (!inserted)
+            // Already registered.  Appending it again would leave the
+            // identifier list longer than the lookup map.
+            continue;
+
+        mp_impl->m_identifiers.push_back(s);
+
+        ++mp_impl->m_predefined_ns_size;
+
+#if ORCUS_DEBUG_XML_NAMESPACE
+        cout << "xlmns_repository: predefined ns='" << s << "'" << endl;
+#endif
+    }
+}
+
+/**
+ * Create a namespace context bound to this repository.
+ *
+ * @return new context that refers back to this repository instance.
+ */
+xmlns_context xmlns_repository::create_context()
+{
+    return xmlns_context(*this);
+}
+
+/**
+ * Look up a namespace identifier by its numerical index.
+ *
+ * @param index position of the namespace in registration order.
+ *
+ * @return the identifier at that position, or XMLNS_UNKNOWN_ID when the
+ *         index is out of range.
+ */
+xmlns_id_t xmlns_repository::get_identifier(size_t index) const
+{
+    const std::vector<std::string_view>& ids = mp_impl->m_identifiers;
+
+    if (index < ids.size())
+    {
+        // All identifier strings are interned which means they are all null-terminated.
+        return ids[index].data();
+    }
+
+    return XMLNS_UNKNOWN_ID;
+}
+
+/**
+ * Build a short display name for a namespace identifier.
+ *
+ * @param ns_id namespace identifier to name.
+ *
+ * @return "ns" followed by the numerical index of the namespace, or "???"
+ *         when the identifier is not registered in this repository.
+ */
+string xmlns_repository::get_short_name(xmlns_id_t ns_id) const
+{
+    const size_t index = get_index(ns_id);
+
+    if (index != INDEX_NOT_FOUND)
+    {
+        std::ostringstream os;
+        os << "ns" << index;
+        return os.str();
+    }
+
+    return string("???");
+}
+
+/**
+ * Look up the numerical index of a namespace identifier.
+ *
+ * The lookup is by string content, not by pointer value.
+ *
+ * @param ns_id namespace identifier; may be null.
+ *
+ * @return the index, or INDEX_NOT_FOUND when the identifier is null or is
+ *         not registered.
+ */
+size_t xmlns_repository::get_index(xmlns_id_t ns_id) const
+{
+    if (ns_id)
+    {
+        const strid_map_type& ids = mp_impl->m_strid_map;
+        if (auto pos = ids.find(std::string_view(ns_id)); pos != ids.end())
+            return pos->second;
+    }
+
+    return INDEX_NOT_FOUND;
+}
+
+using xmlns_list_type = std::vector<xmlns_id_t>;
+using alias_map_type = std::unordered_map<std::string_view, xmlns_list_type>;
+
+/**
+ * Private state of xmlns_context.
+ */
+struct xmlns_context::impl
+{
+    /// repository this context resolves against; null when unbound.
+    xmlns_repository* repo = nullptr;
+    /// all namespaces ever used in this context.
+    xmlns_list_type m_all_ns;
+    /// stack of namespaces pushed with an empty alias.
+    xmlns_list_type m_default;
+    /// per-alias namespace stacks.
+    alias_map_type m_map;
+
+    /// true when m_all_ns needs de-duplicating and re-ordering.
+    bool m_trim_all_ns = true;
+
+    impl() = default;
+    impl(xmlns_repository& _repo) : repo(&_repo) {}
+    impl(const impl& r) = default;
+};
+
+/** Create a context that is not bound to any repository. */
+xmlns_context::xmlns_context() :
+    mp_impl(std::make_unique<impl>())
+{
+}
+
+/** Create a context bound to the given repository. */
+xmlns_context::xmlns_context(xmlns_repository& repo) :
+    mp_impl(std::make_unique<impl>(repo))
+{
+}
+
+/** Deep-copy the private state of another context. */
+xmlns_context::xmlns_context(const xmlns_context& r) :
+    mp_impl(std::make_unique<impl>(*r.mp_impl))
+{
+}
+
+/**
+ * Take over the state of another context.  The moved-from context is given
+ * a fresh, unbound state so that it remains usable.
+ */
+xmlns_context::xmlns_context(xmlns_context&& r) :
+    mp_impl(std::move(r.mp_impl))
+{
+    r.mp_impl = std::make_unique<impl>();
+}
+
+xmlns_context::~xmlns_context() = default;
+
+/**
+ * Copy-assign via copy-and-swap: the copy is built first, then its state is
+ * exchanged with this context.
+ */
+xmlns_context& xmlns_context::operator= (const xmlns_context& r)
+{
+    xmlns_context(r).swap(*this);
+    return *this;
+}
+
+/**
+ * Move-assign via move-and-swap: the source is moved into a temporary whose
+ * state is then exchanged with this context.
+ */
+xmlns_context& xmlns_context::operator= (xmlns_context&& r)
+{
+    xmlns_context(std::move(r)).swap(*this);
+    return *this;
+}
+
+/**
+ * Push a namespace onto the stack associated with an alias.
+ *
+ * The URI is interned in the repository first.  When interning fails the
+ * pushed identifier is null.
+ *
+ * NOTE(review): a new alias is stored as a non-owning view key in the alias
+ * map, so the caller presumably has to keep the alias string alive while
+ * the context uses it - confirm against callers.
+ *
+ * @param alias namespace alias; an empty alias denotes the default
+ *              namespace.
+ * @param uri namespace URI to associate with the alias.
+ *
+ * @return identifier of the namespace that was pushed.
+ *
+ * @throws general_error when the context is not bound to a repository, or
+ *         when a new alias entry cannot be inserted.
+ */
+xmlns_id_t xmlns_context::push(std::string_view alias, std::string_view uri)
+{
+    if (!mp_impl->repo)
+        throw general_error("this context is not associated with any repo.");
+
+#if ORCUS_DEBUG_XML_NAMESPACE
+    cout << "xmlns_context::push: key='" << alias << "', uri='" << uri << "'" << endl;
+#endif
+    // The list of all namespaces is about to change; have it re-trimmed.
+    mp_impl->m_trim_all_ns = true;
+
+    // Null when the repository could not intern the uri.
+    const xmlns_id_t ns = mp_impl->repo->intern(uri);
+
+    if (alias.empty())
+    {
+        // empty alias value is associated with default namespace.
+        mp_impl->m_default.push_back(ns);
+        mp_impl->m_all_ns.push_back(ns);
+        return mp_impl->m_default.back();
+    }
+
+    auto pos = mp_impl->m_map.find(alias);
+    if (pos != mp_impl->m_map.end())
+    {
+        // The alias already exists; stack the namespace on top of it.
+        xmlns_list_type& stack = pos->second;
+        stack.push_back(ns);
+        mp_impl->m_all_ns.push_back(ns);
+        return stack.back();
+    }
+
+    // This is the first time this alias is used.
+    xmlns_list_type nslist;
+    nslist.push_back(ns);
+    mp_impl->m_all_ns.push_back(ns);
+
+    const auto inserted = mp_impl->m_map.insert(alias_map_type::value_type(alias, nslist));
+    if (!inserted.second)
+        // insertion failed.
+        throw general_error("Failed to insert new namespace.");
+
+    return nslist.back();
+}
+
+/**
+ * Pop the most recently pushed namespace off the stack of an alias.
+ *
+ * @param alias namespace alias; an empty alias denotes the default
+ *              namespace.
+ *
+ * @throws general_error when the alias is unknown or its stack is empty.
+ */
+void xmlns_context::pop(std::string_view alias)
+{
+#if ORCUS_DEBUG_XML_NAMESPACE
+    cout << "xmlns_context::pop: alias='" << alias << "'" << endl;
+#endif
+    if (alias.empty())
+    {
+        // empty alias value is associated with default namespace.
+        xmlns_list_type& defaults = mp_impl->m_default;
+        if (defaults.empty())
+            throw general_error("default namespace stack is empty.");
+
+        defaults.pop_back();
+        return;
+    }
+
+    // See if this alias really exists.
+    auto pos = mp_impl->m_map.find(alias);
+    if (pos == mp_impl->m_map.end())
+    {
+        std::ostringstream os;
+        os << "alias named '" << alias << "' was attempted to be popped, but was not found in the stack";
+        throw general_error(os.str());
+    }
+
+    if (pos->second.empty())
+        throw general_error("namespace stack for this key is empty.");
+
+    pos->second.pop_back();
+}
+
+/**
+ * Get the namespace currently associated with an alias.
+ *
+ * @param alias namespace alias; an empty alias denotes the default
+ *              namespace.
+ *
+ * @return identifier on top of the alias' stack, or XMLNS_UNKNOWN_ID when
+ *         the alias is unknown or its stack is empty.
+ */
+xmlns_id_t xmlns_context::get(std::string_view alias) const
+{
+#if ORCUS_DEBUG_XML_NAMESPACE
+    cout << "xmlns_context::get: alias='" << alias << "', default ns stack size="
+        << mp_impl->m_default.size() << ", non-default alias count=" << mp_impl->m_map.size();
+    cout << ", ";
+    print_map_keys(mp_impl->m_map);
+    cout << endl;
+#endif
+    if (alias.empty())
+    {
+        const xmlns_list_type& defaults = mp_impl->m_default;
+        if (defaults.empty())
+            return XMLNS_UNKNOWN_ID;
+
+        return defaults.back();
+    }
+
+    const auto pos = mp_impl->m_map.find(alias);
+    if (pos == mp_impl->m_map.end())
+    {
+#if ORCUS_DEBUG_XML_NAMESPACE
+        cout << "xmlns_context::get: alias not in this context" << endl;
+#endif
+        return XMLNS_UNKNOWN_ID;
+    }
+
+    const xmlns_list_type& stack = pos->second;
+#if ORCUS_DEBUG_XML_NAMESPACE
+    cout << "xmlns_context::get: alias stack size=" << stack.size() << endl;
+#endif
+    return stack.empty() ? XMLNS_UNKNOWN_ID : stack.back();
+}
+
+/**
+ * Get the numerical index of a namespace identifier from the repository
+ * this context is bound to.
+ *
+ * @throws general_error when the context is not bound to a repository.
+ */
+size_t xmlns_context::get_index(xmlns_id_t ns_id) const
+{
+    const xmlns_repository* bound_repo = mp_impl->repo;
+    if (!bound_repo)
+        throw general_error("this context is not associated with any repo.");
+
+    return bound_repo->get_index(ns_id);
+}
+
+/**
+ * Get the short display name of a namespace identifier from the repository
+ * this context is bound to.
+ *
+ * @throws general_error when the context is not bound to a repository.
+ */
+string xmlns_context::get_short_name(xmlns_id_t ns_id) const
+{
+    const xmlns_repository* bound_repo = mp_impl->repo;
+    if (!bound_repo)
+        throw general_error("this context is not associated with any repo.");
+
+    return bound_repo->get_short_name(ns_id);
+}
+
+/**
+ * Find an alias whose current (top-of-stack) namespace is the given one.
+ *
+ * Identifiers are compared by pointer value.  The default namespace stack
+ * is not searched.
+ *
+ * @return the first matching alias encountered, or an empty view when no
+ *         alias currently maps to the namespace.
+ */
+std::string_view xmlns_context::get_alias(xmlns_id_t ns_id) const
+{
+    for (const auto& [alias, ns_stack] : mp_impl->m_map)
+    {
+        if (!ns_stack.empty() && ns_stack.back() == ns_id)
+            return alias;
+    }
+
+    return std::string_view{};
+}
+
+namespace {
+
+#if ORCUS_DEBUG_XML_NAMESPACE
+/**
+ * Debug functor: print the address and the text of a namespace identifier
+ * to stdout, one per line.
+ */
+struct print_ns
+{
+    void operator() (xmlns_id_t ns_id) const
+    {
+        const char* p = ns_id;
+        // %p requires a void pointer, and %s must never receive a null
+        // pointer; null identifiers can be present in the context's list.
+        printf("%p: %s\n", static_cast<const void*>(p), p ? p : "(null)");
+    }
+};
+#endif
+
+/** Pairs a namespace identifier with its numerical index for sorting. */
+struct ns_item
+{
+    size_t index;   ///< numerical index of the namespace.
+    xmlns_id_t ns;  ///< the namespace identifier itself.
+
+    ns_item(size_t idx, xmlns_id_t id) :
+        index(idx), ns(id)
+    {
+    }
+};
+
+/** Strict weak ordering of ns_item instances by ascending index. */
+struct less_ns_by_index
+{
+    bool operator() (const ns_item& lhs, const ns_item& rhs) const
+    {
+        return lhs.index < rhs.index;
+    }
+};
+
+/**
+ * Functor that resolves a namespace identifier to its index through a
+ * context and appends the pair to a store.  Identifiers without an index
+ * are dropped.
+ */
+class push_back_ns_to_item
+{
+    std::vector<ns_item>& m_store;
+    const xmlns_context& m_cxt;
+public:
+    push_back_ns_to_item(std::vector<ns_item>& store, const xmlns_context& cxt) :
+        m_store(store), m_cxt(cxt)
+    {
+    }
+
+    void operator() (xmlns_id_t ns)
+    {
+        const size_t pos = m_cxt.get_index(ns);
+        if (pos == INDEX_NOT_FOUND)
+            return;
+
+        m_store.push_back(ns_item(pos, ns));
+    }
+};
+
+/** Functor that appends the identifier of each ns_item to a store. */
+class push_back_item_to_ns
+{
+    std::vector<xmlns_id_t>& m_store;
+public:
+    push_back_item_to_ns(std::vector<xmlns_id_t>& store) :
+        m_store(store)
+    {
+    }
+
+    void operator() (const ns_item& item)
+    {
+        m_store.push_back(item.ns);
+    }
+};
+
+}
+
+/**
+ * Get every namespace that has been pushed to this context.
+ *
+ * The result contains no duplicate identifiers and is ordered by numerical
+ * index.  Identifiers that have no index in the repository are omitted.
+ *
+ * Although this method is const, it rewrites the cached namespace list in
+ * the private state when the list has changed since the last call.
+ *
+ * @return copy of the trimmed namespace list.
+ *
+ * @throws general_error when namespaces have been recorded but the context
+ *         is not bound to a repository.
+ */
+std::vector<xmlns_id_t> xmlns_context::get_all_namespaces() const
+{
+#if ORCUS_DEBUG_XML_NAMESPACE
+    cout << "xmlns_context::get_all_namespaces: count=" << mp_impl->m_all_ns.size() << endl;
+    std::for_each(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end(), print_ns());
+#endif
+
+    if (mp_impl->m_trim_all_ns)
+    {
+        xmlns_list_type& all_ns = mp_impl->m_all_ns;
+
+        // Sort it and remove duplicate.
+        std::sort(all_ns.begin(), all_ns.end());
+        all_ns.erase(std::unique(all_ns.begin(), all_ns.end()), all_ns.end());
+
+        // Now, sort by indices.
+        std::vector<ns_item> items;
+        items.reserve(all_ns.size());
+        std::for_each(all_ns.begin(), all_ns.end(), push_back_ns_to_item(items, *this));
+        std::sort(items.begin(), items.end(), less_ns_by_index());
+
+        all_ns.clear();
+        std::for_each(items.begin(), items.end(), push_back_item_to_ns(all_ns));
+
+        mp_impl->m_trim_all_ns = false;
+    }
+
+    // Hand out a copy; the cached list stays with the context.
+    return mp_impl->m_all_ns;
+}
+
+/**
+ * Write every namespace of this context to a stream, one per line, in the
+ * form ns<index>="<uri>".  Namespaces without an index are skipped.
+ *
+ * @param os output stream to write to.
+ */
+void xmlns_context::dump(std::ostream& os) const
+{
+    for (xmlns_id_t ns_id : get_all_namespaces())
+    {
+        const size_t num_id = get_index(ns_id);
+        if (num_id != INDEX_NOT_FOUND)
+            os << "ns" << num_id << "=\"" << ns_id << '"' << endl;
+    }
+}
+
+/**
+ * Write the state of this context to a stream: first the list of all
+ * namespaces with their indices, then every alias with its namespace
+ * stack.
+ *
+ * An alias stack may hold null identifiers (pushed when the URI could not
+ * be interned); these are written as "(null)" because streaming a null
+ * character pointer is not permitted.
+ *
+ * @param os output stream to write to.
+ */
+void xmlns_context::dump_state(std::ostream& os) const
+{
+    os << "namespaces:" << std::endl;
+    for (xmlns_id_t ns_id : get_all_namespaces())
+    {
+        size_t num_id = get_index(ns_id);
+        if (num_id == INDEX_NOT_FOUND)
+            continue;
+
+        os << "  ns" << num_id << ": \"" << ns_id << '"' << std::endl;
+    }
+
+    os << "aliases:" << std::endl;
+    for (const auto& [alias, ns_list] : mp_impl->m_map)
+    {
+        os << "  " << alias << ":" << std::endl;
+
+        for (const xmlns_id_t ns : ns_list)
+            os << "    - " << (ns ? ns : "(null)") << std::endl;
+    }
+}
+
+/**
+ * Exchange the private state of this context with that of another one.
+ */
+void xmlns_context::swap(xmlns_context& other) noexcept
+{
+    using std::swap;
+    swap(mp_impl, other.mp_impl);
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */