summary refs log tree commit diff stats
path: root/src/parser/sax_parser_base.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/sax_parser_base.cpp')
-rw-r--r-- src/parser/sax_parser_base.cpp 421
1 files changed, 421 insertions, 0 deletions
diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp
new file mode 100644
index 0000000..58f750e
--- /dev/null
+++ b/src/parser/sax_parser_base.cpp
@@ -0,0 +1,421 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "orcus/sax_parser_base.hpp"
+
+#include "utf8.hpp"
+
+#include <cstring>
+#include <vector>
+#include <memory>
+
+#ifdef __ORCUS_CPU_FEATURES
+#include <immintrin.h>
+#endif
+
+namespace orcus { namespace sax {
+
+/**
+ * Translate the name of a predefined XML entity into the character it
+ * stands for.
+ *
+ * @param p pointer to the first character of the entity name, i.e. the text
+ *          found between '&' and ';'.  At most n characters are compared.
+ * @param n length of the entity name.
+ *
+ * @return the decoded character for "lt", "gt", "amp", "apos" and "quot",
+ *         or '\0' when the name is none of these.
+ */
+char decode_xml_encoded_char(const char* p, size_t n)
+{
+    switch (n)
+    {
+        case 2:
+            if (std::strncmp(p, "lt", 2) == 0)
+                return '<';
+            if (std::strncmp(p, "gt", 2) == 0)
+                return '>';
+            break;
+        case 3:
+            if (std::strncmp(p, "amp", 3) == 0)
+                return '&';
+            break;
+        case 4:
+            if (std::strncmp(p, "apos", 4) == 0)
+                return '\'';
+            if (std::strncmp(p, "quot", 4) == 0)
+                return '"';
+            break;
+        default:
+            break;
+    }
+
+    // Not one of the five predefined entity names.
+    return '\0';
+}
+
+/**
+ * Decode a numeric character reference into its UTF-8 byte sequence.
+ *
+ * @param p pointer to the text found between '&' and ';', e.g. "#38" or
+ *          "#x26".
+ * @param n number of characters in that text.
+ *
+ * @return the UTF-8 encoding of the referenced code point, or an empty
+ *         string when the text is not a numeric reference, contains a
+ *         character that is not a digit of the expected base, or names a
+ *         value that cannot be encoded as UTF-8 (a surrogate, or a value
+ *         above U+10FFFF).
+ *
+ * @throws orcus::xml_structure_error when a hexadecimal reference has no
+ *         digits at all, i.e. the text is exactly "#x".
+ */
+std::string decode_xml_unicode_char(const char* p, size_t n)
+{
+    // Test the length first so that p is never dereferenced for empty input.
+    if (n < 2 || *p != '#')
+        return std::string();
+
+    const char* pos = p + 1;
+    const char* const end = p + n;
+    uint32_t base = 10;
+
+    if (*pos == 'x')
+    {
+        if (n == 2)
+            throw orcus::xml_structure_error(
+                "invalid number of characters for hexadecimal unicode reference");
+
+        base = 16;
+        ++pos;
+    }
+
+    // Accumulate the digits by hand.  Unlike std::stoi this never throws a
+    // standard-library exception on malformed input, rejects signs, white
+    // space and trailing garbage, and cannot overflow because it gives up as
+    // soon as the value leaves the Unicode range.
+    uint32_t point = 0;
+    for (; pos != end; ++pos)
+    {
+        const char c = *pos;
+        uint32_t digit = 0;
+
+        if ('0' <= c && c <= '9')
+            digit = static_cast<uint32_t>(c - '0');
+        else if (base == 16 && 'a' <= c && c <= 'f')
+            digit = static_cast<uint32_t>(c - 'a') + 10;
+        else if (base == 16 && 'A' <= c && c <= 'F')
+            digit = static_cast<uint32_t>(c - 'A') + 10;
+        else
+            return std::string();
+
+        point = point * base + digit;
+        if (point > 0x10FFFF)
+            return std::string();
+    }
+
+    // Surrogate code points have no valid UTF-8 representation.
+    if (0xD800 <= point && point <= 0xDFFF)
+        return std::string();
+
+    std::string utf8;
+
+    if (point < 0x80)
+    {
+        // Single byte; the value already fits in 7 bits.
+        utf8 += static_cast<char>(point);
+    }
+    else if (point < 0x0800)
+    {
+        utf8 += static_cast<char>((point >> 6 & 0x1F) | 0xC0);
+        utf8 += static_cast<char>((point & 0x3F) | 0x80);
+    }
+    else if (point < 0x010000)
+    {
+        utf8 += static_cast<char>((point >> 12 & 0x0F) | 0xE0);
+        utf8 += static_cast<char>((point >> 6 & 0x3F) | 0x80);
+        utf8 += static_cast<char>((point & 0x3F) | 0x80);
+    }
+    else
+    {
+        // point <= 0x10FFFF is guaranteed by the check in the digit loop.
+        utf8 += static_cast<char>((point >> 18 & 0x07) | 0xF0);
+        utf8 += static_cast<char>((point >> 12 & 0x3F) | 0x80);
+        utf8 += static_cast<char>((point >> 6 & 0x3F) | 0x80);
+        utf8 += static_cast<char>((point & 0x3F) | 0x80);
+    }
+
+    return utf8;
+}
+
+/**
+ * Private implementation data of sax::parser_base.
+ */
+struct parser_base::impl
+{
+    using buffer_store_type = std::vector<std::unique_ptr<cell_buffer>>;
+
+    /**
+     * Heap-allocated cell buffers.  The constructor adds the first one and
+     * inc_buffer_pos() appends more on demand.
+     */
+    buffer_store_type m_cell_buffers;
+};
+
+/**
+ * Set up the parser over the given character stream and create the first
+ * cell buffer, so that get_cell_buffer() has a buffer at position 0.
+ *
+ * @param content pointer to the stream; forwarded to ::orcus::parser_base.
+ * @param size length of the stream; forwarded to ::orcus::parser_base.
+ */
+parser_base::parser_base(const char* content, size_t size) :
+    ::orcus::parser_base(content, size),
+    mp_impl{std::make_unique<impl>()},
+    m_nest_level{0},
+    m_buffer_pos{0},
+    m_root_elem_open{true}
+{
+    auto& buffers = mp_impl->m_cell_buffers;
+    buffers.emplace_back(std::make_unique<cell_buffer>());
+}
+
+// Nothing to release by hand; the members clean up after themselves.
+// Defined in this file, where impl is a complete type.
+parser_base::~parser_base() = default;
+
+/**
+ * Move on to the next cell buffer position, allocating a new buffer when
+ * the position reaches the number of buffers that exist so far.
+ */
+void parser_base::inc_buffer_pos()
+{
+    auto& buffers = mp_impl->m_cell_buffers;
+
+    if (++m_buffer_pos != buffers.size())
+        return;
+
+    buffers.push_back(std::make_unique<cell_buffer>());
+}
+
+/**
+ * @return reference to the cell buffer stored at the current buffer
+ *         position.
+ */
+cell_buffer& parser_base::get_cell_buffer()
+{
+    const auto& slot = mp_impl->m_cell_buffers[m_buffer_pos];
+    return *slot;
+}
+
+/**
+ * Scan forward to the first "--" and require it to be immediately followed
+ * by '>'.  On success the current position is left one past that '>'.
+ *
+ * @throws malformed_xml_error when "--" is not immediately followed by '>',
+ *         or when no "-->" is found within the available characters.
+ */
+void parser_base::comment()
+{
+    // Parse until we reach '-->'.
+    const size_t len = available_size();
+    assert(len > 3);
+    size_t i = 0;
+    bool hyphen = false;
+
+    // Advance with next() and read with cur_char() only while i < len.  The
+    // loop therefore never fetches a character after the last available one
+    // when the comment is not terminated.
+    // NOTE(review): assumes available_size() is the number of characters
+    // from the current position to the end of the stream -- confirm.
+    for (; i < len; ++i, next())
+    {
+        if (cur_char() != '-')
+        {
+            hyphen = false;
+            continue;
+        }
+
+        if (hyphen)
+            // second hyphen.
+            break;
+
+        // first hyphen.
+        hyphen = true;
+    }
+
+    // Either the stream ran out, or the "--" just found is not followed by
+    // the closing '>'.
+    if (len - i < 2 || next_and_char() != '>')
+        throw malformed_xml_error(
+            "'--' should not occur in comment other than in the closing tag.", offset());
+
+    // Step past the closing '>'.
+    next();
+}
+
+/**
+ * Require that the characters following the current one spell out the
+ * given string.  On success the current position is left one past the last
+ * matched character.
+ *
+ * @param p pointer to the expected characters.
+ * @param n number of expected characters.
+ *
+ * @throws malformed_xml_error when fewer than n+1 characters are available,
+ *         or when the stream differs from the expected string.
+ */
+void parser_base::expects_next(const char* p, size_t n)
+{
+    if (available_size() < n+1)
+        throw malformed_xml_error(
+            "not enough stream left to check for an expected string segment.", offset());
+
+    const char* const p_end = p + n;
+    for (const char* q = p; q != p_end; ++q)
+    {
+        // Move first, then look: only the n characters being compared are
+        // ever read.
+        next();
+        if (cur_char() == *q)
+            continue;
+
+        throw malformed_xml_error(
+            "'" + std::string(p, n) + "' was expected, but not found.", offset());
+    }
+
+    // Step past the last matched character without reading what follows it;
+    // that position may be the end of the stream.
+    // NOTE(review): assumes available_size() is the number of characters
+    // from the current position to the end of the stream -- confirm.
+    next();
+}
+
+/**
+ * Parse one character reference of the form "&name;" starting at the
+ * current position and append its decoded form to the buffer.  A reference
+ * that cannot be decoded is appended verbatim, including the leading '&'
+ * and the trailing ';'.  On return the current position is one past ';'.
+ *
+ * @param buf buffer that receives the decoded (or verbatim) text.
+ *
+ * @throws malformed_xml_error when the name between '&' and ';' is empty,
+ *         or when no ';' is found before the end of the stream.
+ */
+void parser_base::parse_encoded_char(cell_buffer& buf)
+{
+    assert(cur_char() == '&');
+
+    // Remember where the reference starts so that the fallback below can
+    // reproduce it in full.
+    const char* const p_amp = mp_char;
+    next();
+    const char* p0 = mp_char;
+    for (; has_char(); next())
+    {
+        if (cur_char() != ';')
+            continue;
+
+        size_t n = mp_char - p0;
+        if (!n)
+            throw malformed_xml_error("empty encoded character.", offset());
+
+#if ORCUS_DEBUG_SAX_PARSER
+        cout << "sax_parser::parse_encoded_char: raw='" << std::string(p0, n) << "'" << endl;
+#endif
+
+        bool decoded = false;
+
+        char c = decode_xml_encoded_char(p0, n);
+        if (c)
+        {
+            buf.append(&c, 1);
+            decoded = true;
+        }
+        else
+        {
+            const std::string utf8 = decode_xml_unicode_char(p0, n);
+
+            if (!utf8.empty())
+            {
+                buf.append(utf8.data(), utf8.size());
+                decoded = true;
+            }
+        }
+
+        // Move to the character past ';' before returning to the parent call.
+        next();
+
+        if (!decoded)
+        {
+#if ORCUS_DEBUG_SAX_PARSER
+            cout << "sax_parser::parse_encoded_char: not a known encoding name. Use the original." << endl;
+#endif
+            // Unexpected encoding name. Use the original text, starting at
+            // the '&' so that nothing from the input is lost.
+            buf.append(p_amp, mp_char-p_amp);
+        }
+
+        return;
+    }
+
+    throw malformed_xml_error(
+        "error parsing encoded character: terminating character is not found.", offset());
+}
+
+/**
+ * Read the remainder of a quoted value that contains character references,
+ * starting at a '&', and collect the decoded text in the buffer.
+ *
+ * @param buf buffer that receives the text; the caller may already have put
+ *            the part of the value preceding the first '&' into it.
+ * @param str set to the content of buf, unless buf ends up empty.
+ * @param quote_char the quote character that terminates the value.
+ */
+void parser_base::value_with_encoded_char(cell_buffer& buf, std::string_view& str, char quote_char)
+{
+    assert(cur_char() == '&');
+    parse_encoded_char(buf);
+
+    const char* p0 = mp_char;
+
+    while (has_char())
+    {
+        const char c = cur_char();
+
+        if (c == '&')
+        {
+            // Flush the literal run collected so far, then decode.
+            if (mp_char > p0)
+                buf.append(p0, mp_char-p0);
+
+            parse_encoded_char(buf);
+            p0 = mp_char;
+
+            // The reference may have been the last thing in the stream, so
+            // go back through has_char() before touching the current
+            // character again.
+            continue;
+        }
+
+        if (c == quote_char)
+            break;
+
+        next();
+    }
+
+    if (mp_char > p0)
+        buf.append(p0, mp_char-p0);
+
+    if (!buf.empty())
+        str = buf.str();
+
+    // Skip the closing quote.
+    assert(!has_char() || cur_char() == quote_char);
+    if (has_char())
+        next();
+}
+
+/**
+ * Read a quoted value starting at the opening quote and leave the current
+ * position one past the closing quote.
+ *
+ * @param str receives the value.
+ * @param decode whether a '&' inside the value starts a character reference
+ *               that is to be decoded.
+ *
+ * @return true when a '&' was met with decoding enabled and the value was
+ *         handed to value_with_encoded_char() together with the current
+ *         cell buffer; false when str refers directly to the characters
+ *         between the quotes.
+ *
+ * @throws malformed_xml_error when the current character is not a single
+ *         or double quote.
+ */
+bool parser_base::value(std::string_view& str, bool decode)
+{
+    const char quote_char = cur_char_checked();
+    if (quote_char != '"' && quote_char != '\'')
+        throw malformed_xml_error("value must be quoted", offset());
+
+    char c = next_char_checked();
+    const char* const head = mp_char;
+
+    while (c != quote_char)
+    {
+        if (decode && c == '&')
+        {
+            // This value contains one or more encoded characters.  Seed the
+            // buffer with the literal text seen so far and let the slow
+            // path take over.
+            cell_buffer& buf = get_cell_buffer();
+            buf.reset();
+            buf.append(head, mp_char-head);
+            value_with_encoded_char(buf, str, quote_char);
+            return true;
+        }
+
+        c = next_char_checked();
+    }
+
+    str = std::string_view(head, mp_char-head);
+
+    // Skip the closing quote.
+    next();
+
+    return false;
+}
+
+/**
+ * Read a name starting at the current position: one name-start character
+ * followed by any number of name characters.  The current position is left
+ * on the first character that is not part of the name.
+ *
+ * @param str receives a view of the name inside the stream.
+ *
+ * @throws malformed_xml_error when the current character cannot start a
+ *         name.
+ */
+void parser_base::name(std::string_view& str)
+{
+    const char* p0 = mp_char;
+    mp_char = parse_utf8_xml_name_start_char(mp_char, mp_end);
+    if (mp_char == p0)
+    {
+        ::std::ostringstream os;
+        os << "name must begin with an alphabet, but got this instead '" << cur_char() << "'";
+        throw malformed_xml_error(os.str(), offset());
+    }
+
+#if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__)
+
+    // Six byte ranges (a-z, A-Z, 0-9, '-', '_', '.') held in a full 16-byte
+    // array, so that the 128-bit load below stays inside the object.  A bare
+    // string literal of this text is only 13 bytes long.
+    static const char name_char_ranges[16] = "azAZ09--__..";
+    const __m128i match = _mm_loadu_si128(reinterpret_cast<const __m128i*>(name_char_ranges));
+    const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY;
+
+    size_t n_total = available_size();
+
+    while (n_total)
+    {
+        __m128i char_block;
+        if (n_total >= 16)
+            char_block = _mm_loadu_si128(reinterpret_cast<const __m128i*>(mp_char));
+        else
+        {
+            // Fewer than 16 characters are left: stage them in a local
+            // block rather than loading 16 bytes straight from the stream.
+            char tail[16] = {};
+            std::memcpy(tail, mp_char, n_total);
+            char_block = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tail));
+        }
+
+        int n = std::min<size_t>(16u, n_total);
+        int r = _mm_cmpestri(match, 12, char_block, n, mode);
+        mp_char += r; // Move the current char position.
+        n_total -= r;
+
+        if (r < 16 && n_total)
+        {
+            // There is a character that does not match the SSE-based ASCII-only check.
+            // It may either be an ascii character that is not allowed, in which case stop,
+            // or it may possibly be an allowed utf-8 character, in which case move over it
+            // using the slow function.
+
+            const char* p = parse_utf8_xml_name_char(mp_char, mp_end);
+            if (p == mp_char)
+                break;
+
+            n_total -= p - mp_char;
+            mp_char = p;
+        }
+
+    }
+    cur_char_checked(); // check end of xml stream
+
+#else
+    for(;;)
+    {
+        cur_char_checked(); // check end of xml stream
+        const char* p = parse_utf8_xml_name_char(mp_char, mp_end);
+
+        if (p == mp_char)
+            break;
+
+        mp_char = p;
+    }
+#endif
+
+    str = std::string_view(p0, mp_char-p0);
+}
+
+/**
+ * Read an element name, optionally written as "ns:name", into elem.
+ *
+ * @param elem receives begin_pos and name, and also ns when the name
+ *             carries a prefix.
+ * @param begin_pos value stored in elem.begin_pos.
+ */
+void parser_base::element_name(parser_element& elem, std::ptrdiff_t begin_pos)
+{
+    elem.begin_pos = begin_pos;
+    name(elem.name);
+
+    if (cur_char() != ':')
+        return;
+
+    // What was just read is the prefix; the local name follows the colon.
+    elem.ns = elem.name;
+    next_check();
+    name(elem.name);
+}
+
+/**
+ * Read an attribute name, optionally written as "ns:name".
+ *
+ * @param attr_ns receives the prefix when there is one; left untouched
+ *                otherwise.
+ * @param attr_name receives the name without the prefix.
+ */
+void parser_base::attribute_name(std::string_view& attr_ns, std::string_view& attr_name)
+{
+    name(attr_name);
+
+    if (cur_char() != ':')
+        return;
+
+    // Attribute name is namespaced: what was just read is the prefix.
+    attr_ns = attr_name;
+    next_check();
+    name(attr_name);
+}
+
+/**
+ * Read character content that contains character references, starting at
+ * a '&', and collect the decoded text in the buffer.  Reading stops at the
+ * next '<' or at the end of the stream; the '<' itself is not consumed.
+ *
+ * @param buf buffer that receives the decoded text.
+ */
+void parser_base::characters_with_encoded_char(cell_buffer& buf)
+{
+    assert(cur_char() == '&');
+    parse_encoded_char(buf);
+
+    const char* p0 = mp_char;
+
+    while (has_char())
+    {
+        const char c = cur_char();
+
+        if (c == '&')
+        {
+            // Flush the literal run collected so far, then decode.
+            if (mp_char > p0)
+                buf.append(p0, mp_char-p0);
+
+            parse_encoded_char(buf);
+            p0 = mp_char;
+
+            // The reference may have been the last thing in the stream, so
+            // go back through has_char() before touching the current
+            // character again.
+            continue;
+        }
+
+        if (c == '<')
+            break;
+
+        next();
+    }
+
+    if (mp_char > p0)
+        buf.append(p0, mp_char-p0);
+}
+
+}}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */