diff options
Diffstat (limited to 'src/parser/sax_parser_base.cpp')
-rw-r--r-- | src/parser/sax_parser_base.cpp | 421 |
1 files changed, 421 insertions, 0 deletions
diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp new file mode 100644 index 0000000..58f750e --- /dev/null +++ b/src/parser/sax_parser_base.cpp @@ -0,0 +1,421 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include "orcus/sax_parser_base.hpp" + +#include "utf8.hpp" + +#include <cstring> +#include <vector> +#include <memory> + +#ifdef __ORCUS_CPU_FEATURES +#include <immintrin.h> +#endif + +namespace orcus { namespace sax { + +char decode_xml_encoded_char(const char* p, size_t n) +{ + if (n == 2) + { + if (!std::strncmp(p, "lt", n)) + return '<'; + else if (!std::strncmp(p, "gt", n)) + return '>'; + else + return '\0'; + } + else if (n == 3) + { + if (!std::strncmp(p, "amp", n)) + return '&'; + else + return '\0'; + } + else if (n == 4) + { + if (!std::strncmp(p, "apos", n)) + return '\''; + else if (!std::strncmp(p, "quot", 4)) + return '"'; + else + return '\0'; + } + + return '\0'; +} + +std::string decode_xml_unicode_char(const char* p, size_t n) +{ + if (*p == '#' && n >= 2) + { + uint32_t point = 0; + if (p[1] == 'x') + { + if (n == 2) + throw orcus::xml_structure_error( + "invalid number of characters for hexadecimal unicode reference"); + + point = std::stoi(std::string(p + 2, n - 2), nullptr, 16); + } + else + point = std::stoi(std::string(p + 1, n - 1), nullptr, 10); + + if (point < 0x80) + { + // is it really necessary to do the bit manipulation here? + std::string s(1, static_cast<char>(point & 0x7F)); + return s; + } + else if (point < 0x0800) + { + std::string s(1, static_cast<char>((point >> 6 & 0x1F) | 0xC0)); + s += static_cast<char>((point & 0x3F) | 0x80); + return s; + } + else if (point < 0x010000) + { + std::string s(1, static_cast<char>((point >> 12 & 0x0F) | 0xE0)); + s += static_cast<char>((point >> 6 & 0x3F) | 0x80); + s += static_cast<char>((point & 0x3F) | 0x80); + return s; + } + else if (point < 0x110000) + { + std::string s(1, static_cast<char>((point >> 18 & 0x07) | 0xF0)); + s += static_cast<char>((point >> 12 & 0x3F) | 0x80); + s += static_cast<char>((point >> 6 & 0x3F) | 0x80); + s += static_cast<char>((point & 0x3F) | 0x80); + return s; + } + else + { + // should not happen as that is not represented by utf-8 + assert(false); + } + } + + return std::string(); +} + +struct parser_base::impl +{ + std::vector<std::unique_ptr<cell_buffer>> m_cell_buffers; +}; + +parser_base::parser_base(const char* content, size_t size) : + ::orcus::parser_base(content, size), + mp_impl(std::make_unique<impl>()), + m_nest_level(0), + m_buffer_pos(0), + m_root_elem_open(true) +{ + mp_impl->m_cell_buffers.push_back(std::make_unique<cell_buffer>()); +} + +parser_base::~parser_base() {} + +void parser_base::inc_buffer_pos() +{ + ++m_buffer_pos; + if (m_buffer_pos == mp_impl->m_cell_buffers.size()) + mp_impl->m_cell_buffers.push_back(std::make_unique<cell_buffer>()); +} + +cell_buffer& parser_base::get_cell_buffer() +{ + return *mp_impl->m_cell_buffers[m_buffer_pos]; +} + +void parser_base::comment() +{ + // Parse until we reach '-->'. + size_t len = available_size(); + assert(len > 3); + char c = cur_char(); + size_t i = 0; + bool hyphen = false; + for (; i < len; ++i, c = next_and_char()) + { + if (c == '-') + { + if (!hyphen) + // first hyphen. + hyphen = true; + else + // second hyphen. + break; + } + else + hyphen = false; + } + + if (len - i < 2 || next_and_char() != '>') + throw malformed_xml_error( + "'--' should not occur in comment other than in the closing tag.", offset()); + + next(); +} + +void parser_base::expects_next(const char* p, size_t n) +{ + if (available_size() < n+1) + throw malformed_xml_error( + "not enough stream left to check for an expected string segment.", offset()); + + const char* p0 = p; + const char* p_end = p + n; + char c = next_and_char(); + for (; p != p_end; ++p, c = next_and_char()) + { + if (c == *p) + continue; + + std::ostringstream os; + os << "'" << std::string(p0, n) << "' was expected, but not found."; + throw malformed_xml_error(os.str(), offset()); + } +} + +void parser_base::parse_encoded_char(cell_buffer& buf) +{ + assert(cur_char() == '&'); + next(); + const char* p0 = mp_char; + for (; has_char(); next()) + { + if (cur_char() != ';') + continue; + + size_t n = mp_char - p0; + if (!n) + throw malformed_xml_error("empty encoded character.", offset()); + +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::parse_encoded_char: raw='" << std::string(p0, n) << "'" << endl; +#endif + + char c = decode_xml_encoded_char(p0, n); + if (c) + buf.append(&c, 1); + else + { + std::string utf8 = decode_xml_unicode_char(p0, n); + + if (!utf8.empty()) + { + buf.append(utf8.data(), utf8.size()); + c = '1'; // just to avoid hitting the !c case below + } + } + + // Move to the character past ';' before returning to the parent call. + next(); + + if (!c) + { +#if ORCUS_DEBUG_SAX_PARSER + cout << "sax_parser::parse_encoded_char: not a known encoding name. Use the original." << endl; +#endif + // Unexpected encoding name. Use the original text. + buf.append(p0, mp_char-p0); + } + + return; + } + + throw malformed_xml_error( + "error parsing encoded character: terminating character is not found.", offset()); +} + +void parser_base::value_with_encoded_char(cell_buffer& buf, std::string_view& str, char quote_char) +{ + assert(cur_char() == '&'); + parse_encoded_char(buf); + + const char* p0 = mp_char; + + while (has_char()) + { + if (cur_char() == '&') + { + if (mp_char > p0) + buf.append(p0, mp_char-p0); + + parse_encoded_char(buf); + p0 = mp_char; + } + + if (cur_char() == quote_char) + break; + + if (cur_char() != '&') + next(); + } + + if (mp_char > p0) + buf.append(p0, mp_char-p0); + + if (!buf.empty()) + str = buf.str(); + + // Skip the closing quote. + assert(!has_char() || cur_char() == quote_char); + if (has_char()) + next(); +} + +bool parser_base::value(std::string_view& str, bool decode) +{ + char c = cur_char_checked(); + if (c != '"' && c != '\'') + throw malformed_xml_error("value must be quoted", offset()); + + char quote_char = c; + + c = next_char_checked(); + + const char* p0 = mp_char; + for (; c != quote_char; c = next_char_checked()) + { + if (decode && c == '&') + { + // This value contains one or more encoded characters. + cell_buffer& buf = get_cell_buffer(); + buf.reset(); + buf.append(p0, mp_char-p0); + value_with_encoded_char(buf, str, quote_char); + return true; + } + } + + str = std::string_view(p0, mp_char-p0); + + // Skip the closing quote. + next(); + + return false; +} + +void parser_base::name(std::string_view& str) +{ + const char* p0 = mp_char; + mp_char = parse_utf8_xml_name_start_char(mp_char, mp_end); + if (mp_char == p0) + { + ::std::ostringstream os; + os << "name must begin with an alphabet, but got this instead '" << cur_char() << "'"; + throw malformed_xml_error(os.str(), offset()); + } + +#if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__) + + const __m128i match = _mm_loadu_si128((const __m128i*)"azAZ09--__.."); + const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY; + + size_t n_total = available_size(); + + while (n_total) + { + __m128i char_block = _mm_loadu_si128((const __m128i*)mp_char); + + int n = std::min<size_t>(16u, n_total); + int r = _mm_cmpestri(match, 12, char_block, n, mode); + mp_char += r; // Move the current char position. + n_total -= r; + + if (r < 16 && n_total) + { + // There is a character that does not match the SSE-based ASCII-only check. + // It may either by an ascii character that is not allowed, in which case stop, + // or it may possibly be an allowed utf-8 character, in which case move over it + // using the slow function. + + const char* p = parse_utf8_xml_name_char(mp_char, mp_end); + if (p == mp_char) + break; + + n_total -= p - mp_char; + mp_char = p; + } + + } + cur_char_checked(); // check end of xml stream + +#else + for(;;) + { + cur_char_checked(); // check end of xml stream + const char* p = parse_utf8_xml_name_char(mp_char, mp_end); + + if (p == mp_char) + break; + + mp_char = p; + } +#endif + + str = std::string_view(p0, mp_char-p0); +} + +void parser_base::element_name(parser_element& elem, std::ptrdiff_t begin_pos) +{ + elem.begin_pos = begin_pos; + name(elem.name); + if (cur_char() == ':') + { + elem.ns = elem.name; + next_check(); + name(elem.name); + } +} + +void parser_base::attribute_name(std::string_view& attr_ns, std::string_view& attr_name) +{ + name(attr_name); + if (cur_char() == ':') + { + // Attribute name is namespaced. + attr_ns = attr_name; + next_check(); + name(attr_name); + } +} + +void parser_base::characters_with_encoded_char(cell_buffer& buf) +{ + assert(cur_char() == '&'); + parse_encoded_char(buf); + + const char* p0 = mp_char; + + while (has_char()) + { + if (cur_char() == '&') + { + if (mp_char > p0) + buf.append(p0, mp_char-p0); + + parse_encoded_char(buf); + p0 = mp_char; + } + + if (cur_char() == '<') + break; + + if (cur_char() != '&') + next(); + } + + if (mp_char > p0) + buf.append(p0, mp_char-p0); +} + +}} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |