diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 05:48:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 05:48:59 +0000 |
commit | c484829272cd13a738e35412498e12f2c9a194ac (patch) | |
tree | a1f5ec09629ee895bd3963fa8820b45f2f4c574b /src/parser/parser_base.cpp | |
parent | Initial commit. (diff) | |
download | liborcus-c484829272cd13a738e35412498e12f2c9a194ac.tar.xz liborcus-c484829272cd13a738e35412498e12f2c9a194ac.zip |
Adding upstream version 0.19.2. (refs: upstream/0.19.2, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/parser/parser_base.cpp')
-rw-r--r-- | src/parser/parser_base.cpp | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/src/parser/parser_base.cpp b/src/parser/parser_base.cpp new file mode 100644 index 0000000..e49af71 --- /dev/null +++ b/src/parser/parser_base.cpp @@ -0,0 +1,222 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include "orcus/parser_base.hpp" +#include "orcus/parser_global.hpp" +#include "cpu_features.hpp" + +#include <sstream> +#include <cstring> +#include <limits> +#include <cassert> +#include <algorithm> + +#ifdef __ORCUS_CPU_FEATURES +#include <immintrin.h> +#endif + +namespace orcus { + +parser_base::parser_base(const char* p, size_t n) : + mp_begin(p), mp_char(p), mp_end(p+n), + m_func_parse_numeric(parse_numeric) +{ +} + +void parser_base::prev(size_t dec) +{ + mp_char -= dec; +} + +char parser_base::peek_char(std::size_t offset) const +{ + return *(mp_char + offset); +} + +std::string_view parser_base::peek_chars(std::size_t length) const +{ + return {mp_char, length}; +} + +void parser_base::skip_bom() +{ + // Skip one or more UTF-8 BOM's. + constexpr std::string_view BOM = "\xEF\xBB\xBF"; + + while (true) + { + if (available_size() < 3) + return; + + if (peek_chars(3) != BOM) + return; + + next(3); + } +} + +void parser_base::skip(std::string_view chars_to_skip) +{ +#if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__) + __m128i match = _mm_loadu_si128((const __m128i*)chars_to_skip.data()); + const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_EQUAL_ANY | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY; + + int n_total = available_size(); + + while (n_total) + { + __m128i char_block = _mm_loadu_si128((const __m128i*)mp_char); + + // Find position of the first character that is NOT any of the + // characters to skip. 
+ int n = std::min<int>(16, n_total); + int r = _mm_cmpestri(match, chars_to_skip.size(), char_block, n, mode); + + if (!r) + // No characters to skip. Bail out. + break; + + mp_char += r; // Move the current char position. + + if (r < 16) + // No need to move to the next segment. Stop here. + break; + + // Skip 16 chars to the next segment. + n_total -= 16; + } +#else + for (; has_char(); next()) + { + if (!is_in(*mp_char, chars_to_skip)) + break; + } +#endif +} + +void parser_base::skip_space_and_control() +{ +#if defined(__ORCUS_CPU_FEATURES) && defined(__AVX2__) + size_t n_total = available_size(); + const __m256i ws = _mm256_set1_epi8(' '); // whitespaces + const __m256i sb = _mm256_set1_epi8(0x80); // signed bit on. + + while (n_total) + { + // The 'results' stores (for each 8-bit int) 0x00 if the char is less + // than or equal to whitespace, or the char is "negative" i.e. the + // signed bit is on IOW greater than 127. + __m256i char_block = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mp_char)); + __m256i results = _mm256_cmpgt_epi8(char_block, ws); // NB: this is a signed comparison. + results = _mm256_or_si256(results, _mm256_and_si256(char_block, sb)); + int r = _mm256_movemask_epi8(results); + r = _tzcnt_u32(r); + r = std::min<size_t>(r, n_total); + + if (!r) + // No characters to skip. Bail out. + break; + + mp_char += r; // Move the current char position. + + if (r < 32) + // No need to move to the next segment. Stop here. + break; + + n_total -= 32; + } + +#elif defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__) + __m128i match = _mm_loadu_si128((const __m128i*)"\0 "); + const int mode = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY; + + size_t n_total = available_size(); + + while (n_total) + { + __m128i char_block = _mm_loadu_si128((const __m128i*)mp_char); + + // Find position of the first character that is NOT any of the + // characters to skip. 
+ int n = std::min<size_t>(16u, n_total); + int r = _mm_cmpestri(match, 2, char_block, n, mode); + + if (!r) + // No characters to skip. Bail out. + break; + + mp_char += r; // Move the current char position. + + if (r < 16) + // No need to move to the next segment. Stop here. + break; + + // Skip 16 chars to the next segment. + n_total -= 16; + } +#else + for (; mp_char != mp_end && ((unsigned char)*mp_char) <= (unsigned char)' '; ++mp_char) + ; +#endif +} + +bool parser_base::parse_expected(std::string_view expected) +{ + if (expected.size() > available_size()) + return false; + +#if defined(__ORCUS_CPU_FEATURES) && defined(__SSE4_2__) + __m128i v_expected = _mm_loadu_si128((const __m128i*)expected.data()); + __m128i v_char_block = _mm_loadu_si128((const __m128i*)mp_char); + + const int mode = _SIDD_CMP_EQUAL_ORDERED | _SIDD_UBYTE_OPS | _SIDD_BIT_MASK; + __m128i res = _mm_cmpestrm(v_expected, expected.size(), v_char_block, expected.size(), mode); + int mask = _mm_cvtsi128_si32(res); + + if (mask) + mp_char += expected.size(); + + return mask; +#else + const char* p = expected.data(); + for (size_t i = 0; i < expected.size(); ++i, ++p, next()) + { + if (cur_char() != *p) + return false; + } + + return true; +#endif +} + +double parser_base::parse_double() +{ + size_t max_length = available_size(); + const char* p = mp_char; + double val; + p = m_func_parse_numeric(p, p + max_length, val); + if (p == mp_char) + return std::numeric_limits<double>::quiet_NaN(); + + mp_char = p; + return val; +} + +size_t parser_base::remaining_size() const +{ + size_t n = available_size(); + return n ? (n - 1) : 0; +} + +std::ptrdiff_t parser_base::offset() const +{ + return std::distance(mp_begin, mp_char); +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |