summaryrefslogtreecommitdiffstats
path: root/src/parser/utf8.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/utf8.cpp')
-rw-r--r--src/parser/utf8.cpp524
1 files changed, 524 insertions, 0 deletions
diff --git a/src/parser/utf8.cpp b/src/parser/utf8.cpp
new file mode 100644
index 0000000..e02d224
--- /dev/null
+++ b/src/parser/utf8.cpp
@@ -0,0 +1,524 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "utf8.hpp"
+
+#include <cassert>
+#include <stdexcept>
+#include <limits>
+
+// https://en.wikipedia.org/wiki/UTF-8#Encoding
+// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
+
+namespace orcus {
+
+namespace {
+
// Tell whether a byte has the bit layout 0b10xxxxxx, i.e. the layout that
// every non-leading byte of a multi-byte UTF-8 sequence must have.
bool valid_second_byte(uint8_t b)
{
    constexpr uint8_t top_two_bits = 0xC0;
    constexpr uint8_t continuation_tag = 0x80;

    return (b & top_two_bits) == continuation_tag;
}
+
// Single-byte characters accepted as the first character of a name: the
// underscore and the ASCII letters a-z / A-Z.  Nothing else is accepted
// here (in particular ':' is rejected by this function).
bool parse_1b_start_char(uint8_t c1)
{
    const bool is_underscore = (c1 == '_');
    const bool is_lower = (c1 >= 'a' && c1 <= 'z');
    const bool is_upper = (c1 >= 'A' && c1 <= 'Z');

    return is_underscore || is_lower || is_upper;
}
+
// Single-byte characters that are accepted in a name only after its first
// character: '-', '.' and the ASCII digits 0-9.
bool parse_1b_second_char(uint8_t c1)
{
    switch (c1)
    {
        case '-':
        case '.':
            return true;
        default:
            return c1 >= '0' && c1 <= '9';
    }
}
+
// Two-byte sequences accepted as the first character of a name.
//
//   code point range      encoded range
//   [ #xC0- #xD6]         C3 80 -> C3 96
//   [ #xD8- #xF6]         C3 98 -> C3 B6
//   [ #xF8-#x2FF]         C3 B8 -> CB BF
//   [#x370-#x37D]         CD B0 -> CD BD
//   [#x37F-#x7FF]         CD BF -> DF BF   (2-byte part of [#x37F-#x1FFF])
//
// c1 is the lead byte and c2 the trail byte.  The trail byte is not
// range-checked on every path, so the caller must already have verified that
// c2 is a continuation byte.
bool parse_2b_start_char(uint8_t c1, uint8_t c2)
{
    switch (c1)
    {
        case 0xC3:
            // Everything from C3 80 upwards except C3 97 (U+00D7) and
            // C3 B7 (U+00F7), which sit in the gaps of the table above.
            return c2 >= 0x80 && c2 != 0x97 && c2 != 0xB7;

        case 0xCD:
            // CD B0 -> CD BD, plus the single value CD BF.
            return (c2 >= 0xB0 && c2 <= 0xBD) || c2 == 0xBF;

        default:
            break;
    }

    // C4 80 -> CB BF
    if (c1 >= 0xC4 && c1 <= 0xCB)
        return c2 >= 0x80 && c2 <= 0xBF;

    // CE xx -> DF xx
    return c1 >= 0xCE && c1 <= 0xDF;
}
+
// Two-byte sequences that are accepted in a name only after its first
// character.
//
//   #xB7               C2 B7
//   [#x0300-#x036F]    CC 80 -> CD AF
//
// For lead byte CC every trail byte is accepted without inspection, so the
// caller must already have verified that c2 is a continuation byte.
bool parse_2b_second_char(uint8_t c1, uint8_t c2)
{
    switch (c1)
    {
        case 0xC2:
            // only C2 B7
            return c2 == 0xB7;
        case 0xCC:
            // the whole CC xx row
            return true;
        case 0xCD:
            // CD xx up to and including CD AF
            return c2 <= 0xAF;
        default:
            return false;
    }
}
+
// Three-byte sequences accepted as the first character of a name.
//
//   code point range      encoded range
//   [ #x800-#x1FFF]       E0 A0 80 -> E1 BF BF
//   [#x200C-#x200D]       E2 80 8C -> E2 80 8D
//   [#x2070-#x218F]       E2 81 B0 -> E2 86 8F
//   [#x2C00-#x2FEF]       E2 B0 80 -> E2 BF AF
//   [#x3001-#xD7FF]       E3 80 81 -> ED 9F BF
//   [#xF900-#xFDCF]       EF A4 80 -> EF B7 8F
//   [#xFDF0-#xFFFD]       EF B7 B0 -> EF BF BD
//
// c1 is the lead byte, c2 and c3 the trail bytes.  Several branches accept
// a whole row without looking at the trail bytes, so the caller must already
// have verified that c2 and c3 are continuation bytes.
bool parse_3b_start_char(uint8_t c1, uint8_t c2, uint8_t c3)
{
    switch (c1)
    {
        case 0xE0:
            // E0 A0 80 -> E0 BF BF
            return c2 >= 0xA0 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF;

        case 0xE1:
            // the entire E1 xx xx row
            return true;

        case 0xE2:
            switch (c2)
            {
                case 0x80:
                    // E2 80 8C -> E2 80 8D
                    return c3 == 0x8C || c3 == 0x8D;
                case 0x81:
                    // first row of E2 81 B0 -> E2 86 8F
                    return c3 >= 0xB0;
                case 0x86:
                    // last row of E2 81 B0 -> E2 86 8F
                    return c3 <= 0x8F;
                case 0xBF:
                    // last row of E2 B0 80 -> E2 BF AF
                    return c3 <= 0xAF;
                default:
                    // full rows: E2 82 xx -> E2 85 xx and E2 B0 xx -> E2 BE xx
                    return (c2 >= 0x82 && c2 <= 0x85) || (c2 >= 0xB0 && c2 <= 0xBE);
            }

        case 0xE3:
            // E3 80 81 -> E3 80 BF, then every row from E3 81 xx upwards
            return c2 == 0x80 ? c3 >= 0x81 : c2 >= 0x81;

        case 0xED:
            // ED xx xx up to and including the ED 9F xx row
            return c2 <= 0x9F;

        case 0xEF:
            if (c2 == 0xB7)
                // EF B7 row has a hole between 8F and B0
                return c3 <= 0x8F || c3 >= 0xB0;

            if (c2 == 0xBF)
                // last row stops at EF BF BD
                return c3 <= 0xBD;

            // full rows: EF A4 xx -> EF B6 xx and EF B8 xx -> EF BE xx
            return (c2 >= 0xA4 && c2 <= 0xB6) || (c2 >= 0xB8 && c2 <= 0xBE);

        default:
            // E4 xx xx -> EC xx xx are accepted whole; anything else is not.
            return c1 >= 0xE4 && c1 <= 0xEC;
    }
}
+
// Three-byte sequences that are accepted in a name only after its first
// character.
//
//   [#x203F-#x2040]    E2 80 BF -> E2 81 80
//
// The range covers exactly two encodings: E2 80 BF and E2 81 80.
bool parse_3b_second_char(uint8_t c1, uint8_t c2, uint8_t c3)
{
    const bool is_203f = (c2 == 0x80 && c3 == 0xBF);
    const bool is_2040 = (c2 == 0x81 && c3 == 0x80);

    return c1 == 0xE2 && (is_203f || is_2040);
}
+
// Four-byte sequences accepted in a name.
//
//   [#x10000-#xEFFFF]    F0 90 80 80 -> F3 AF BF BF
//
// Only the first two bytes decide the outcome; the third and fourth bytes
// are accepted as given, so the caller must already have verified that the
// trail bytes are continuation bytes.
bool parse_4b_char(uint8_t c1, uint8_t c2, uint8_t /*c3*/, uint8_t /*c4*/)
{
    switch (c1)
    {
        case 0xF0:
            // F0 90 xx xx and above
            return c2 >= 0x90;
        case 0xF1:
        case 0xF2:
            // entire F1 / F2 rows
            return true;
        case 0xF3:
            // F3 xx xx xx up to and including F3 AF xx xx
            return c2 <= 0xAF;
        default:
            return false;
    }
}
+
// Number of bytes needed to encode the code point cp in UTF-8.
//
//   cp <= 0x7F        1
//   cp <= 0x7FF       2
//   cp <= 0xFFFF      3
//   cp <= 0x10FFFF    4
//
// Throws std::runtime_error when cp is larger than 0x10FFFF.
uint8_t calc_encoded_length(uint32_t cp)
{
    if (cp > 0x10FFFF)
        throw std::runtime_error("invalid utf-8 range.");

    if (cp >= 0x10000)
        return 4;

    if (cp >= 0x800)
        return 3;

    return cp >= 0x80 ? 2 : 1;
}
+
// Encode a code point as a two-byte UTF-8 sequence.  The input must not
// exceed 0x7FF (checked with assert).
//
//   byte 1: 0b110xxxxx  - upper 5 bits of the code point
//   byte 2: 0b10xxxxxx  - lower 6 bits of the code point
std::vector<char> encode_2b(uint32_t cp)
{
    assert(cp <= 0x7FF);

    const uint32_t upper5 = (cp >> 6) & 0x1F;
    const uint32_t lower6 = cp & 0x3F;

    std::vector<char> encoded;
    encoded.reserve(2);
    encoded.push_back(static_cast<char>(0xC0 | upper5));
    encoded.push_back(static_cast<char>(0x80 | lower6));
    return encoded;
}
+
// Encode a code point as a three-byte UTF-8 sequence.  The input must not
// exceed 0xFFFF (checked with assert).
//
//   byte 1: 0b1110xxxx  - upper 4 bits of the code point
//   byte 2: 0b10xxxxxx  - middle 6 bits
//   byte 3: 0b10xxxxxx  - lower 6 bits
std::vector<char> encode_3b(uint32_t cp)
{
    assert(cp <= 0xFFFF);

    const uint32_t upper4 = (cp >> 12) & 0x0F;
    const uint32_t middle6 = (cp >> 6) & 0x3F;
    const uint32_t lower6 = cp & 0x3F;

    std::vector<char> encoded;
    encoded.reserve(3);
    encoded.push_back(static_cast<char>(0xE0 | upper4));
    encoded.push_back(static_cast<char>(0x80 | middle6));
    encoded.push_back(static_cast<char>(0x80 | lower6));
    return encoded;
}
+
// Encode a code point as a four-byte UTF-8 sequence.  The input must not
// exceed 0x10FFFF (checked with assert).
//
//   byte 1: 0b11110xxx  - upper 3 bits of the code point
//   byte 2: 0b10xxxxxx  - next 6 bits
//   byte 3: 0b10xxxxxx  - next 6 bits
//   byte 4: 0b10xxxxxx  - lowest 6 bits
std::vector<char> encode_4b(uint32_t cp)
{
    assert(cp <= 0x10FFFF);

    const uint32_t upper3 = (cp >> 18) & 0x07;
    const uint32_t second6 = (cp >> 12) & 0x3F;
    const uint32_t third6 = (cp >> 6) & 0x3F;
    const uint32_t lowest6 = cp & 0x3F;

    std::vector<char> encoded;
    encoded.reserve(4);
    encoded.push_back(static_cast<char>(0xF0 | upper3));
    encoded.push_back(static_cast<char>(0x80 | second6));
    encoded.push_back(static_cast<char>(0x80 | third6));
    encoded.push_back(static_cast<char>(0x80 | lowest6));
    return encoded;
}
+
+} // anonymous namespace
+
+const char* parse_utf8_xml_name_start_char(const char* p, const char* p_end)
+{
+ size_t n_remaining = p_end - p;
+ if (!n_remaining)
+ return p;
+
+ uint8_t n_bytes = calc_utf8_byte_length(*p);
+
+ switch (n_bytes)
+ {
+ case 1:
+ return parse_1b_start_char(*p) ? p + 1 : p;
+ case 2:
+ {
+ if (n_remaining < 2)
+ return p;
+
+ uint8_t c1 = p[0];
+ uint8_t c2 = p[1];
+
+ if (!valid_second_byte(c2))
+ return p;
+
+ return parse_2b_start_char(c1, c2) ? p + 2 : p;
+ }
+ case 3:
+ {
+ if (n_remaining < 3)
+ return p;
+
+ uint8_t c1 = p[0];
+ uint8_t c2 = p[1];
+ uint8_t c3 = p[2];
+
+ if (!valid_second_byte(c2) || !valid_second_byte(c3))
+ return p;
+
+ return parse_3b_start_char(c1, c2, c3) ? p + 3 : p;
+ }
+ case 4:
+ {
+ if (n_remaining < 4)
+ return p;
+
+ uint8_t c1 = p[0];
+ uint8_t c2 = p[1];
+ uint8_t c3 = p[2];
+ uint8_t c4 = p[3];
+
+ if (!valid_second_byte(c2) || !valid_second_byte(c3) || !valid_second_byte(c4))
+ return p;
+
+ return parse_4b_char(c1, c2, c3, c4) ? p + 4 : p;
+ }
+ }
+
+ return p;
+}
+
+const char* parse_utf8_xml_name_char(const char* p, const char* p_end)
+{
+ size_t n_remaining = p_end - p;
+ if (!n_remaining)
+ return p;
+
+ uint8_t n_bytes = calc_utf8_byte_length(*p);
+
+ switch (n_bytes)
+ {
+ case 1:
+ {
+ if (parse_1b_start_char(*p))
+ return p + 1;
+
+ return parse_1b_second_char(*p) ? p + 1 : p;
+ }
+ case 2:
+ {
+ if (n_remaining < 2)
+ return p;
+
+ uint8_t c1 = p[0];
+ uint8_t c2 = p[1];
+
+ if (!valid_second_byte(c2))
+ return p;
+
+ if (parse_2b_start_char(c1, c2))
+ return p + 2;
+
+ return parse_2b_second_char(c1, c2) ? p + 2 : p;
+ }
+ case 3:
+ {
+ if (n_remaining < 3)
+ return p;
+
+ uint8_t c1 = p[0];
+ uint8_t c2 = p[1];
+ uint8_t c3 = p[2];
+
+ if (!valid_second_byte(c2) || !valid_second_byte(c3))
+ return p;
+
+ if (parse_3b_start_char(c1, c2, c3))
+ return p + 3;
+
+ return parse_3b_second_char(c1, c2, c3) ? p + 3 : p;
+ }
+ case 4:
+ {
+ if (n_remaining < 4)
+ return p;
+
+ uint8_t c1 = p[0];
+ uint8_t c2 = p[1];
+ uint8_t c3 = p[2];
+ uint8_t c4 = p[3];
+
+ if (!valid_second_byte(c2) || !valid_second_byte(c3) || !valid_second_byte(c4))
+ return p;
+
+ return parse_4b_char(c1, c2, c3, c4) ? p + 4 : p;
+ }
+ }
+
+ return p;
+}
+
+std::vector<char> encode_utf8(uint32_t cp)
+{
+ uint8_t n_encoded = calc_encoded_length(cp);
+
+ switch (n_encoded)
+ {
+ case 1:
+ // no conversion
+ return std::vector<char>(1, cp);
+ case 2:
+ return encode_2b(cp);
+ case 3:
+ return encode_3b(cp);
+ case 4:
+ return encode_4b(cp);
+ }
+
+ throw std::logic_error("this should never be reached.");
+}
+
// Derive the total length of a UTF-8 sequence from its lead byte c1.
//
//   0xxxxxxx -> 1
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
//
// Any other bit pattern (a continuation byte 10xxxxxx, or 11111xxx) is not
// a lead byte; for those std::numeric_limits<uint8_t>::max() is returned as
// an "invalid" marker.
//
// Only the length prefix is inspected.  Lead bytes 0xF5-0xF7 carry the
// 4-byte prefix and are therefore reported as 4 even though they cannot
// start a sequence that encodes a code point <= 0x10FFFF; rejecting those
// is left to whoever validates the decoded value.
uint8_t calc_utf8_byte_length(uint8_t c1)
{
    if ((c1 & 0x80) == 0x00)
        // highest bit is not set.
        return 1;

    if ((c1 & 0xE0) == 0xC0)
        // highest 3 bits are 110.
        return 2;

    if ((c1 & 0xF0) == 0xE0)
        // highest 4 bits are 1110.
        return 3;

    // The mask must cover exactly the top 5 bits (0xF8).  A 6-bit mask
    // (0xFC) would wrongly reject 0xF4, the lead byte of U+100000..U+10FFFF.
    if ((c1 & 0xF8) == 0xF0)
        // highest 5 bits are 11110.
        return 4;

    return std::numeric_limits<uint8_t>::max();
}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */