Adding upstream version 0.19.2.upstream/0.19.2 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:48:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:48:59 +0000
commit: c484829272cd13a738e35412498e12f2c9a194ac (patch)
tree: a1f5ec09629ee895bd3963fa8820b45f2f4c574b /src/parser/utf8_test.cpp
parent: Initial commit. (diff)
download: liborcus-c484829272cd13a738e35412498e12f2c9a194ac.tar.xz
liborcus-c484829272cd13a738e35412498e12f2c9a194ac.zip
1 files changed, 170 insertions, 0 deletions
diff --git a/src/parser/utf8_test.cpp b/src/parser/utf8_test.cpp
new file mode 100644
index 0000000..88dcd3e
--- /dev/null
+++ b/src/parser/utf8_test.cpp
@@ -0,0 +1,170 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "test_global.hpp"
+#include "utf8.hpp"
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <cassert>
+#include <functional>
+#include <iomanip>
+
+using namespace orcus;
+using std::cout;
+using std::endl;
+
+struct cp_range_t
+{
+    uint32_t lower;
+    uint32_t upper;
+    bool valid;
+};
+
+using parse_func_t = std::function<const char*(const char*, const char*)>;
+
+bool check_cp_ranges(parse_func_t parse, std::vector<cp_range_t> ranges)
+{
+    for (const cp_range_t& range : ranges)
+    {
+        for (uint32_t cp = range.lower; cp <= range.upper; ++cp)
+        {
+            std::vector<char> buf;
+
+            try
+            {
+                buf = encode_utf8(cp);
+            }
+            catch (const std::exception& e)
+            {
+                cout << "failed to encode 0x" << std::hex << std::uppercase << cp
+                     << " as utf-8: " << e.what() << endl;
+                return false;
+            }
+
+            const char* p = buf.data();
+            const char* p_end = p + buf.size();
+            const char* ret = parse(p, p_end);
+
+            if ((ret == p_end) != range.valid)
+            {
+                cout << "failed to parse 0x" << std::hex << std::uppercase << cp
+                    << " (utf-8:";
+
+                for (char b : buf)
+                    cout << ' ' << short(0xFF & b);
+                cout << ")" << endl;
+                cout << "expected to be " << (range.valid ? "valid" : "invalid")
+                     << ", but was " << (range.valid ? "invalid" : "valid") << endl;
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+void test_xml_name_start_char()
+{
+    bool res = check_cp_ranges(
+        parse_utf8_xml_name_start_char,
+        {
+            { 0x00, 0x40, false },
+            { 'A', 'Z', true },
+            { '[', '^', false },
+            { '_', '_', true },
+            { '`', '`', false },
+            { 'a', 'z', true },
+            { '{', 0xBF, false },
+            { 0xC0, 0xD6, true },
+            { 0xD7, 0xD7, false },
+            { 0xD8, 0xF6, true },
+            { 0xF7, 0xF7, false },
+            { 0xF8, 0x2FF, true },
+            { 0x300, 0x36F, false },
+            { 0x370, 0x37D, true },
+            { 0x37E, 0x37E, false },
+            { 0x37F, 0x1FFF, true },
+            { 0x2000, 0x200B, false },
+            { 0x200C, 0x200D, true },
+            { 0x200E, 0x206F, false },
+            { 0x2070, 0x218F, true },
+            { 0x2190, 0x2BFF, false },
+            { 0x2C00, 0x2FEF, true },
+            { 0x2FF0, 0x3000, false },
+            { 0x3001, 0xD7FF, true },
+            { 0xD800, 0xF8FF, false },
+            { 0xF900, 0xFDCF, true },
+            { 0xFDD0, 0xFDEF, false },
+            { 0xFDF0, 0xFFFD, true },
+            { 0xFFFE, 0xFFFF, false },
+            { 0x10000, 0xEFFFF, true },
+            { 0xF0000, 0xF0000, false }, // just check one byte past last valid byte.
+        }
+    );
+    assert(res);
+}
+
+void test_xml_name_char()
+{
+    bool res = check_cp_ranges(
+        parse_utf8_xml_name_char,
+        {
+            { 0x00, ',', false },
+            { '-', '.', true }, // 0x2D - 0x2E
+            { '/', '/', false },
+            { '0', '9', true },
+            { ':', '@', false },
+            { 'A', 'Z', true },
+            { '[', '^', false },
+            { '_', '_', true }, // 0x5F
+            { '`', '`', false },
+            { 'a', 'z', true },
+            { '{', 0xB6, false },
+            { 0xB7, 0xB7, true },
+            { 0xB8, 0xBF, false },
+            { 0xC0, 0xD6, true },
+            { 0xD7, 0xD7, false },
+            { 0xD8, 0xF6, true },
+            { 0xF7, 0xF7, false },
+            { 0xF8, 0x2FF, true },
+            { 0x300, 0x36F, true },
+            { 0x370, 0x37D, true },
+            { 0x37E, 0x37E, false },
+            { 0x37F, 0x1FFF, true },
+            { 0x2000, 0x200B, false },
+            { 0x200C, 0x200D, true },
+            { 0x200E, 0x203E, false },
+            { 0x203F, 0x2040, true },
+            { 0x2041, 0x206F, false },
+            { 0x2070, 0x218F, true },
+            { 0x2190, 0x2BFF, false },
+            { 0x2C00, 0x2FEF, true },
+            { 0x2FF0, 0x3000, false },
+            { 0x3001, 0xD7FF, true },
+            { 0xD800, 0xF8FF, false },
+            { 0xF900, 0xFDCF, true },
+            { 0xFDD0, 0xFDEF, false },
+            { 0xFDF0, 0xFFFD, true },
+            { 0xFFFE, 0xFFFF, false },
+            { 0x10000, 0xEFFFF, true },
+            { 0xF0000, 0xF0000, false }, // just check one byte past last valid byte.
+        }
+    );
+    assert(res);
+}
+
+int main()
+{
+    test_xml_name_start_char();
+    test_xml_name_char();
+
+    return EXIT_SUCCESS;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:48:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:48:59 +0000
commit	c484829272cd13a738e35412498e12f2c9a194ac (patch)
tree	a1f5ec09629ee895bd3963fa8820b45f2f4c574b /src/parser/utf8_test.cpp
parent	Initial commit. (diff)
download	liborcus-c484829272cd13a738e35412498e12f2c9a194ac.tar.xz liborcus-c484829272cd13a738e35412498e12f2c9a194ac.zip