summaryrefslogtreecommitdiffstats
path: root/src/parser/utf8_test.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser/utf8_test.cpp')
-rw-r--r-- src/parser/utf8_test.cpp 170
1 files changed, 170 insertions, 0 deletions
diff --git a/src/parser/utf8_test.cpp b/src/parser/utf8_test.cpp
new file mode 100644
index 0000000..88dcd3e
--- /dev/null
+++ b/src/parser/utf8_test.cpp
@@ -0,0 +1,170 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "test_global.hpp"
+#include "utf8.hpp"
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <cassert>
+#include <functional>
+#include <iomanip>
+
+using namespace orcus;
+using std::cout;
+using std::endl;
+
+/**
+ * One entry of an expectation table: an inclusive range of code points
+ * together with the outcome the parser under test should produce for every
+ * code point inside it.
+ */
+struct cp_range_t
+{
+    uint32_t lower; ///< first code point of the range (inclusive)
+    uint32_t upper; ///< last code point of the range (inclusive)
+    bool valid;     ///< true if the parser is expected to accept the range
+};
+
+/**
+ * Signature of the parser functions exercised by this test.  The function
+ * receives the [first, last) byte range of an encoded code point and returns
+ * a position within it; check_cp_ranges() treats the code point as accepted
+ * only when the returned position equals last.
+ */
+using parse_func_t = std::function<const char*(const char* first, const char* last)>;
+
+/**
+ * Feed every code point of every range to a parser function and compare the
+ * outcome with the expectation stored in the range.
+ *
+ * Each code point is first encoded with encode_utf8().  The parser is then
+ * given the encoded bytes; the code point counts as accepted when the parser
+ * returns the end of the encoded buffer.
+ *
+ * @param parse parser function under test.
+ * @param ranges inclusive code point ranges with their expected outcome.
+ *
+ * @return true if every code point behaved as expected.  false as soon as
+ *         encoding throws a std::exception or the parser outcome differs
+ *         from the expectation; a diagnostic is written to stdout first.
+ */
+bool check_cp_ranges(parse_func_t parse, std::vector<cp_range_t> ranges)
+{
+    for (const cp_range_t& range : ranges)
+    {
+        // Wording used by the mismatch report; fixed for the whole range.
+        const char* const expected = range.valid ? "valid" : "invalid";
+        const char* const actual = range.valid ? "invalid" : "valid";
+
+        for (uint32_t cp = range.lower; cp <= range.upper; ++cp)
+        {
+            std::vector<char> encoded;
+
+            try
+            {
+                encoded = encode_utf8(cp);
+            }
+            catch (const std::exception& e)
+            {
+                cout << "failed to encode 0x" << std::hex << std::uppercase << cp
+                    << " as utf-8: " << e.what() << endl;
+                return false;
+            }
+
+            const char* const first = encoded.data();
+            const char* const last = first + encoded.size();
+            const bool accepted = parse(first, last) == last;
+
+            if (accepted == range.valid)
+                continue; // behaved as expected; move on to the next code point
+
+            cout << "failed to parse 0x" << std::hex << std::uppercase << cp
+                << " (utf-8:";
+
+            // Mask to 8 bits so that bytes >= 0x80 are not sign-extended.
+            for (const char byte : encoded)
+                cout << ' ' << static_cast<short>(0xFF & byte);
+
+            cout << ")" << endl;
+            cout << "expected to be " << expected
+                << ", but was " << actual << endl;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Check parse_utf8_xml_name_start_char() against a table of code point
+ * ranges, each flagged as expected-valid or expected-invalid.
+ *
+ * NOTE(review): ':' (0x3A) falls in the first, expected-invalid range even
+ * though the XML 1.0 NameStartChar production allows it - presumably the
+ * colon is handled separately as the namespace separator; confirm against
+ * utf8.hpp.
+ */
+void test_xml_name_start_char()
+{
+    const std::vector<cp_range_t> expectations = {
+        { 0x00, 0x40, false },
+        { 'A', 'Z', true },
+        { '[', '^', false },
+        { '_', '_', true },
+        { '`', '`', false },
+        { 'a', 'z', true },
+        { '{', 0xBF, false },
+
+        { 0xC0, 0xD6, true },
+        { 0xD7, 0xD7, false },
+        { 0xD8, 0xF6, true },
+        { 0xF7, 0xF7, false },
+        { 0xF8, 0x2FF, true },
+        { 0x300, 0x36F, false },
+        { 0x370, 0x37D, true },
+        { 0x37E, 0x37E, false },
+        { 0x37F, 0x1FFF, true },
+
+        { 0x2000, 0x200B, false },
+        { 0x200C, 0x200D, true },
+        { 0x200E, 0x206F, false },
+        { 0x2070, 0x218F, true },
+        { 0x2190, 0x2BFF, false },
+        { 0x2C00, 0x2FEF, true },
+        { 0x2FF0, 0x3000, false },
+        { 0x3001, 0xD7FF, true },
+
+        { 0xD800, 0xF8FF, false },
+        { 0xF900, 0xFDCF, true },
+        { 0xFDD0, 0xFDEF, false },
+        { 0xFDF0, 0xFFFD, true },
+        { 0xFFFE, 0xFFFF, false },
+
+        { 0x10000, 0xEFFFF, true },
+        // Only the first code point past the last valid one is checked.
+        { 0xF0000, 0xF0000, false },
+    };
+
+    const bool passed = check_cp_ranges(parse_utf8_xml_name_start_char, expectations);
+    assert(passed);
+}
+
+/**
+ * Check parse_utf8_xml_name_char() against a table of code point ranges,
+ * each flagged as expected-valid or expected-invalid.
+ *
+ * Compared with the table in test_xml_name_start_char(), this one
+ * additionally expects '-', '.', the digits, 0xB7, 0x300-0x36F and
+ * 0x203F-0x2040 to be accepted.
+ */
+void test_xml_name_char()
+{
+    const std::vector<cp_range_t> expectations = {
+        { 0x00, ',', false },
+        { '-', '.', true }, // 0x2D - 0x2E
+        { '/', '/', false },
+        { '0', '9', true },
+        { ':', '@', false },
+        { 'A', 'Z', true },
+        { '[', '^', false },
+        { '_', '_', true }, // 0x5F
+        { '`', '`', false },
+        { 'a', 'z', true },
+        { '{', 0xB6, false },
+
+        { 0xB7, 0xB7, true },
+        { 0xB8, 0xBF, false },
+        { 0xC0, 0xD6, true },
+        { 0xD7, 0xD7, false },
+        { 0xD8, 0xF6, true },
+        { 0xF7, 0xF7, false },
+        { 0xF8, 0x2FF, true },
+        { 0x300, 0x36F, true },
+        { 0x370, 0x37D, true },
+        { 0x37E, 0x37E, false },
+        { 0x37F, 0x1FFF, true },
+
+        { 0x2000, 0x200B, false },
+        { 0x200C, 0x200D, true },
+        { 0x200E, 0x203E, false },
+        { 0x203F, 0x2040, true },
+        { 0x2041, 0x206F, false },
+        { 0x2070, 0x218F, true },
+        { 0x2190, 0x2BFF, false },
+        { 0x2C00, 0x2FEF, true },
+        { 0x2FF0, 0x3000, false },
+        { 0x3001, 0xD7FF, true },
+
+        { 0xD800, 0xF8FF, false },
+        { 0xF900, 0xFDCF, true },
+        { 0xFDD0, 0xFDEF, false },
+        { 0xFDF0, 0xFFFD, true },
+        { 0xFFFE, 0xFFFF, false },
+
+        { 0x10000, 0xEFFFF, true },
+        // Only the first code point past the last valid one is checked.
+        { 0xF0000, 0xF0000, false },
+    };
+
+    const bool passed = check_cp_ranges(parse_utf8_xml_name_char, expectations);
+    assert(passed);
+}
+
+/**
+ * Test driver.  Runs each test function in order; a failed expectation
+ * trips the assert() inside the test function, so reaching the end means
+ * every check passed.
+ */
+int main()
+{
+    using test_case_t = void (*)();
+
+    const test_case_t test_cases[] = {
+        test_xml_name_start_char,
+        test_xml_name_char,
+    };
+
+    for (const test_case_t run : test_cases)
+        run();
+
+    return EXIT_SUCCESS;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */