summary | refs | log | tree | commit | diff | stats
path: root/third-party/utf8cpp/test_drivers
diff options
context:
space:
mode:
Diffstat (limited to 'third-party/utf8cpp/test_drivers')
-rw-r--r-- third-party/utf8cpp/test_drivers/negative/negative.cpp 53
-rw-r--r-- third-party/utf8cpp/test_drivers/smoke_test/test.cpp 298
-rw-r--r-- third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp 160
3 files changed, 511 insertions, 0 deletions
diff --git a/third-party/utf8cpp/test_drivers/negative/negative.cpp b/third-party/utf8cpp/test_drivers/negative/negative.cpp
new file mode 100644
index 0000000..0f1015d
--- /dev/null
+++ b/third-party/utf8cpp/test_drivers/negative/negative.cpp
@@ -0,0 +1,53 @@
+#include "../../source/utf8.h"
+using namespace utf8;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+using namespace std;
+
+const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264}; // 1-based line numbers of the input file on which main() expects is_valid() to fail
+const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned); // one-past-the-end pointer, so the table can be searched as an iterator range
+
+// Test driver: reports every line of the given file whose UTF-8 validity disagrees with INVALID_LINES.
+int main(int argc, char** argv)
+{
+ string test_file_path;
+ if (argc == 2)
+ test_file_path = argv[1];
+ else {
+ cout << "Wrong number of arguments" << endl;
+ return 0; // plain return: exit() would need <cstdlib>, which this file never includes
+ }
+ // Open the test file
+ ifstream fs8(test_file_path.c_str());
+ if (!fs8.is_open()) {
+ cout << "Could not open " << test_file_path << endl;
+ return 0;
+ }
+
+ // Read it line by line. getline() stops on EOF and on read errors alike, so a
+ // failed stream cannot keep the loop spinning, and no phantom empty line is
+ // produced after the file's final '\n'.
+ unsigned int line_count = 0;
+ string line;
+ while (getline(fs8, line)) {
+ // 1-based line number; INVALID_LINES is searched with this value below.
+ line_count++;
+ bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END);
+ // Print out lines that contain unexpected invalid UTF-8
+ if (!is_valid(line.begin(), line.end())) {
+ if (expected_valid)
+ cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
+
+ // try fixing it:
+ string fixed_line;
+ replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
+ if (!is_valid(fixed_line.begin(), fixed_line.end()))
+ cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
+ }
+ else if (!expected_valid)
+ cout << "Invalid utf-8 NOT detected at line " << line_count << '\n';
+ }
+}
diff --git a/third-party/utf8cpp/test_drivers/smoke_test/test.cpp b/third-party/utf8cpp/test_drivers/smoke_test/test.cpp
new file mode 100644
index 0000000..4f9fb04
--- /dev/null
+++ b/third-party/utf8cpp/test_drivers/smoke_test/test.cpp
@@ -0,0 +1,298 @@
+#include <cstring>
+#include <cassert>
+#include <vector>
+#include "../../source/utf8.h"
+using namespace utf8;
+using namespace std;
+
+// Smoke test: runs each checked utf8:: routine, then its utf8::unchecked counterpart, checking results with assert().
+int main()
+{
+ //append
+ unsigned char u[5] = {0,0,0,0,0}; // zero-filled; the asserts below check that bytes past each encoded sequence are still 0
+
+ append(0x0448, u);
+ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+ append(0x65e5, u);
+ assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
+
+ append(0x3044, u);
+ assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0);
+
+ append(0x10346, u);
+ assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
+
+ //next
+ const char* twochars = "\xe6\x97\xa5\xd1\x88";
+ const char* w = twochars;
+ int cp = next(w, twochars + 6); // the end passed here is one past the literal's terminating NUL (5 octets + NUL)
+ assert (cp == 0x65e5);
+ assert (w == twochars + 3);
+
+ const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+ w = threechars;
+ cp = next(w, threechars + 9);
+ assert (cp == 0x10346);
+ assert (w == threechars + 4);
+ cp = next(w, threechars + 9);
+ assert (cp == 0x65e5);
+ assert (w == threechars + 7);
+ cp = next(w, threechars + 9);
+ assert (cp == 0x0448);
+ assert (w == threechars + 9);
+
+ //peek_next
+ const char* const cw = twochars;
+ cp = peek_next(cw, cw + 6);
+ assert (cp == 0x65e5);
+ assert (cw == twochars);
+
+ //prior
+ w = twochars + 3;
+ cp = prior (w, twochars);
+ assert (cp == 0x65e5);
+ assert (w == twochars);
+
+ w = threechars + 9;
+ cp = prior(w, threechars);
+ assert (cp == 0x0448);
+ assert (w == threechars + 7);
+ cp = prior(w, threechars);
+ assert (cp == 0x65e5);
+ assert (w == threechars + 4);
+ cp = prior(w, threechars);
+ assert (cp == 0x10346);
+ assert (w == threechars);
+
+ //previous (deprecated)
+ w = twochars + 3;
+ cp = previous (w, twochars - 1);
+ assert (cp == 0x65e5);
+ assert (w == twochars);
+
+ w = threechars + 9;
+ cp = previous(w, threechars - 1);
+ assert (cp == 0x0448);
+ assert (w == threechars + 7);
+ cp = previous(w, threechars -1);
+ assert (cp == 0x65e5);
+ assert (w == threechars + 4);
+ cp = previous(w, threechars - 1);
+ assert (cp == 0x10346);
+ assert (w == threechars);
+
+ // advance
+ w = twochars;
+ advance (w, 2, twochars + 6);
+ assert (w == twochars + 5);
+
+ // distance
+ size_t dist = utf8::distance(twochars, twochars + 5);
+ assert (dist == 2);
+
+ // utf32to8
+ int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+ vector<char> utf8result;
+ utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+ assert (utf8result.size() == 9);
+ // try it with the return value;
+ char* utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]); // overwrites the 9 bytes produced just above
+ assert (utf8_end == &utf8result[0] + 9);
+
+ //utf8to32
+ vector<int> utf32result;
+ utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+ assert (utf32result.size() == 2);
+ // try it with the return value;
+ int* utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]);
+ assert (utf32_end == &utf32result[0] + 2);
+
+ //utf16to8
+ unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+ utf8result.clear();
+ utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+ assert (utf8result.size() == 10);
+ // try it with the return value;
+ utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]);
+ assert (utf8_end == &utf8result[0] + 10);
+
+ //utf8to16
+ char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+ vector <unsigned short> utf16result;
+ utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+ assert (utf16result.size() == 4);
+ assert (utf16result[2] == 0xd834);
+ assert (utf16result[3] == 0xdd1e);
+ // try it with the return value;
+ unsigned short* utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
+ assert (utf16_end == &utf16result[0] + 4);
+
+ //find_invalid
+ char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+ char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+ assert (invalid == utf_invalid + 5);
+
+ //is_valid
+ bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+ assert (bvalid == false);
+ bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
+ assert (bvalid == true);
+
+ //starts_with_bom
+ unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+ bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
+ assert (bbom == true);
+ bool no_bbom = starts_with_bom(threechars, threechars + 9); // threechars is a pointer, so sizeof would give the pointer size, not the 9-byte length
+ assert (no_bbom == false);
+
+ //is_bom
+ bool unsafe_bbom = is_bom(byte_order_mark);
+ assert (unsafe_bbom == true);
+
+
+ //replace_invalid
+ char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+ vector<char> replace_invalid_result;
+ replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); // sizeof includes the terminating NUL, so it is part of the converted range
+ bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+ assert (bvalid);
+ const char fixed_invalid_sequence[] = "a????z";
+ assert (sizeof(fixed_invalid_sequence) == replace_invalid_result.size());
+ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
+
+ // iterator
+ utf8::iterator<const char*> it(threechars, threechars, threechars + 9);
+ utf8::iterator<const char*> it2 = it;
+ assert (it2 == it);
+ assert (*it == 0x10346);
+ assert (*(++it) == 0x65e5); // note: the increments live inside assert(), so this sequence only works without NDEBUG
+ assert ((*it++) == 0x65e5);
+ assert (*it == 0x0448);
+ assert (it != it2);
+ utf8::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
+ assert (++it == endit);
+ assert (*(--it) == 0x0448);
+ assert ((*it--) == 0x0448);
+ assert (*it == 0x65e5);
+ assert (--it == utf8::iterator<const char*>(threechars, threechars, threechars + 9));
+ assert (*it == 0x10346);
+
+ //////////////////////////////////////////////////////////
+ //// Unchecked variants
+ //////////////////////////////////////////////////////////
+
+ //append
+ memset(u, 0, 5);
+ unchecked::append(0x0448, u); // qualified so the unchecked overload is the one under test in this section
+ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+ unchecked::append(0x65e5, u);
+ assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
+
+ unchecked::append(0x10346, u);
+ assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
+
+ //next
+ w = twochars;
+ cp = unchecked::next(w);
+ assert (cp == 0x65e5);
+ assert (w == twochars + 3);
+
+ w = threechars;
+ cp = unchecked::next(w);
+ assert (cp == 0x10346);
+ assert (w == threechars + 4);
+ cp = unchecked::next(w);
+ assert (cp == 0x65e5);
+ assert (w == threechars + 7);
+ cp = unchecked::next(w);
+ assert (cp == 0x0448);
+ assert (w == threechars + 9);
+
+ //peek_next
+ cp = unchecked::peek_next(cw);
+ assert (cp == 0x65e5);
+ assert (cw == twochars);
+
+
+ //previous (calls prior internally)
+
+ w = twochars + 3;
+ cp = unchecked::previous (w);
+ assert (cp == 0x65e5);
+ assert (w == twochars);
+
+ w = threechars + 9;
+ cp = unchecked::previous(w);
+ assert (cp == 0x0448);
+ assert (w == threechars + 7);
+ cp = unchecked::previous(w);
+ assert (cp == 0x65e5);
+ assert (w == threechars + 4);
+ cp = unchecked::previous(w);
+ assert (cp == 0x10346);
+ assert (w == threechars);
+
+ // advance
+ w = twochars;
+ unchecked::advance (w, 2);
+ assert (w == twochars + 5);
+
+ // distance
+ dist = unchecked::distance(twochars, twochars + 5);
+ assert (dist == 2);
+
+ // utf32to8
+ utf8result.clear();
+ unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+ assert (utf8result.size() == 9);
+ // try it with the return value;
+ utf8_end = unchecked::utf32to8(utf32string, utf32string + 3, &utf8result[0]);
+ assert(utf8_end == &utf8result[0] + 9);
+
+ //utf8to32
+ utf32result.clear();
+ unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+ assert (utf32result.size() == 2);
+ // try it with the return value;
+ utf32_end = unchecked::utf8to32(twochars, twochars + 5, &utf32result[0]);
+ assert (utf32_end == &utf32result[0] + 2);
+
+ //utf16to8
+ utf8result.clear();
+ unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+ assert (utf8result.size() == 10);
+ // try it with the return value;
+ utf8_end = unchecked::utf16to8 (utf16string, utf16string + 5, &utf8result[0]);
+ assert (utf8_end == &utf8result[0] + 10);
+
+ //utf8to16
+ utf16result.clear();
+ unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+ assert (utf16result.size() == 4);
+ assert (utf16result[2] == 0xd834);
+ assert (utf16result[3] == 0xdd1e);
+ // try it with the return value;
+ utf16_end = unchecked::utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
+ assert (utf16_end == &utf16result[0] + 4);
+
+ // iterator
+ utf8::unchecked::iterator<const char*> un_it(threechars);
+ utf8::unchecked::iterator<const char*> un_it2 = un_it;
+ assert (un_it2 == un_it);
+ assert (*un_it == 0x10346);
+ assert (*(++un_it) == 0x65e5);
+ assert ((*un_it++) == 0x65e5);
+ assert (un_it != un_it2);
+ assert (*un_it == 0x0448);
+ utf8::unchecked::iterator<const char*> un_endit (threechars + 9);
+ assert (++un_it == un_endit);
+ assert (*(--un_it) == 0x0448);
+ assert ((*un_it--) == 0x0448);
+ assert (*un_it == 0x65e5);
+ assert (--un_it == utf8::unchecked::iterator<const char*>(threechars));
+ assert (*un_it == 0x10346);
+}
+
+
diff --git a/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp b/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp
new file mode 100644
index 0000000..c88a5ee
--- /dev/null
+++ b/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp
@@ -0,0 +1,160 @@
+#include "../../source/utf8.h"
+using namespace utf8;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+using namespace std;
+
+// Round-trip driver: for every line of the given file, cross-checks the checked and unchecked utf8:: APIs and prints any mismatch.
+int main(int argc, char** argv)
+{
+ if (argc != 2) {
+ cout << "\nUsage: utfreader filename\n";
+ return 0;
+ }
+ const char* TEST_FILE_PATH = argv[1];
+ // Open the test file
+ ifstream fs8(TEST_FILE_PATH);
+ if (!fs8.is_open()) {
+ cout << "Could not open " << TEST_FILE_PATH << endl;
+ return 0;
+ }
+
+ // Read it line by line. getline() stops on EOF and on read errors alike, so a
+ // failed stream cannot keep the loop spinning, and no phantom empty line is
+ // produced after the file's final '\n'.
+ unsigned int line_count = 0;
+ string line;
+ while (getline(fs8, line)) {
+ // 1-based line number, used only to label the diagnostics below.
+ line_count++;
+ // Truncate the line at the first invalid byte so that only valid UTF-8 is processed below
+ string::iterator line_start = line.begin();
+ string::iterator line_end = line.end();
+ line_end = find_invalid(line_start, line_end);
+ if (line_end != line.end())
+ cout << "Line " << line_count << ": Invalid utf-8 at byte " << int(line_end - line_start) << '\n'; // 0-based offset of the first invalid byte from the start of the line
+
+ // Convert it to utf-16
+ vector<unsigned short> utf16_line;
+ utf8to16(line_start, line_end, back_inserter(utf16_line));
+
+ // Back to utf-8 and compare it to the original line.
+ string back_to_utf8;
+ utf16to8(utf16_line.begin(), utf16_line.end(), back_inserter(back_to_utf8));
+ if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+ cout << "Line " << line_count << ": Conversion to UTF-16 and back failed" << '\n';
+
+ // Now, convert it to utf-32, back to utf-8 and compare
+ vector <unsigned> utf32_line;
+ utf8to32(line_start, line_end, back_inserter(utf32_line));
+ back_to_utf8.clear();
+ utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+ if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+ cout << "Line " << line_count << ": Conversion to UTF-32 and back failed" << '\n';
+
+ // Now, iterate and back
+ unsigned char_count = 0;
+ string::iterator it = line_start;
+ while (it != line_end) {
+ unsigned int next_cp = peek_next(it, line_end);
+ if (next(it, line_end) != next_cp)
+ cout << "Line " << line_count << ": Error: peek_next gave a different result than next" << '\n';
+ char_count++;
+ }
+ if (char_count != utf32_line.size())
+ cout << "Line " << line_count << ": Error in iterating with next - wrong number of characters" << '\n';
+
+ string::iterator adv_it = line_start;
+ utf8::advance(adv_it, char_count, line_end);
+ if (adv_it != line_end)
+ cout << "Line " << line_count << ": Error in advance function" << '\n';
+
+ if (string::size_type(utf8::distance(line_start, line_end)) != char_count)
+ cout << "Line " << line_count << ": Error in distance function" << '\n';
+
+ while (it != line_start) {
+ previous(it, line.rend().base()); // rend().base() is the same position as line.begin()
+ char_count--;
+ }
+ if (char_count != 0)
+ cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n';
+
+ // Try utf8::iterator
+ utf8::iterator<string::iterator> u8it(line_start, line_start, line_end);
+ if (!utf32_line.empty() && *u8it != utf32_line.at(0))
+ cout << "Line " << line_count << ": Error in utf::iterator * operator" << '\n';
+ if (std::distance(u8it, utf8::iterator<string::iterator>(line_end, line_start, line_end)) != static_cast<int>(utf32_line.size()))
+ cout << "Line " << line_count << ": Error in using utf::iterator with std::distance - wrong number of characters" << '\n';
+
+ std::advance(u8it, utf32_line.size());
+ if (u8it != utf8::iterator<string::iterator>(line_end, line_start, line_end))
+ cout << "Line " << line_count << ": Error in using utf::iterator with std::advance" << '\n';
+
+
+ //======================== Now, the unchecked versions ======================
+ // Convert it to utf-16 and compare to the checked version
+ vector<unsigned short> utf16_line_unchecked;
+ unchecked::utf8to16(line_start, line_end, back_inserter(utf16_line_unchecked));
+
+ if (utf16_line != utf16_line_unchecked)
+ cout << "Line " << line_count << ": Error in unchecked::utf8to16" << '\n';
+
+ // Back to utf-8 and compare it to the original line.
+ back_to_utf8.clear();
+ unchecked::utf16to8(utf16_line_unchecked.begin(), utf16_line_unchecked.end(), back_inserter(back_to_utf8));
+ if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+ cout << "Line " << line_count << ": Unchecked conversion to UTF-16 and back failed" << '\n';
+
+ // Now, convert it to utf-32, back to utf-8 and compare
+ vector <unsigned> utf32_line_unchecked;
+ unchecked::utf8to32(line_start, line_end, back_inserter(utf32_line_unchecked));
+ if (utf32_line != utf32_line_unchecked)
+ cout << "Line " << line_count << ": Error in unchecked::utf8to32" << '\n';
+
+ back_to_utf8.clear();
+ unchecked::utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+ if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+ cout << "Line " << line_count << ": Unchecked conversion to UTF-32 and back failed" << '\n';
+
+ // Now, iterate and back
+ char_count = 0;
+ it = line_start;
+ while (it != line_end) {
+ unsigned int next_cp = unchecked::peek_next(it);
+ if (unchecked::next(it) != next_cp)
+ cout << "Line " << line_count << ": Error: unchecked::peek_next gave a different result than unchecked::next" << '\n';
+ char_count++;
+ }
+ if (char_count != utf32_line.size())
+ cout << "Line " << line_count << ": Error in iterating with unchecked::next - wrong number of characters" << '\n';
+
+ adv_it = line_start;
+ utf8::unchecked::advance(adv_it, char_count);
+ if (adv_it != line_end)
+ cout << "Line " << line_count << ": Error in unchecked::advance function" << '\n';
+
+ if (string::size_type(utf8::unchecked::distance(line_start, line_end)) != char_count)
+ cout << "Line " << line_count << ": Error in unchecked::distance function" << '\n';
+
+ while (it != line_start) {
+ unchecked::previous(it);
+ char_count--;
+ }
+ if (char_count != 0)
+ cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n';
+
+ // Try utf8::unchecked::iterator
+ utf8::unchecked::iterator<string::iterator> un_u8it(line_start);
+ if (!utf32_line.empty() && *un_u8it != utf32_line.at(0))
+ cout << "Line " << line_count << ": Error in utf::unchecked::iterator * operator" << '\n';
+ if (std::distance(un_u8it, utf8::unchecked::iterator<string::iterator>(line_end)) != static_cast<int>(utf32_line.size()))
+ cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::distance - wrong number of characters" << '\n';
+
+ std::advance(un_u8it, utf32_line.size());
+ if (un_u8it != utf8::unchecked::iterator<string::iterator>(line_end))
+ cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::advance" << '\n';
+ }
+}