3 files changed, 511 insertions, 0 deletions
diff --git a/third-party/utf8cpp/test_drivers/negative/negative.cpp b/third-party/utf8cpp/test_drivers/negative/negative.cpp
new file mode 100644
index 0000000..0f1015d
--- /dev/null
+++ b/third-party/utf8cpp/test_drivers/negative/negative.cpp
@@ -0,0 +1,53 @@
+#include "../../source/utf8.h"
+using namespace utf8;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+using namespace std;
+
+const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264}; // 1-based line numbers of the test file expected to contain invalid UTF-8
+const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES) / sizeof(INVALID_LINES[0]); // one past the last entry, for use as an end iterator
+
+// Negative test driver: checks every line of the file named by argv[1] with is_valid()
+// and reports lines whose validity disagrees with INVALID_LINES. Returns 1 on usage/open errors.
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        cout << "Wrong number of arguments" << endl;
+        return 1;
+    }
+    const string test_file_path = argv[1];
+
+    // Open the test file
+    ifstream fs8(test_file_path.c_str());
+    if (!fs8.is_open()) {
+        cout << "Could not open " << test_file_path << endl;
+        return 1;
+    }
+
+    // Read it line by line. getline() fails cleanly at end of file, so a trailing newline
+    // does not produce an extra empty "line" with its own line number.
+    unsigned int line_count = 0;
+    string line;
+    while (getline(fs8, line)) {
+        line_count++;
+        // A line is expected to be valid unless its 1-based number is listed in INVALID_LINES
+        const bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END);
+        if (!is_valid(line.begin(), line.end())) {
+            // Print out lines that contain unexpected invalid UTF-8
+            if (expected_valid)
+                cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
+
+            // try fixing it: the repaired copy must validate whether or not the error was expected
+            string fixed_line;
+            replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
+            if (!is_valid(fixed_line.begin(), fixed_line.end()))
+                cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
+        }
+        else if (!expected_valid)
+            cout << "Invalid utf-8 NOT detected at line " << line_count << '\n';
+    }
+    return 0;
+}
diff --git a/third-party/utf8cpp/test_drivers/smoke_test/test.cpp b/third-party/utf8cpp/test_drivers/smoke_test/test.cpp
new file mode 100644
index 0000000..4f9fb04
--- /dev/null
+++ b/third-party/utf8cpp/test_drivers/smoke_test/test.cpp
@@ -0,0 +1,298 @@
+#include <cstring>
+#include <cassert>
+#include <vector>
+#include "../../source/utf8.h"
+using namespace utf8;
+using namespace std;
+
+// Smoke test for the utf8 library: exercises each checked API once on small fixed inputs,
+// then repeats the same sequence with the utf8::unchecked variants. Failures trip assert().
+int main()
+{
+    // append: encode one code point at the start of u (encodings never get shorter, so untouched bytes stay 0)
+    unsigned char u[5] = {0,0,0,0,0};
+
+    append(0x0448, u);
+    assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+    append(0x65e5, u);
+    assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
+
+    append(0x3044, u);
+    assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0);
+
+    append(0x10346, u);
+    assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
+
+    // next: decode one code point and move the iterator past it
+    const char* twochars = "\xe6\x97\xa5\xd1\x88"; // 3-byte + 2-byte sequence
+    const char* w = twochars;
+    int cp = next(w, twochars + 6); // end bound lies past the terminating NUL
+    assert (cp == 0x65e5);
+    assert (w == twochars + 3);
+
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 4-byte + 3-byte + 2-byte sequence
+    w = threechars;
+    cp = next(w, threechars + 9);
+    assert (cp == 0x10346);
+    assert (w == threechars + 4);
+    cp = next(w, threechars + 9);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 7);
+    cp = next(w, threechars + 9);
+    assert (cp == 0x0448);
+    assert (w == threechars + 9);
+
+    // peek_next: decode the next code point without moving the iterator
+    const char* const cw = twochars;
+    cp = peek_next(cw, cw + 6);
+    assert (cp == 0x65e5);
+    assert (cw == twochars);
+
+    // prior: step back one code point; the bound passed is the start of the range
+    w = twochars + 3;
+    cp = prior (w, twochars);
+    assert (cp == 0x65e5);
+    assert (w == twochars);
+
+    w = threechars + 9;
+    cp = prior(w, threechars);
+    assert (cp == 0x0448);
+    assert (w == threechars + 7);
+    cp = prior(w, threechars);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 4);
+    cp = prior(w, threechars);
+    assert (cp == 0x10346);
+    assert (w == threechars);
+
+    // previous (deprecated): same checks as prior, but called with a bound one before the start
+    w = twochars + 3;
+    cp = previous (w, twochars - 1);
+    assert (cp == 0x65e5);
+    assert (w == twochars);
+
+    w = threechars + 9;
+    cp = previous(w, threechars - 1);
+    assert (cp == 0x0448);
+    assert (w == threechars + 7);
+    cp = previous(w, threechars -1);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 4);
+    cp = previous(w, threechars - 1);
+    assert (cp == 0x10346);
+    assert (w == threechars);
+
+    // advance: skip two code points (3 + 2 bytes)
+    w = twochars;
+    advance (w, 2, twochars + 6);
+    assert (w == twochars + 5);
+
+    // distance: number of code points in the byte range
+    size_t dist = utf8::distance(twochars, twochars + 5);
+    assert (dist == 2);
+
+    // utf32to8 (only the first three elements are converted, not the trailing 0)
+    int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+    vector<char> utf8result;
+    utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    assert (utf8result.size() == 9);
+    // try it with the return value; the vector already holds 9 bytes, so writing in place is in bounds
+    char* utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]);
+    assert (utf8_end == &utf8result[0] + 9);
+
+    //utf8to32
+    vector<int> utf32result;
+    utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    assert (utf32result.size() == 2);
+    // try it with the return value;
+    int* utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]);
+    assert (utf32_end == &utf32result[0] + 2);
+
+    // utf16to8 (the last two units form a surrogate pair)
+    unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    utf8result.clear();
+    utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    assert (utf8result.size() == 10);
+    // try it with the return value;
+    utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]);
+    assert (utf8_end == &utf8result[0] + 10);
+
+    // utf8to16 (the 4-byte sequence must come out as the surrogate pair d834 dd1e)
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    vector <unsigned short> utf16result;
+    utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    assert (utf16result.size() == 4);
+    assert (utf16result[2] == 0xd834);
+    assert (utf16result[3] == 0xdd1e);
+    // try it with the return value;
+    unsigned short* utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
+    assert (utf16_end == &utf16result[0] + 4);
+
+    // find_invalid: must point at the stray 0xfa byte that follows two valid sequences
+    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+    assert (invalid == utf_invalid + 5);
+
+    //is_valid
+    bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+    assert (bvalid == false);
+    bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
+    assert (bvalid == true);
+
+    //starts_with_bom
+    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+    bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
+    assert (bbom == true);
+    bool no_bbom = starts_with_bom(threechars, threechars + 9); // threechars is a pointer: sizeof would give the pointer size, not the 9-byte length
+    assert (no_bbom == false);
+
+    //is_bom
+    bool unsafe_bbom = is_bom(byte_order_mark);
+    assert (unsafe_bbom == true);
+
+    // replace_invalid (sizeof counts the terminating NUL, both for the input and for the expected string)
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
+    bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    assert (bvalid);
+    const char fixed_invalid_sequence[] = "a????z";
+    assert (sizeof(fixed_invalid_sequence) == replace_invalid_result.size());
+    assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
+
+    // iterator: walk forward to the end and back to the start, checking each code point
+    utf8::iterator<const char*> it(threechars, threechars, threechars + 9);
+    utf8::iterator<const char*> it2 = it;
+    assert (it2 == it);
+    assert (*it == 0x10346);
+    assert (*(++it) == 0x65e5);
+    assert ((*it++) == 0x65e5);
+    assert (*it == 0x0448);
+    assert (it != it2);
+    utf8::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
+    assert (++it == endit);
+    assert (*(--it) == 0x0448);
+    assert ((*it--) == 0x0448);
+    assert (*it == 0x65e5);
+    assert (--it == utf8::iterator<const char*>(threechars, threechars, threechars + 9));
+    assert (*it == 0x10346);
+
+    //////////////////////////////////////////////////////////
+    //// Unchecked variants
+    //////////////////////////////////////////////////////////
+
+    // append (explicitly qualified so the unchecked overload is the one under test)
+    memset(u, 0, 5);
+    unchecked::append(0x0448, u);
+    assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+    unchecked::append(0x65e5, u);
+    assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
+
+    unchecked::append(0x10346, u);
+    assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
+
+    //next
+    w = twochars;
+    cp = unchecked::next(w);
+    assert (cp == 0x65e5);
+    assert (w == twochars + 3);
+
+    w = threechars;
+    cp = unchecked::next(w);
+    assert (cp == 0x10346);
+    assert (w == threechars + 4);
+    cp = unchecked::next(w);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 7);
+    cp = unchecked::next(w);
+    assert (cp == 0x0448);
+    assert (w == threechars + 9);
+
+    //peek_next
+    cp = unchecked::peek_next(cw);
+    assert (cp == 0x65e5);
+    assert (cw == twochars);
+
+
+    //previous (calls prior internally)
+
+    w = twochars + 3;
+    cp = unchecked::previous (w);
+    assert (cp == 0x65e5);
+    assert (w == twochars);
+
+    w = threechars + 9;
+    cp = unchecked::previous(w);
+    assert (cp == 0x0448);
+    assert (w == threechars + 7);
+    cp = unchecked::previous(w);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 4);
+    cp = unchecked::previous(w);
+    assert (cp == 0x10346);
+    assert (w == threechars);
+
+    // advance
+    w = twochars;
+    unchecked::advance (w, 2);
+    assert (w == twochars + 5);
+
+    // distance
+    dist = unchecked::distance(twochars, twochars + 5);
+    assert (dist == 2);
+
+    // utf32to8
+    utf8result.clear();
+    unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    assert (utf8result.size() == 9);
+    // try it with the return value (unchecked overload, writing over the 9 bytes just produced)
+    utf8_end = unchecked::utf32to8(utf32string, utf32string + 3, &utf8result[0]);
+    assert(utf8_end == &utf8result[0] + 9);
+
+    //utf8to32
+    utf32result.clear();
+    unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    assert (utf32result.size() == 2);
+    // try it with the return value;
+    utf32_end = unchecked::utf8to32(twochars, twochars + 5, &utf32result[0]);
+    assert (utf32_end == &utf32result[0] + 2);
+
+    //utf16to8
+    utf8result.clear();
+    unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    assert (utf8result.size() == 10);
+    // try it with the return value;
+    utf8_end = unchecked::utf16to8 (utf16string, utf16string + 5, &utf8result[0]);
+    assert (utf8_end == &utf8result[0] + 10);
+
+    //utf8to16
+    utf16result.clear();
+    unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    assert (utf16result.size() == 4);
+    assert (utf16result[2] == 0xd834);
+    assert (utf16result[3] == 0xdd1e);
+    // try it with the return value;
+    utf16_end = unchecked::utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
+    assert (utf16_end == &utf16result[0] + 4);
+
+    // iterator (the unchecked iterator is constructed from a position only, without range bounds)
+    utf8::unchecked::iterator<const char*> un_it(threechars);
+    utf8::unchecked::iterator<const char*> un_it2 = un_it;
+    assert (un_it2 == un_it);
+    assert (*un_it == 0x10346);
+    assert (*(++un_it) == 0x65e5);
+    assert ((*un_it++) == 0x65e5);
+    assert (un_it != un_it2);
+    assert (*un_it == 0x0448);
+    utf8::unchecked::iterator<const char*> un_endit (threechars + 9);
+    assert (++un_it == un_endit);
+    assert (*(--un_it) == 0x0448);
+    assert ((*un_it--) == 0x0448);
+    assert (*un_it == 0x65e5);
+    assert (--un_it == utf8::unchecked::iterator<const char*>(threechars));
+    assert (*un_it == 0x10346);
+}
+
+
diff --git a/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp b/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp
new file mode 100644
index 0000000..c88a5ee
--- /dev/null
+++ b/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp
@@ -0,0 +1,160 @@
+#include "../../source/utf8.h"
+using namespace utf8;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+using namespace std;
+
+// Round-trip test driver: reads the file named by argv[1] line by line and cross-checks the
+// checked and unchecked utf8 APIs on the valid prefix of each line. Returns 1 on usage/open errors.
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        cout << "\nUsage: utfreader filename\n";
+        return 1;
+    }
+    const char* TEST_FILE_PATH = argv[1];
+    // Open the test file
+    ifstream fs8(TEST_FILE_PATH);
+    if (!fs8.is_open()) {
+        cout << "Could not open " << TEST_FILE_PATH << endl;
+        return 1;
+    }
+
+    // Read it line by line; getline() ends the loop at end of file without a phantom empty last line
+    unsigned int line_count = 0;
+    string line;
+    while (getline(fs8, line)) {
+        line_count++;
+        // Play around with each line and convert it to utf16
+        string::iterator line_start = line.begin();
+        string::iterator line_end   = line.end();
+        // Everything below works on [line_start, line_end), cut at the first invalid byte if there is one
+        line_end = find_invalid(line_start, line_end);
+        if (line_end != line.end())
+            cout << "Line " << line_count << ": Invalid utf-8 at byte " << int(line_end - line_start) << '\n'; // 0-based offset from the start of the line
+
+        // Convert it to utf-16
+        vector<unsigned short> utf16_line;
+        utf8to16(line_start, line_end, back_inserter(utf16_line));
+
+        // Back to utf-8 and compare it to the original line.
+        string back_to_utf8;
+        utf16to8(utf16_line.begin(), utf16_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Conversion to UTF-16 and back failed" << '\n';
+
+        // Now, convert it to utf-32, back to utf-8 and compare
+        vector <unsigned> utf32_line;
+        utf8to32(line_start, line_end, back_inserter(utf32_line));
+        back_to_utf8.clear();
+        utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Conversion to UTF-32 and back failed" << '\n';
+
+        // Now, iterate and back
+        unsigned char_count = 0;
+        string::iterator it = line_start;
+        while (it != line_end) {
+            unsigned int next_cp = peek_next(it, line_end);
+            if (next(it, line_end) != next_cp)
+                cout << "Line " << line_count << ": Error: peek_next gave a different result than next" << '\n';
+            char_count++;
+        }
+        if (char_count != utf32_line.size())
+            cout << "Line " << line_count << ": Error in iterating with next - wrong number of characters" << '\n';
+
+        string::iterator adv_it = line_start;
+        utf8::advance(adv_it, char_count, line_end);
+        if (adv_it != line_end)
+            cout << "Line " << line_count << ": Error in advance function" << '\n';
+
+        if (string::size_type(utf8::distance(line_start, line_end)) != char_count)
+            cout << "Line " << line_count << ": Error in distance function" << '\n';
+
+        // "it" sits at line_end here; walk it back to the start, undoing the count
+        while (it != line_start) {
+            previous(it, line.rend().base());
+            char_count--;
+        }
+        if (char_count != 0)
+            cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n';
+
+        // Try utf8::iterator
+        utf8::iterator<string::iterator> u8it(line_start, line_start, line_end);
+        if (!utf32_line.empty() && *u8it != utf32_line.at(0))
+          cout << "Line " << line_count << ": Error in utf::iterator * operator" << '\n';
+        if (std::distance(u8it, utf8::iterator<string::iterator>(line_end, line_start, line_end)) != static_cast<int>(utf32_line.size()))
+          cout << "Line " << line_count << ": Error in using utf::iterator with std::distance - wrong number of characters" << '\n';
+
+        std::advance(u8it, utf32_line.size());
+        if (u8it != utf8::iterator<string::iterator>(line_end, line_start, line_end))
+          cout << "Line " << line_count << ": Error in using utf::iterator with std::advance" << '\n';
+
+
+        //======================== Now, the unchecked versions ======================
+        // Convert it to utf-16 and compare to the checked version
+        vector<unsigned short> utf16_line_unchecked;
+        unchecked::utf8to16(line_start, line_end, back_inserter(utf16_line_unchecked));
+
+        if (utf16_line != utf16_line_unchecked)
+            cout << "Line " << line_count << ": Error in unchecked::utf8to16" << '\n';
+
+        // Back to utf-8 and compare it to the original line.
+        back_to_utf8.clear();
+        unchecked::utf16to8(utf16_line_unchecked.begin(), utf16_line_unchecked.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Unchecked conversion to UTF-16 and back failed" << '\n';
+
+        // Now, convert it to utf-32, back to utf-8 and compare
+        vector <unsigned> utf32_line_unchecked;
+        unchecked::utf8to32(line_start, line_end, back_inserter(utf32_line_unchecked));
+        if (utf32_line != utf32_line_unchecked)
+            cout << "Line " << line_count << ": Error in unchecked::utf8to32" << '\n';
+
+        back_to_utf8.clear();
+        unchecked::utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Unchecked conversion to UTF-32 and back failed" << '\n';
+
+        // Now, iterate and back
+        char_count = 0;
+        it = line_start;
+        while (it != line_end) {
+            unsigned int next_cp = unchecked::peek_next(it);
+            if (unchecked::next(it) != next_cp)
+              cout << "Line " << line_count << ": Error: unchecked::peek_next gave a different result than unchecked::next" << '\n';
+            char_count++;
+        }
+        if (char_count != utf32_line.size())
+            cout << "Line " << line_count << ": Error in iterating with unchecked::next - wrong number of characters" << '\n';
+
+        adv_it = line_start;
+        utf8::unchecked::advance(adv_it, char_count);
+        if (adv_it != line_end)
+            cout << "Line " << line_count << ": Error in unchecked::advance function" << '\n';
+
+        if (string::size_type(utf8::unchecked::distance(line_start, line_end)) != char_count)
+            cout << "Line " << line_count << ": Error in unchecked::distance function" << '\n';
+
+        while (it != line_start) {
+            unchecked::previous(it);
+            char_count--;
+        }
+        if (char_count != 0)
+            cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n';
+
+        // Try utf8::unchecked::iterator
+        utf8::unchecked::iterator<string::iterator> un_u8it(line_start);
+        if (!utf32_line.empty() && *un_u8it != utf32_line.at(0))
+          cout << "Line " << line_count << ": Error in utf::unchecked::iterator * operator" << '\n';
+        if (std::distance(un_u8it, utf8::unchecked::iterator<string::iterator>(line_end)) != static_cast<int>(utf32_line.size()))
+          cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::distance - wrong number of characters" << '\n';
+
+        std::advance(un_u8it, utf32_line.size());
+        if (un_u8it != utf8::unchecked::iterator<string::iterator>(line_end))
+          cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::advance" << '\n';
+    }
+    return 0;
+}