1 files changed, 175 insertions, 0 deletions
diff --git a/ml/dlib/dlib/unicode/unicode.cpp b/ml/dlib/dlib/unicode/unicode.cpp
new file mode 100644
index 000000000..2facc919c
--- /dev/null
+++ b/ml/dlib/dlib/unicode/unicode.cpp
@@ -0,0 +1,175 @@
+// Copyright (C) 2008 Keita Mochizuki, Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_UNICODe_CPp_
+#define DLIB_UNICODe_CPp_
+#include "unicode.h"
+#include <cwchar>
+#include "../string.h"
+#include <vector>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    static const unichar SURROGATE_FIRST_TOP = 0xD800;      // base OR-ed into the first unit of a surrogate pair
+    static const unichar SURROGATE_SECOND_TOP = 0xDC00;     // base OR-ed into the second unit of a surrogate pair
+    static const unichar SURROGATE_CLEARING_MASK = 0x03FF;  // keeps only the low 10 bits of a value
+    static const unichar SURROGATE_TOP = SURROGATE_FIRST_TOP; // inclusive lower bound tested by is_surrogate()
+    static const unichar SURROGATE_END = 0xE000;            // exclusive upper bound tested by is_surrogate()
+    static const unichar SMP_TOP = 0x10000;                 // code points at or above this are split into two units
+    static const int VALID_BITS = 10;                       // shift applied to the upper half of a split code point
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename T> bool is_surrogate(T ch)
+    {
+        const unichar code = zero_extend_cast<unichar>(ch);    // widen ch once, then range-test it
+        return SURROGATE_TOP <= code && code < SURROGATE_END;  // true for [SURROGATE_TOP, SURROGATE_END)
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Combines a surrogate pair (first = 110110xxxxxxxxxx, second = 110111xxxxxxxxxx) into one code point.
+    // The 20 payload bits are assembled first and SMP_TOP is added last, so the carry into bit 16 is kept.
+    template <typename T> unichar surrogate_pair_to_unichar(T first, T second)
+    {
+        return (((first & SURROGATE_CLEARING_MASK) << VALID_BITS) | (second & SURROGATE_CLEARING_MASK)) + SMP_TOP;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    void unichar_to_surrogate_pair(unichar input, unichar &first, unichar &second)
+    {   // Splits input into two units: first gets the bits above VALID_BITS, second the low 10 bits.
+        first = SURROGATE_FIRST_TOP | ((input - SMP_TOP) >> VALID_BITS);
+        second = SURROGATE_SECOND_TOP | (input & SURROGATE_CLEARING_MASK);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Converts the src_len wchar_t units at src into dest; specialized on sizeof(wchar_t) (2 or 4).
+    template <int N> void wstr2ustring_t(const wchar_t *src, size_t src_len, ustring &dest);
+    template <> void wstr2ustring_t<4>(const wchar_t *src, size_t src_len, ustring &dest)
+    {   // Each wchar_t is reinterpreted as one unichar; exactly src_len are copied, so an embedded L'\0' is kept.
+        dest.assign(reinterpret_cast<const unichar *>(src), src_len);
+    }
+
+    template <> void wstr2ustring_t<2>(const wchar_t *src, size_t src_len, ustring &dest)
+    {
+        // 2-byte wchar_t: src holds UTF-16 units. A unit in [SURROGATE_FIRST_TOP, SURROGATE_SECOND_TOP)
+        // is merged with the next unit only when that unit exists and lies in
+        // [SURROGATE_SECOND_TOP, SURROGATE_END); every other unit is copied unchanged, so an
+        // unpaired surrogate neither swallows its neighbour nor causes a read of src[src_len].
+        dest.resize(src_len);   // upper bound; each merged pair shortens the result by one
+        size_t out = 0;
+        for (size_t in = 0; in < src_len; ++in, ++out)
+        {
+            dest[out] = zero_extend_cast<unichar>(src[in]);
+            if (dest[out] < SURROGATE_FIRST_TOP || dest[out] >= SURROGATE_SECOND_TOP || in + 1 >= src_len)
+                continue;   // not a leading unit, or nothing follows it
+            const unichar trail = zero_extend_cast<unichar>(src[in+1]);
+            if (trail < SURROGATE_SECOND_TOP || trail >= SURROGATE_END)
+                continue;   // leading unit without a trailing unit: kept as is
+            dest[out] = surrogate_pair_to_unichar(src[in], src[in+1]);
+            ++in;           // the trailing unit was consumed as well
+        }
+        dest.resize(out);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const ustring convert_wstring_to_utf32(const std::wstring &src)
+    {
+        ustring utf32;  // filled in by the wstr2ustring_t specialization chosen by sizeof(wchar_t)
+        wstr2ustring_t<sizeof(wchar_t)>(src.c_str(), src.length(), utf32);
+        return utf32;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <int N> struct ustring2wstr
+    {   // Empty primary template; the members live in the <2> and <4> specializations that follow.
+    };
+
+    // for the environment of sizeof(wchar_t) == 2 (e.g. Win32)
+    // Holds a heap-allocated, L'\0'-terminated wchar_t copy of a ustring; wlen excludes the terminator.
+    template <> struct ustring2wstr<2>
+    {
+        wchar_t *wstr;
+        size_t wlen;
+        ustring2wstr(const ustring &src) : wstr(0), wlen(0)
+        {
+            // First pass: values below SMP_TOP take one unit, everything else takes two.
+            for (size_t i = 0; i < src.length(); ++i)
+                wlen += (src[i] < SMP_TOP) ? 1 : 2;
+            wstr = new wchar_t[wlen+1];
+            wstr[wlen] = L'\0';
+
+            // Second pass: write the units counted above.
+            size_t wi = 0;
+            for (size_t i = 0; i < src.length(); ++i)
+            {
+                if (src[i] < SMP_TOP)
+                {
+                    wstr[wi++] = static_cast<wchar_t>(src[i]);
+                    continue;
+                }
+                unichar high, low;
+                unichar_to_surrogate_pair(src[i], high, low);
+                wstr[wi++] = static_cast<wchar_t>(high);
+                wstr[wi++] = static_cast<wchar_t>(low);
+            }
+        }
+        ~ustring2wstr() { delete[] wstr; }
+    private:
+        // Not copyable: a copy would share wstr and delete[] it twice (declared, never defined).
+        ustring2wstr(const ustring2wstr &);
+        ustring2wstr &operator=(const ustring2wstr &);
+    };
+
+    // for the environment of sizeof(wchar_t) == 4 (e.g. Unix gcc)
+    template <> struct ustring2wstr<4>
+    {
+        const wchar_t *wstr;    // points at src's own buffer; nothing is allocated or freed by this struct
+        size_t wlen;            // taken from src.size()
+        ustring2wstr(const ustring &src)
+            : wstr(reinterpret_cast<const wchar_t *>(src.c_str())),
+              wlen(src.size())
+        {}
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    const std::wstring convert_utf32_to_wstring(const ustring &src)
+    {
+        ustring2wstr<sizeof(wchar_t)> conv(src);   // src expressed as wchar_t units for this wchar_t width
+        // Pointer + length (not a scan for the terminator) so an embedded L'\0' does not truncate the result.
+        return std::wstring(conv.wstr, conv.wlen);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::wstring convert_mbstring_to_wstring(const std::string &src)
+    {
+        std::vector<wchar_t> buffer(src.length() + 5, L'\0');  // zero-filled and longer than the limit passed below
+        std::mbstowcs(&buffer.front(), src.c_str(), src.length() + 1);
+        return std::wstring(&buffer.front());   // stops at the first L'\0' in the buffer
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::string convert_wstring_to_mbstring(const std::wstring &src)
+    {
+        using namespace std;    // makes std::wcstombs reachable by the unqualified call below
+        const size_t capacity = (src.length() + 1) * MB_CUR_MAX;   // MB_CUR_MAX bytes per character, plus one extra
+        std::string bytes(capacity, '\0');
+        wcstombs(&bytes[0], src.c_str(), capacity);
+        return std::string(bytes.c_str());   // keep only what precedes the first '\0'
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_UNICODe_CPp_
+