diff options
Diffstat (limited to 'ml/dlib/dlib/unicode/unicode.cpp')
-rw-r--r-- | ml/dlib/dlib/unicode/unicode.cpp | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/ml/dlib/dlib/unicode/unicode.cpp b/ml/dlib/dlib/unicode/unicode.cpp new file mode 100644 index 000000000..2facc919c --- /dev/null +++ b/ml/dlib/dlib/unicode/unicode.cpp @@ -0,0 +1,175 @@ +// Copyright (C) 2008 Keita Mochizuki, Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_UNICODe_CPp_ +#define DLIB_UNICODe_CPp_ +#include "unicode.h" +#include <cwchar> +#include "../string.h" +#include <vector> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + static const unichar SURROGATE_FIRST_TOP = 0xD800; + static const unichar SURROGATE_SECOND_TOP = 0xDC00; + static const unichar SURROGATE_CLEARING_MASK = 0x03FF; + static const unichar SURROGATE_TOP = SURROGATE_FIRST_TOP; + static const unichar SURROGATE_END = 0xE000; + static const unichar SMP_TOP = 0x10000; + static const int VALID_BITS = 10; + +// ---------------------------------------------------------------------------------------- + + template <typename T> bool is_surrogate(T ch) + { + return (zero_extend_cast<unichar>(ch) >= SURROGATE_TOP && + zero_extend_cast<unichar>(ch) < SURROGATE_END); + } + +// ---------------------------------------------------------------------------------------- + + template <typename T> unichar surrogate_pair_to_unichar(T first, T second) + { + return ((first & SURROGATE_CLEARING_MASK) << VALID_BITS) | ((second & SURROGATE_CLEARING_MASK) + SMP_TOP); + } + //110110 0000000000 + //110111 0000000000 + +// ---------------------------------------------------------------------------------------- + + void unichar_to_surrogate_pair(unichar input, unichar &first, unichar &second) + { + first = ((input - SMP_TOP) >> VALID_BITS) | SURROGATE_FIRST_TOP; + second = (input & SURROGATE_CLEARING_MASK) | SURROGATE_SECOND_TOP; + } + +// ---------------------------------------------------------------------------------------- + + template <int N> void wstr2ustring_t(const wchar_t *src, size_t src_len, ustring &dest); + + template <> void wstr2ustring_t<4>(const wchar_t *src, size_t , ustring &dest) + { + dest.assign((const unichar *)(src)); + } + + template <> void wstr2ustring_t<2>(const wchar_t *src, size_t src_len, ustring &dest) + { + size_t wlen = 0; + for (size_t i = 0; i < src_len; i++) + { + is_surrogate(src[i]) ? i++, wlen++ : wlen++; + } + dest.resize(wlen); + for (size_t i = 0, ii = 0; ii < src_len; ++i) + { + if (is_surrogate(src[ii])) + { + dest[i] = surrogate_pair_to_unichar(src[ii], src[ii+1]); + ii += 2; + }else + { + dest[i] = zero_extend_cast<unichar>(src[ii]); + ii++; + } + } + } + +// ---------------------------------------------------------------------------------------- + + const ustring convert_wstring_to_utf32(const std::wstring &src) + { + ustring dest; + wstr2ustring_t<sizeof(wchar_t)>(src.c_str(), src.size(), dest); + return dest; + } + +// ---------------------------------------------------------------------------------------- + + template <int N> struct ustring2wstr + { + }; + + // for the environment of sizeof(wchar_t) == 2 (i.e. Win32) + template <> struct ustring2wstr<2> + { + wchar_t *wstr; + size_t wlen; + ustring2wstr(const ustring &src){ + wlen = 0; + for (size_t i = 0; i < src.length(); ++i) + { + if (src[i] < SMP_TOP) wlen++; + else wlen += 2; + } + wstr = new wchar_t[wlen+1]; + wstr[wlen] = L'\0'; + + size_t wi = 0; + for (size_t i = 0; i < src.length(); ++i) + { + if (src[i] < SMP_TOP) + { + wstr[wi++] = (wchar_t)src[i]; + }else + { + unichar high, low; + unichar_to_surrogate_pair(src[i], high, low); + wstr[wi++] = (wchar_t)high; + wstr[wi++] = (wchar_t)low; + } + } + } + ~ustring2wstr() + { + delete[] wstr; + } + }; + + // for the environment of sizeof(wchar_t) == 4 (i.e. Unix gcc) + template <> struct ustring2wstr<4> + { + const wchar_t *wstr; + size_t wlen; + ustring2wstr(const ustring &src){ + wstr = (const wchar_t *)(src.c_str()); + wlen = src.size(); + } + }; + +// ---------------------------------------------------------------------------------------- + + const std::wstring convert_utf32_to_wstring(const ustring &src) + { + ustring2wstr<sizeof(wchar_t)> conv(src); + std::wstring dest(conv.wstr); + return dest; + } + +// ---------------------------------------------------------------------------------------- + + const std::wstring convert_mbstring_to_wstring(const std::string &src) + { + std::vector<wchar_t> wstr(src.length()+5); + std::mbstowcs(&wstr[0], src.c_str(), src.length()+1); + return std::wstring(&wstr[0]); + } + +// ---------------------------------------------------------------------------------------- + + const std::string convert_wstring_to_mbstring(const std::wstring &src) + { + using namespace std; + std::string str; + str.resize((src.length() + 1) * MB_CUR_MAX); + wcstombs(&str[0], src.c_str(), str.size()); + return std::string(&str[0]); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_UNICODe_CPp_ + |