diff options
Diffstat (limited to 'ml/dlib/dlib/unicode')
-rw-r--r-- | ml/dlib/dlib/unicode/unicode.cpp | 175 | ||||
-rw-r--r-- | ml/dlib/dlib/unicode/unicode.h | 622 | ||||
-rw-r--r-- | ml/dlib/dlib/unicode/unicode_abstract.h | 233 |
3 files changed, 1030 insertions, 0 deletions
diff --git a/ml/dlib/dlib/unicode/unicode.cpp b/ml/dlib/dlib/unicode/unicode.cpp new file mode 100644 index 000000000..2facc919c --- /dev/null +++ b/ml/dlib/dlib/unicode/unicode.cpp @@ -0,0 +1,175 @@ +// Copyright (C) 2008 Keita Mochizuki, Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_UNICODe_CPp_ +#define DLIB_UNICODe_CPp_ +#include "unicode.h" +#include <cwchar> +#include "../string.h" +#include <vector> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + static const unichar SURROGATE_FIRST_TOP = 0xD800; + static const unichar SURROGATE_SECOND_TOP = 0xDC00; + static const unichar SURROGATE_CLEARING_MASK = 0x03FF; + static const unichar SURROGATE_TOP = SURROGATE_FIRST_TOP; + static const unichar SURROGATE_END = 0xE000; + static const unichar SMP_TOP = 0x10000; + static const int VALID_BITS = 10; + +// ---------------------------------------------------------------------------------------- + + template <typename T> bool is_surrogate(T ch) + { + return (zero_extend_cast<unichar>(ch) >= SURROGATE_TOP && + zero_extend_cast<unichar>(ch) < SURROGATE_END); + } + +// ---------------------------------------------------------------------------------------- + + template <typename T> unichar surrogate_pair_to_unichar(T first, T second) + { + return ((first & SURROGATE_CLEARING_MASK) << VALID_BITS) | ((second & SURROGATE_CLEARING_MASK) + SMP_TOP); + } + //110110 0000000000 + //110111 0000000000 + +// ---------------------------------------------------------------------------------------- + + void unichar_to_surrogate_pair(unichar input, unichar &first, unichar &second) + { + first = ((input - SMP_TOP) >> VALID_BITS) | SURROGATE_FIRST_TOP; + second = (input & SURROGATE_CLEARING_MASK) | SURROGATE_SECOND_TOP; + } + +// ---------------------------------------------------------------------------------------- + + template <int N> void 
wstr2ustring_t(const wchar_t *src, size_t src_len, ustring &dest); + + template <> void wstr2ustring_t<4>(const wchar_t *src, size_t , ustring &dest) + { + dest.assign((const unichar *)(src)); + } + + template <> void wstr2ustring_t<2>(const wchar_t *src, size_t src_len, ustring &dest) + { + size_t wlen = 0; + for (size_t i = 0; i < src_len; i++) + { + is_surrogate(src[i]) ? i++, wlen++ : wlen++; + } + dest.resize(wlen); + for (size_t i = 0, ii = 0; ii < src_len; ++i) + { + if (is_surrogate(src[ii])) + { + dest[i] = surrogate_pair_to_unichar(src[ii], src[ii+1]); + ii += 2; + }else + { + dest[i] = zero_extend_cast<unichar>(src[ii]); + ii++; + } + } + } + +// ---------------------------------------------------------------------------------------- + + const ustring convert_wstring_to_utf32(const std::wstring &src) + { + ustring dest; + wstr2ustring_t<sizeof(wchar_t)>(src.c_str(), src.size(), dest); + return dest; + } + +// ---------------------------------------------------------------------------------------- + + template <int N> struct ustring2wstr + { + }; + + // for the environment of sizeof(wchar_t) == 2 (i.e. Win32) + template <> struct ustring2wstr<2> + { + wchar_t *wstr; + size_t wlen; + ustring2wstr(const ustring &src){ + wlen = 0; + for (size_t i = 0; i < src.length(); ++i) + { + if (src[i] < SMP_TOP) wlen++; + else wlen += 2; + } + wstr = new wchar_t[wlen+1]; + wstr[wlen] = L'\0'; + + size_t wi = 0; + for (size_t i = 0; i < src.length(); ++i) + { + if (src[i] < SMP_TOP) + { + wstr[wi++] = (wchar_t)src[i]; + }else + { + unichar high, low; + unichar_to_surrogate_pair(src[i], high, low); + wstr[wi++] = (wchar_t)high; + wstr[wi++] = (wchar_t)low; + } + } + } + ~ustring2wstr() + { + delete[] wstr; + } + }; + + // for the environment of sizeof(wchar_t) == 4 (i.e. 
Unix gcc) + template <> struct ustring2wstr<4> + { + const wchar_t *wstr; + size_t wlen; + ustring2wstr(const ustring &src){ + wstr = (const wchar_t *)(src.c_str()); + wlen = src.size(); + } + }; + +// ---------------------------------------------------------------------------------------- + + const std::wstring convert_utf32_to_wstring(const ustring &src) + { + ustring2wstr<sizeof(wchar_t)> conv(src); + std::wstring dest(conv.wstr); + return dest; + } + +// ---------------------------------------------------------------------------------------- + + const std::wstring convert_mbstring_to_wstring(const std::string &src) + { + std::vector<wchar_t> wstr(src.length()+5); + std::mbstowcs(&wstr[0], src.c_str(), src.length()+1); + return std::wstring(&wstr[0]); + } + +// ---------------------------------------------------------------------------------------- + + const std::string convert_wstring_to_mbstring(const std::wstring &src) + { + using namespace std; + std::string str; + str.resize((src.length() + 1) * MB_CUR_MAX); + wcstombs(&str[0], src.c_str(), str.size()); + return std::string(&str[0]); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_UNICODe_CPp_ + diff --git a/ml/dlib/dlib/unicode/unicode.h b/ml/dlib/dlib/unicode/unicode.h new file mode 100644 index 000000000..d7510e34a --- /dev/null +++ b/ml/dlib/dlib/unicode/unicode.h @@ -0,0 +1,622 @@ +// Copyright (C) 2007 Davis E. King (davis@dlib.net), and Nils Labugt +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_UNICODe_H_ +#define DLIB_UNICODe_H_ + +#include "../uintn.h" +#include "../algs.h" +#include "unicode_abstract.h" +#include <string> +#include <cstring> + +#include <fstream> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + typedef uint32 unichar; + +#if defined(__GNUC__) && __GNUC__ < 4 && __GNUC_MINOR__ < 4 + struct unichar_traits + { + typedef dlib::unichar char_type; + typedef dlib::unichar int_type; + typedef std::streamoff off_type; + typedef std::streampos pos_type; + typedef std::mbstate_t state_type; + + static void assign(char_type& c1, const char_type& c2) { c1 = c2; } + static bool eq(const char_type& c1, const char_type& c2) { return c1 == c2; } + static bool lt(const char_type& c1, const char_type& c2) { return c1 < c2; } + static int compare(const char_type* s1, const char_type* s2, size_t n) + { + for (size_t i = 0; i < n; ++i) + { + if (s1[i] < s2[i]) + return -1; + else if (s1[i] > s2[i]) + return 1; + } + return 0; + } + + static size_t length(const char_type* s) + { + size_t i = 0; + while (s[i] != 0) + ++i; + return i; + } + + static const char_type* find(const char_type* s, size_t n, + const char_type& a) + { + for (size_t i = 0; i < n; ++i) + { + if (s[i] == a) + { + return s+i; + } + } + return 0; + } + + static char_type* move(char_type* s1, const char_type* s2, size_t n) + { + return static_cast<char_type*>(std::memmove(s1, s2, sizeof(char_type)*n)); + } + + static char_type* copy(char_type* s1, const char_type* s2, size_t n) + { + for (size_t i = 0; i < n; ++i) + s1[i] = s2[i]; + + return s1; + } + + static char_type* assign(char_type* s, size_t n, char_type a) + { + for (size_t i = 0; i < n; ++i) + s[i] = a; + + return s; + } + + + static int_type not_eof(const int_type& c) + { + if (!eq_int_type(c,eof())) + return to_int_type(c); + else + return 0; + } + + static char_type to_char_type(const int_type& c) { return static_cast<char_type>(c); } + static 
int_type to_int_type(const char_type& c) { return zero_extend_cast<int_type>(c); } + + static bool eq_int_type(const int_type& c1, const int_type& c2) { return c1 == c2; } + + static int_type eof() { return static_cast<int_type>(EOF); } + }; + + typedef std::basic_string<unichar, unichar_traits> ustring; +#else + typedef std::basic_string<unichar> ustring; +#endif + +// ---------------------------------------------------------------------------------------- + + namespace unicode_helpers + { + + template < + typename charT + > + int u8_to_u32( + charT& result, + std::istream& in + ) + /*! + ensures + - if (there just wasn't any more data and we hit EOF) then + - returns 0 + - else if (we decoded another character without error) then + - #result == the decoded character + - returns the number of bytes consumed to make this character + - else + - some error occurred + - returns -1 + !*/ + { + int val = in.get(); + if (val == EOF) + return 0; + + unichar ch[4]; + ch[0] = zero_extend_cast<unichar>(val); + if ( ch[0] < 0x80 ) + { + result = static_cast<charT>(ch[0]); + return 1; + } + if ( ( ch[0] & ~0x3F ) == 0x80 ) + { + // invalid leading byte + return -1; + } + if ( ( ch[0] & ~0x1F ) == 0xC0 ) + { + val = in.get(); + if ( val == EOF ) + return -1; + + ch[1] = zero_extend_cast<unichar>(val); + if ( ( ch[1] & ~0x3F ) != 0x80 ) + return -1; // invalid tail + if ( ( ch[0] & ~0x01 ) == 0xC0 ) + return -1; // overlong form + ch[0] &= 0x1F; + ch[1] &= 0x3F; + result = static_cast<charT>(( ch[0] << 6 ) | ch[1]); + return 2; + } + if ( ( ch[0] & ~0x0F ) == 0xE0 ) + { + for ( unsigned n = 1;n < 3;n++ ) + { + val = in.get(); + if ( val == EOF ) + return -1; + ch[n] = zero_extend_cast<unichar>(val); + if ( ( ch[n] & ~0x3F ) != 0x80 ) + return -1; // invalid tail + ch[n] &= 0x3F; + } + ch[0] &= 0x0F; + result = static_cast<charT>(( ch[0] << 12 ) | ( ch[1] << 6 ) | ch[2]); + if ( result < 0x0800 ) + return -1; // overlong form + if ( result >= 0xD800 && result < 0xE000 ) + return 
-1; // invalid character (UTF-16 surrogate pairs) + if ( result >= 0xFDD0 && result <= 0xFDEF ) + return -1; // noncharacter + if ( result >= 0xFFFE ) + return -1; // noncharacter + return 3; + } + if ( ( ch[0] & ~0x07 ) == 0xF0 ) + { + for ( unsigned n = 1;n < 4;n++ ) + { + val = in.get(); + if ( val == EOF ) + return -1; + ch[n] = zero_extend_cast<unichar>(val); + if ( ( ch[n] & ~0x3F ) != 0x80 ) + return -1; // invalid tail + ch[n] &= 0x3F; + } + if ( ( ch[0] ^ 0xF6 ) < 4 ) + return -1; + ch[0] &= 0x07; + result = static_cast<charT>(( ch[0] << 18 ) | ( ch[1] << 12 ) | ( ch[2] << 6 ) | ch[3]); + if ( result < 0x10000 ) + return -1; // overlong form + if ( (result & 0xFFFF) >= 0xFFFE ) + return -1; // noncharacter + return 4; + } + return -1; + } + + // ------------------------------------------------------------------------------------ + + template <typename charT> + class basic_utf8_streambuf : public std::basic_streambuf<charT> + { + public: + basic_utf8_streambuf ( + std::ifstream& fin_ + ) : + fin(fin_) + { + this->setg(in_buffer+max_putback, + in_buffer+max_putback, + in_buffer+max_putback); + } + + protected: + + typedef typename std::basic_streambuf<charT>::int_type int_type; + + // input functions + int_type underflow( + ) + { + if (this->gptr() < this->egptr()) + { + return zero_extend_cast<int_type>(*this->gptr()); + } + + int num_put_back = static_cast<int>(this->gptr() - this->eback()); + if (num_put_back > max_putback) + { + num_put_back = max_putback; + } + + // copy the putback characters into the putback end of the in_buffer + std::memmove(in_buffer+(max_putback-num_put_back), this->gptr()-num_put_back, num_put_back); + + + // fill the buffer with characters + int n = in_buffer_size-max_putback; + int i; + for (i = 0; i < n; ++i) + { + charT ch; + if (unicode_helpers::u8_to_u32(ch,fin) > 0) + { + (in_buffer+max_putback)[i] = ch; + } + else + { + break; + } + } + + if (i == 0) + { + // an error occurred or we hit EOF + return EOF; + } + + // reset 
in_buffer pointers + this->setg (in_buffer+(max_putback-num_put_back), + in_buffer+max_putback, + in_buffer+max_putback+i); + + return zero_extend_cast<int_type>(*this->gptr()); + } + + private: + std::ifstream& fin; + static const int max_putback = 4; + static const int in_buffer_size = 10; + charT in_buffer[in_buffer_size]; + }; + } + +// ---------------------------------------------------------------------------------------- +#if defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#endif + template <typename T> + bool is_combining_char( + const T ch_ + ) + { + const unichar ch = zero_extend_cast<unichar>(ch_); + if ( ch < 0x300 ) return false; + if ( ch < 0x370 ) return true; + + if ( ch < 0x800 ) + { + if ( ch < 0x483 )return false;if ( ch < 0x48A )return true; + + if ( ch < 0x591 )return false;if ( ch < 0x5D0 ) + { + if ( ch == 0x5C0 )return false; + if ( ch == 0x5C3 )return false; + if ( ch == 0x5C6 )return false; + return true; + } + if ( ch < 0x610 )return false;if ( ch < 0x616 )return true; + if ( ch < 0x64B )return false;if ( ch < 0x660 )return true; + + if ( ch == 0x670 )return true; + + if ( ch < 0x6D6 )return false;if ( ch < 0x6EE ) + { + if ( ch == 0x6DD )return false; + if ( ch == 0x6E5 )return false; + if ( ch == 0x6E6 )return false; + if ( ch == 0x6E9 )return false; + return true; + } + if ( ch == 0x711 )return true; + + if ( ch < 0x730 )return false;if ( ch < 0x74B )return true; + if ( ch < 0x7A6 )return false;if ( ch < 0x7B1 )return true; + if ( ch < 0x7EB )return false;if ( ch < 0x7F4 )return true; + return false; + } + if ( ch < 0xA00 ) + { + if ( ch < 0x901 )return false;if ( ch < 0x904 )return true; + if ( ch < 0x93C )return false;if ( ch < 0x955 ) + { + if ( ch == 0x93D )return false; + if ( ch == 0x950 )return false; + return true; + } + if ( ch < 0x962 )return false;if ( ch < 0x964 )return true; + if ( ch < 0x981 )return false;if ( ch < 0x984 )return true; + if ( ch < 
0x9BC )return false;if ( ch < 0x9D8 ) + { + if ( ch == 0x9BD )return false; + if ( ch == 0x9CE )return false; + return true; + } + if ( ch < 0x9E2 )return false;if ( ch < 0x9E4 )return true; + return false; + } + if ( ch < 0xC00 ) + { + if ( ch < 0xA01 )return false;if ( ch < 0xA04 )return true; + if ( ch < 0xA3C )return false;if ( ch < 0xA4E )return true; + if ( ch < 0xA70 )return false;if ( ch < 0xA72 )return true; + if ( ch < 0xA81 )return false;if ( ch < 0xA84 )return true; + if ( ch < 0xABC )return false;if ( ch < 0xACE ) + { + if ( ch == 0xABD )return false; + return true; + } + if ( ch < 0xAE2 )return false;if ( ch < 0xAE4 )return true; + if ( ch < 0xB01 )return false;if ( ch < 0xB04 )return true; + if ( ch < 0xB3C )return false;if ( ch < 0xB58 ) + { + if ( ch == 0xB3D )return false; + return true; + } + if ( ch == 0xB82 )return true; + + if ( ch < 0xBBE )return false;if ( ch < 0xBD8 )return true; + + if ( ch == 0xBF4 )return true; + if ( ch == 0xBF8 )return true; + return false; + } + if(ch < 0xE00) + { + if ( ch < 0xC01 )return false;if ( ch < 0xC04 )return true; + if ( ch < 0xC3E )return false;if ( ch < 0xC57 )return true; + if ( ch < 0xC82 )return false;if ( ch < 0xC84 )return true; + if ( ch < 0xCBC )return false;if ( ch < 0xCD7 ) + { + if ( ch == 0xCBD )return false; + return true; + } + if ( ch < 0xCE2 )return false;if ( ch < 0xCE4 )return true; + if ( ch < 0xD02 )return false;if ( ch < 0xD04 )return true; + if ( ch < 0xD3E )return false;if ( ch < 0xD58 )return true; + if ( ch < 0xD82 )return false;if ( ch < 0xD84 )return true; + if ( ch < 0xDCA )return false;if ( ch < 0xDF4 )return true; + return false; + } + if(ch < 0x1000) + { + if ( ch == 0xE31 )return true; + + if ( ch < 0xE34 )return false;if ( ch < 0xE3B )return true; + if ( ch < 0xE47 )return false;if ( ch < 0xE4F )return true; + + if ( ch == 0xEB1 )return true; + + if ( ch < 0xEB4 )return false;if ( ch < 0xEBD )return true; + if ( ch < 0xEC8 )return false;if ( ch < 0xECE )return true; + if ( 
ch < 0xF18 )return false;if ( ch < 0xF1A )return true; + + if ( ch == 0xF35 )return true; + if ( ch == 0xF37 )return true; + if ( ch == 0xF39 )return true; + + if ( ch < 0xF3E )return false;if ( ch < 0xF40 )return true; + if ( ch < 0xF71 )return false;if ( ch < 0xF88 ) + { + if ( ch == 0xF85 )return false; + return true; + } + if ( ch < 0xF90 )return false;if ( ch < 0xFBD )return true; + + if ( ch == 0xFC6 )return true; + return false; + } + if ( ch < 0x1800 ) + { + if ( ch < 0x102C )return false;if ( ch < 0x1040 )return true; + if ( ch < 0x1056 )return false;if ( ch < 0x105A )return true; + + if ( ch == 0x135F )return true; + + if ( ch < 0x1712 )return false;if ( ch < 0x1715 )return true; + if ( ch < 0x1732 )return false;if ( ch < 0x1735 )return true; + if ( ch < 0x1752 )return false;if ( ch < 0x1754 )return true; + if ( ch < 0x1772 )return false;if ( ch < 0x1774 )return true; + if ( ch < 0x17B6 )return false;if ( ch < 0x17D4 )return true; + + if ( ch == 0x17DD )return true; + return false; + } + if(ch < 0x2000) + { + if ( ch < 0x180B )return false;if ( ch < 0x180E )return true; + + if ( ch == 0x18A9 )return true; + + if ( ch < 0x1920 )return false;if ( ch < 0x193C )return true; + if ( ch < 0x19B0 )return false;if ( ch < 0x19C1 )return true; + if ( ch < 0x19C8 )return false;if ( ch < 0x19CA )return true; + if ( ch < 0x1A17 )return false;if ( ch < 0x1A1C )return true; + if ( ch < 0x1B00 )return false;if ( ch < 0x1B05 )return true; + if ( ch < 0x1B34 )return false;if ( ch < 0x1B45 )return true; + if ( ch < 0x1B6B )return false;if ( ch < 0x1B74 )return true; + if ( ch < 0x1DC0 )return false;if ( ch < 0x1E00 )return true; + return false; + } + if ( ch < 0x20D0 )return false;if ( ch < 0x2100 )return true; + if ( ch < 0x302A )return false;if ( ch < 0x3030 )return true; + if ( ch < 0x3099 )return false;if ( ch < 0x309B )return true; + + if ( ch == 0xA802 )return true; + if ( ch == 0xA806 )return true; + if ( ch == 0xA80B )return true; + + if ( ch < 0xA823 )return 
false;if ( ch < 0xA828 )return true; + + if ( ch == 0xFB1E )return true; + + if ( ch < 0xFE00 )return false;if ( ch < 0xFE10 )return true; + if ( ch < 0xFE20 )return false;if ( ch < 0xFE30 )return true; + if ( ch < 0x10A01 )return false;if ( ch < 0x10A10 )return true; + if ( ch < 0x10A38 )return false;if ( ch < 0x10A40 )return true; + if ( ch < 0x1D165 )return false;if ( ch < 0x1D16A )return true; + if ( ch < 0x1D16D )return false;if ( ch < 0x1D173 )return true; + if ( ch < 0x1D17B )return false;if ( ch < 0x1D183 )return true; + if ( ch < 0x1D185 )return false;if ( ch < 0x1D18C )return true; + if ( ch < 0x1D1AA )return false;if ( ch < 0x1D1AE )return true; + if ( ch < 0x1D242 )return false;if ( ch < 0x1D245 )return true; + if ( ch < 0xE0100 )return false;if ( ch < 0xE01F0 )return true; + return false; + } +#if defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic pop +#endif + +// ---------------------------------------------------------------------------------------- + + class invalid_utf8_error : public error + { + public: + invalid_utf8_error():error(EUTF8_TO_UTF32) {} + }; + + inline const ustring convert_utf8_to_utf32 ( + const std::string& str + ) + { + using namespace unicode_helpers; + ustring temp; + std::istringstream sin(str); + + temp.reserve(str.size()); + + int status; + unichar ch; + while ( (status = u8_to_u32(ch,sin)) > 0) + temp.push_back(ch); + + if (status < 0) + throw invalid_utf8_error(); + + return temp; + } + +// ---------------------------------------------------------------------------------------- + + bool is_surrogate(unichar ch); + + unichar surrogate_pair_to_unichar(unichar first, unichar second); + + void unichar_to_surrogate_pair(unichar unicode, unichar &first, unichar &second); + + + const ustring convert_wstring_to_utf32 ( + const std::wstring &wstr + ); + + const std::wstring convert_utf32_to_wstring ( + const ustring &src + ); + + const std::wstring convert_mbstring_to_wstring ( + const std::string &src + ); + + const 
std::string convert_wstring_to_mbstring( + const std::wstring &src + ); + +// ---------------------------------------------------------------------------------------- + + template <typename charT> + class basic_utf8_ifstream : public std::basic_istream<charT> + { + public: + + basic_utf8_ifstream ( + ) : std::basic_istream<charT>(&buf), buf(fin) {} + + basic_utf8_ifstream ( + const char* file_name, + std::ios_base::openmode mode = std::ios::in + ) : + std::basic_istream<charT>(&buf), + buf(fin) + { + fin.open(file_name,mode); + // make this have the same error state as fin + this->clear(fin.rdstate()); + } + + basic_utf8_ifstream ( + const std::string& file_name, + std::ios_base::openmode mode = std::ios::in + ) : + std::basic_istream<charT>(&buf), + buf(fin) + { + fin.open(file_name.c_str(),mode); + // make this have the same error state as fin + this->clear(fin.rdstate()); + } + + void open( + const std::string& file_name, + std::ios_base::openmode mode = std::ios::in + ) + { + open(file_name.c_str(),mode); + } + + void open ( + const char* file_name, + std::ios_base::openmode mode = std::ios::in + ) + { + fin.close(); + fin.clear(); + fin.open(file_name,mode); + // make this have the same error state as fin + this->clear(fin.rdstate()); + } + + void close ( + ) + { + fin.close(); + // make this have the same error state as fin + this->clear(fin.rdstate()); + } + + private: + + std::ifstream fin; + unicode_helpers::basic_utf8_streambuf<charT> buf; + }; + + typedef basic_utf8_ifstream<unichar> utf8_uifstream; + typedef basic_utf8_ifstream<wchar_t> utf8_wifstream; + +// ---------------------------------------------------------------------------------------- + +} + +#ifdef NO_MAKEFILE +#include "unicode.cpp" +#endif + +#endif // DLIB_UNICODe_H_ + diff --git a/ml/dlib/dlib/unicode/unicode_abstract.h b/ml/dlib/dlib/unicode/unicode_abstract.h new file mode 100644 index 000000000..ed5b9ab4e --- /dev/null +++ b/ml/dlib/dlib/unicode/unicode_abstract.h @@ -0,0 +1,233 @@ 
// Copyright (C) 2007  Davis E. King (davis@dlib.net), and Nils Labugt
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_UNICODe_ABSTRACT_H_
#ifdef DLIB_UNICODe_ABSTRACT_H_

#include "../uintn.h"
#include "../error.h"
#include <string>
#include <fstream>

namespace dlib
{

// ----------------------------------------------------------------------------------------

    // a typedef for an unsigned 32bit integer to hold our UNICODE characters
    typedef uint32 unichar;

    // a typedef for a string object to hold our UNICODE strings
    typedef std::basic_string<unichar> ustring;

// ----------------------------------------------------------------------------------------

    template <typename T>
    bool is_combining_char(
        const T ch_
    );
    /*!
        ensures
            - if (ch_ is a unicode combining character) then
                - returns true
            - else
                - returns false
    !*/

    bool is_surrogate(
        unichar ch
    );
    /*!
        ensures
            - if (ch is a unicode surrogate character) then
                - returns true
            - else
                - returns false
    !*/

    unichar surrogate_pair_to_unichar(
        unichar first,
        unichar second
    );
    /*!
        requires
            - 0xD800 <= first < 0xDC00
            - 0xDC00 <= second < 0xE000
            - is_surrogate(first) == true
            - is_surrogate(second) == true
        ensures
            - converts two surrogates into one unicode character
    !*/

    void unichar_to_surrogate_pair(
        unichar ch,
        unichar& first,
        unichar& second
    );
    /*!
        requires
            - ch >= 0x10000 (i.e. is not in Basic Multilingual Plane)
        ensures
            - surrogate_pair_to_unichar(#first,#second) == ch
              (i.e. converts ch into two surrogate characters)
    !*/

// ----------------------------------------------------------------------------------------

    class invalid_utf8_error : public error
    {
    public:
        invalid_utf8_error():error(EUTF8_TO_UTF32) {}
    };

    const ustring convert_utf8_to_utf32 (
        const std::string& str
    );
    /*!
        ensures
            - if (str is a valid UTF-8 encoded string) then
                - returns a copy of str that has been converted into a
                  unichar string
            - else
                - throws invalid_utf8_error
    !*/

// ----------------------------------------------------------------------------------------

    const ustring convert_wstring_to_utf32 (
        const std::wstring &wstr
    );
    /*!
        requires
            - wstr is a valid UTF-16 string when sizeof(wchar_t) == 2
            - wstr is a valid UTF-32 string when sizeof(wchar_t) == 4
        ensures
            - converts wstr into a UTF-32 string
    !*/

// ----------------------------------------------------------------------------------------

    const std::wstring convert_utf32_to_wstring (
        const ustring &str
    );
    /*!
        requires
            - str is a valid UTF-32 encoded string
        ensures
            - converts str into a wstring whose encoding is UTF-16 when sizeof(wchar_t) == 2
            - converts str into a wstring whose encoding is UTF-32 when sizeof(wchar_t) == 4
    !*/

// ----------------------------------------------------------------------------------------

    const std::wstring convert_mbstring_to_wstring (
        const std::string &str
    );
    /*!
        requires
            - str is a valid multibyte string whose encoding is the same as the
              current locale setting
        ensures
            - converts str into a wstring whose encoding is UTF-16 when sizeof(wchar_t) == 2
            - converts str into a wstring whose encoding is UTF-32 when sizeof(wchar_t) == 4
    !*/

// ----------------------------------------------------------------------------------------

    const std::string convert_wstring_to_mbstring (
        const std::wstring &src
    );
    /*!
        requires
            - src is a valid wide character string whose encoding is the same as
              the current locale setting
        ensures
            - returns a multibyte encoded version of the given string
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename charT
        >
    class basic_utf8_ifstream : public std::basic_istream<charT>
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object represents an input file stream much like the
                normal std::ifstream except that it knows how to read UTF-8
                data.  So when you read characters out of this stream it will
                automatically convert them from the UTF-8 multibyte encoding
                into a fixed width wide character encoding.
        !*/

    public:

        basic_utf8_ifstream (
        );
        /*!
            ensures
                - constructs an input stream that isn't yet associated with
                  a file.
        !*/

        basic_utf8_ifstream (
            const char* file_name,
            std::ios_base::openmode mode = std::ios::in
        );
        /*!
            ensures
                - tries to open the given file for reading by this stream
                - mode is interpreted exactly the same way as the open mode
                  argument used by std::ifstream.
        !*/

        basic_utf8_ifstream (
            const std::string& file_name,
            std::ios_base::openmode mode = std::ios::in
        );
        /*!
            ensures
                - tries to open the given file for reading by this stream
                - mode is interpreted exactly the same way as the open mode
                  argument used by std::ifstream.
        !*/

        void open(
            const std::string& file_name,
            std::ios_base::openmode mode = std::ios::in
        );
        /*!
            ensures
                - tries to open the given file for reading by this stream
                - mode is interpreted exactly the same way as the open mode
                  argument used by std::ifstream.
        !*/

        void open (
            const char* file_name,
            std::ios_base::openmode mode = std::ios::in
        );
        /*!
            ensures
                - tries to open the given file for reading by this stream
                - mode is interpreted exactly the same way as the open mode
                  argument used by std::ifstream.
        !*/

        void close (
        );
        /*!
            ensures
                - any file opened by this stream has been closed
        !*/
    };

    typedef basic_utf8_ifstream<unichar> utf8_uifstream;
    typedef basic_utf8_ifstream<wchar_t> utf8_wifstream;

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_UNICODe_ABSTRACT_H_
