diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 18:07:22 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 18:07:22 +0000 |
commit | c04dcc2e7d834218ef2d4194331e383402495ae1 (patch) | |
tree | 7333e38d10d75386e60f336b80c2443c1166031d /xbmc/utils/Utf8Utils.cpp | |
parent | Initial commit. (diff) | |
download | kodi-c04dcc2e7d834218ef2d4194331e383402495ae1.tar.xz kodi-c04dcc2e7d834218ef2d4194331e383402495ae1.zip |
Adding upstream version 2:20.4+dfsg. [upstream/2%20.4+dfsg]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'xbmc/utils/Utf8Utils.cpp')
-rw-r--r-- | xbmc/utils/Utf8Utils.cpp | 148 |
1 files changed, 148 insertions, 0 deletions
diff --git a/xbmc/utils/Utf8Utils.cpp b/xbmc/utils/Utf8Utils.cpp new file mode 100644 index 0000000..a45002a --- /dev/null +++ b/xbmc/utils/Utf8Utils.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2013-2018 Team Kodi + * This file is part of Kodi - https://kodi.tv + * + * SPDX-License-Identifier: GPL-2.0-or-later + * See LICENSES/README.md for more information. + */ + +#include "Utf8Utils.h" + + +CUtf8Utils::utf8CheckResult CUtf8Utils::checkStrForUtf8(const std::string& str) +{ + const char* const strC = str.c_str(); + const size_t len = str.length(); + size_t pos = 0; + bool isPlainAscii = true; + + while (pos < len) + { + const size_t chrLen = SizeOfUtf8Char(strC + pos); + if (chrLen == 0) + return hiAscii; // non valid UTF-8 sequence + else if (chrLen > 1) + isPlainAscii = false; + + pos += chrLen; + } + + if (isPlainAscii) + return plainAscii; // only single-byte characters (valid for US-ASCII and for UTF-8) + + return utf8string; // valid UTF-8 with at least one valid UTF-8 multi-byte sequence +} + + + +size_t CUtf8Utils::FindValidUtf8Char(const std::string& str, const size_t startPos /*= 0*/) +{ + const char* strC = str.c_str(); + const size_t len = str.length(); + + size_t pos = startPos; + while (pos < len) + { + if (SizeOfUtf8Char(strC + pos)) + return pos; + + pos++; + } + + return std::string::npos; +} + +size_t CUtf8Utils::RFindValidUtf8Char(const std::string& str, const size_t startPos) +{ + const size_t len = str.length(); + if (!len) + return std::string::npos; + + const char* strC = str.c_str(); + size_t pos = (startPos >= len) ? 
len - 1 : startPos; + while (pos < len) // pos is unsigned, after zero pos becomes large then len + { + if (SizeOfUtf8Char(strC + pos)) + return pos; + + pos--; + } + + return std::string::npos; +} + +inline size_t CUtf8Utils::SizeOfUtf8Char(const std::string& str, const size_t charStart /*= 0*/) +{ + if (charStart >= str.length()) + return std::string::npos; + + return SizeOfUtf8Char(str.c_str() + charStart); +} + +// must be used only internally in class! +// str must be null-terminated +inline size_t CUtf8Utils::SizeOfUtf8Char(const char* const str) +{ + if (!str) + return 0; + + const unsigned char* const strU = (const unsigned char*)str; + const unsigned char chr = strU[0]; + + /* this is an implementation of http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G27506 */ + + /* U+0000 - U+007F in UTF-8 */ + if (chr <= 0x7F) + return 1; + + /* U+0080 - U+07FF in UTF-8 */ /* binary representation and range */ + if (chr >= 0xC2 && chr <= 0xDF /* C2=1100 0010 - DF=1101 1111 */ + // as str is null terminated, + && ((strU[1] & 0xC0) == 0x80)) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 2; // valid UTF-8 2 bytes sequence + + /* U+0800 - U+0FFF in UTF-8 */ + if (chr == 0xE0 /* E0=1110 0000 */ + && (strU[1] & 0xE0) == 0xA0 /* E0=1110 0000, A0=1010 0000 - BF=1011 1111 */ + && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 3; // valid UTF-8 3 bytes sequence + + /* U+1000 - U+CFFF in UTF-8 */ + /* skip U+D000 - U+DFFF (handled later) */ + /* U+E000 - U+FFFF in UTF-8 */ + if (((chr >= 0xE1 && chr <= 0xEC) /* E1=1110 0001 - EC=1110 1100 */ + || chr == 0xEE || chr == 0xEF) /* EE=1110 1110 - EF=1110 1111 */ + && (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 3; // valid UTF-8 3 bytes sequence + + /* U+D000 - U+D7FF in UTF-8 */ + /* note: range U+D800 - U+DFFF is reserved and invalid */ + if (chr == 0xED /* ED=1110 
1101 */ + && (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */ + && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 3; // valid UTF-8 3 bytes sequence + + /* U+10000 - U+3FFFF in UTF-8 */ + if (chr == 0xF0 /* F0=1111 0000 */ + && (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */ + && strU[2] >= 0x90 && strU[2] <= 0xBF /* 90=1001 0000 - BF=1011 1111 */ + && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 4; // valid UTF-8 4 bytes sequence + + /* U+40000 - U+FFFFF in UTF-8 */ + if (chr >= 0xF1 && chr <= 0xF3 /* F1=1111 0001 - F3=1111 0011 */ + && (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + && (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 4; // valid UTF-8 4 bytes sequence + + /* U+100000 - U+10FFFF in UTF-8 */ + if (chr == 0xF4 /* F4=1111 0100 */ + && (strU[1] & 0xF0) == 0x80 /* F0=1111 0000, 80=1000 0000 - 8F=1000 1111 */ + && (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ + return 4; // valid UTF-8 4 bytes sequence + + return 0; // invalid UTF-8 char sequence +} |