summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/Utf8Utils.cpp
blob: a45002a11c85c75062d7ebb8008420a63b6558be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/*
 *  Copyright (C) 2013-2018 Team Kodi
 *  This file is part of Kodi - https://kodi.tv
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 *  See LICENSES/README.md for more information.
 */

#include "Utf8Utils.h"


/**
 * Classify the contents of a string with respect to UTF-8 validity.
 *
 * The string is walked one encoded character at a time, from the first byte
 * to the last.
 *
 * @param str string to examine
 * @return plainAscii if every character is a single byte (this includes the
 *         empty string), utf8string if all sequences are valid and at least
 *         one of them is longer than one byte, hiAscii as soon as a byte
 *         sequence that is not valid UTF-8 is met
 */
CUtf8Utils::utf8CheckResult CUtf8Utils::checkStrForUtf8(const std::string& str)
{
  const char* const data = str.c_str();
  const size_t total = str.length();
  bool sawMultiByte = false;

  for (size_t offset = 0; offset < total;)
  {
    const size_t seqLen = SizeOfUtf8Char(data + offset);
    if (seqLen == 0)
      return hiAscii; // not a valid UTF-8 sequence

    if (seqLen > 1)
      sawMultiByte = true;

    offset += seqLen; // jump to the first byte of the next character
  }

  // valid UTF-8 with at least one multi-byte sequence, or
  // only single-byte characters (valid both as US-ASCII and as UTF-8)
  return sawMultiByte ? utf8string : plainAscii;
}



/**
 * Search forward for the first position at which a valid UTF-8 character
 * starts.
 *
 * @param str      string to search in
 * @param startPos byte position to start the search from
 * @return byte position of the first valid UTF-8 character found at or after
 *         startPos, std::string::npos if there is none (also when startPos is
 *         not inside the string)
 */
size_t CUtf8Utils::FindValidUtf8Char(const std::string& str, const size_t startPos /*= 0*/)
{
  const char* const bytes = str.c_str();
  const size_t total = str.length();

  for (size_t idx = startPos; idx < total; ++idx)
  {
    if (SizeOfUtf8Char(bytes + idx) != 0)
      return idx;
  }

  return std::string::npos;
}

/**
 * Search backward for the last position at which a valid UTF-8 character
 * starts.
 *
 * @param str      string to search in
 * @param startPos byte position to start the search from; a value beyond the
 *                 end of the string is clamped to the last byte
 * @return byte position of the first valid UTF-8 character found at or before
 *         startPos, std::string::npos if there is none or the string is empty
 */
size_t CUtf8Utils::RFindValidUtf8Char(const std::string& str, const size_t startPos)
{
  const size_t total = str.length();
  if (total == 0)
    return std::string::npos;

  const char* const bytes = str.c_str();
  size_t idx = (startPos < total) ? startPos : total - 1;

  for (;;)
  {
    if (SizeOfUtf8Char(bytes + idx) != 0)
      return idx;

    if (idx == 0)
      break; // start of the string reached, stop before the unsigned index wraps around

    --idx;
  }

  return std::string::npos;
}

/**
 * Get the size in bytes of the UTF-8 character that starts at a given
 * position of a string.
 *
 * @param str       string that holds the character
 * @param charStart byte position of the first byte of the character
 * @return number of bytes of the valid UTF-8 character, zero if the bytes at
 *         charStart are not a valid UTF-8 sequence, std::string::npos if
 *         charStart is not inside the string
 */
inline size_t CUtf8Utils::SizeOfUtf8Char(const std::string& str, const size_t charStart /*= 0*/)
{
  return (charStart < str.length()) ? SizeOfUtf8Char(str.c_str() + charStart)
                                    : std::string::npos;
}

/**
 * Get the size in bytes of the UTF-8 character that starts at str.
 *
 * Must be used only internally in class!
 * str must be null-terminated: bytes after the first one are read without any
 * length check. Every such read is reached only after all previous bytes of
 * the sequence were accepted as (non-zero) lead or continuation bytes, and a
 * null terminator never matches a continuation byte range, so the checks stop
 * at the terminator at the latest.
 *
 * @param str pointer to the first byte of the character, can be a null pointer
 * @return number of bytes (1 - 4) of the well-formed UTF-8 sequence, zero if
 *         str is a null pointer or the bytes do not form a well-formed
 *         sequence; a zero byte is reported as a one byte character
 */
inline size_t CUtf8Utils::SizeOfUtf8Char(const char* const str)
{
  if (!str)
    return 0;

  const unsigned char* const strU = reinterpret_cast<const unsigned char*>(str);
  const unsigned char chr = strU[0];

  /* this is an implementation of http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G27506 */

  /* U+0000 - U+007F in UTF-8 */
  if (chr <= 0x7F)
    return 1;

  /* U+0080 - U+07FF in UTF-8 */                    /* binary representation and range */
  if (chr >= 0xC2 && chr <= 0xDF                    /* C2=1100 0010 - DF=1101 1111 */
      // as str is null terminated, reading the next byte is safe: at worst it
      // is the terminator, which is not a continuation byte
      && ((strU[1] & 0xC0) == 0x80))  /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 2;  // valid UTF-8 2 bytes sequence

  /* U+0800 - U+0FFF in UTF-8 */
  if (chr == 0xE0                                   /* E0=1110 0000 */
      && (strU[1] & 0xE0) == 0xA0     /* E0=1110 0000, A0=1010 0000 - BF=1011 1111 */
      && (strU[2] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 3; // valid UTF-8 3 bytes sequence

  /* U+1000 - U+CFFF in UTF-8 */
  /* skip U+D000 - U+DFFF (handled later) */
  /* U+E000 - U+FFFF in UTF-8 */
  if (((chr >= 0xE1 && chr <= 0xEC)                 /* E1=1110 0001 - EC=1110 1100 */
        || chr == 0xEE || chr == 0xEF)              /* EE=1110 1110 - EF=1110 1111 */
        && (strU[1] & 0xC0) == 0x80   /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
        && (strU[2] & 0xC0) == 0x80)  /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 3; // valid UTF-8 3 bytes sequence

  /* U+D000 - U+D7FF in UTF-8 */
  /* note: range U+D800 - U+DFFF is reserved and invalid */
  if (chr == 0xED                                   /* ED=1110 1101 */
      && (strU[1] & 0xE0) == 0x80     /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */
      && (strU[2] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 3; // valid UTF-8 3 bytes sequence

  /* U+10000 - U+3FFFF in UTF-8 */
  /* note: second byte 80 - 8F after F0 would be an overlong encoding of a
   * code point below U+10000, so the second byte is limited to 90 - BF and
   * the third byte is an ordinary continuation byte */
  if (chr == 0xF0                                   /* F0=1111 0000 */
      && strU[1] >= 0x90 && strU[1] <= 0xBF         /* 90=1001 0000 - BF=1011 1111 */
      && (strU[2] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
      && (strU[3] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 4; // valid UTF-8 4 bytes sequence

  /* U+40000 - U+FFFFF in UTF-8 */
  if (chr >= 0xF1 && chr <= 0xF3                    /* F1=1111 0001 - F3=1111 0011 */
      && (strU[1] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
      && (strU[2] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
      && (strU[3] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 4; // valid UTF-8 4 bytes sequence

  /* U+100000 - U+10FFFF in UTF-8 */
  if (chr == 0xF4                                   /* F4=1111 0100 */
      && (strU[1] & 0xF0) == 0x80     /* F0=1111 0000, 80=1000 0000 - 8F=1000 1111 */
      && (strU[2] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
      && (strU[3] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
    return 4; // valid UTF-8 4 bytes sequence

  return 0; // invalid UTF-8 char sequence
}