summaryrefslogtreecommitdiffstats
path: root/intl/components/src/UnicodeProperties.h
blob: 7fd64e099e06d193a7e1fe7a72517c7352742929 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef intl_components_UnicodeProperties_h_
#define intl_components_UnicodeProperties_h_

#include "mozilla/intl/BidiClass.h"
#include "mozilla/intl/GeneralCategory.h"
#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/intl/UnicodeScriptCodes.h"
#include "mozilla/Vector.h"

#include "unicode/uchar.h"
#include "unicode/uscript.h"

namespace mozilla::intl {

/**
 * This component is a Mozilla-focused API for working with text properties.
 */
class UnicodeProperties final {
 public:
  /**
   * Return the BidiClass for the character.
   */
  static inline BidiClass GetBidiClass(uint32_t aCh) {
    return BidiClass(u_charDirection(aCh));
  }

  /**
   * Maps the specified character to a "mirror-image" character.
   */
  static inline uint32_t CharMirror(uint32_t aCh) { return u_charMirror(aCh); }

  /**
   * Return the general category value for the code point.
   */
  static inline GeneralCategory CharType(uint32_t aCh) {
    return GeneralCategory(u_charType(aCh));
  }

  /**
   * Determine whether the code point has the Bidi_Mirrored property.
   */
  static inline bool IsMirrored(uint32_t aCh) { return u_isMirrored(aCh); }

  /**
   * Returns the combining class of the code point as specified in
   * UnicodeData.txt.
   */
  static inline uint8_t GetCombiningClass(uint32_t aCh) {
    return u_getCombiningClass(aCh);
  }

  enum class IntProperty {
    BidiPairedBracketType,
    EastAsianWidth,
    HangulSyllableType,
    LineBreak,
    NumericType,
  };

  /**
   * Get the property value for an enumerated or integer Unicode property for a
   * code point.
   */
  static inline int32_t GetIntPropertyValue(uint32_t aCh, IntProperty aProp) {
    UProperty prop;
    switch (aProp) {
      case IntProperty::BidiPairedBracketType:
        prop = UCHAR_BIDI_PAIRED_BRACKET_TYPE;
        break;
      case IntProperty::EastAsianWidth:
        prop = UCHAR_EAST_ASIAN_WIDTH;
        break;
      case IntProperty::HangulSyllableType:
        prop = UCHAR_HANGUL_SYLLABLE_TYPE;
        break;
      case IntProperty::LineBreak:
        prop = UCHAR_LINE_BREAK;
        break;
      case IntProperty::NumericType:
        prop = UCHAR_NUMERIC_TYPE;
        break;
    }
    return u_getIntPropertyValue(aCh, prop);
  }

  /**
   * Get the numeric value for a Unicode code point as defined in the
   * Unicode Character Database if the input is decimal or a digit,
   * otherwise, returns -1.
   */
  static inline int8_t GetNumericValue(uint32_t aCh) {
    UNumericType type =
        UNumericType(GetIntPropertyValue(aCh, IntProperty::NumericType));
    return type == U_NT_DECIMAL || type == U_NT_DIGIT
               ? int8_t(u_getNumericValue(aCh))
               : -1;
  }

  /**
   * Maps the specified character to its paired bracket character.
   */
  static inline uint32_t GetBidiPairedBracket(uint32_t aCh) {
    return u_getBidiPairedBracket(aCh);
  }

  /**
   * The given character is mapped to its uppercase equivalent according to
   * UnicodeData.txt; if the character has no uppercase equivalent, the
   * character itself is returned.
   */
  static inline uint32_t ToUpper(uint32_t aCh) { return u_toupper(aCh); }

  /**
   * The given character is mapped to its lowercase equivalent according to
   * UnicodeData.txt; if the character has no lowercase equivalent, the
   * character itself is returned.
   */
  static inline uint32_t ToLower(uint32_t aCh) { return u_tolower(aCh); }

  /**
   * Check if a code point has the Lowercase Unicode property.
   */
  static inline bool IsLowercase(uint32_t aCh) { return u_isULowercase(aCh); }

  /**
   * The given character is mapped to its titlecase equivalent according to
   * UnicodeData.txt; if the character has no titlecase equivalent, the
   * character itself is returned.
   */
  static inline uint32_t ToTitle(uint32_t aCh) { return u_totitle(aCh); }

  /**
   * The given character is mapped to its case folding equivalent according to
   * UnicodeData.txt and CaseFolding.txt;
   * if the character has no case folding equivalent, the character
   * itself is returned.
   */
  static inline uint32_t FoldCase(uint32_t aCh) {
    return u_foldCase(aCh, U_FOLD_CASE_DEFAULT);
  }

  enum class BinaryProperty {
    DefaultIgnorableCodePoint,
    Emoji,
    EmojiPresentation,
  };

  /**
   * Check a binary Unicode property for a code point.
   */
  static inline bool HasBinaryProperty(uint32_t aCh, BinaryProperty aProp) {
    UProperty prop;
    switch (aProp) {
      case BinaryProperty::DefaultIgnorableCodePoint:
        prop = UCHAR_DEFAULT_IGNORABLE_CODE_POINT;
        break;
      case BinaryProperty::Emoji:
        prop = UCHAR_EMOJI;
        break;
      case BinaryProperty::EmojiPresentation:
        prop = UCHAR_EMOJI_PRESENTATION;
        break;
    }
    return u_hasBinaryProperty(aCh, prop);
  }

  /**
   * Check if the width of aCh is full width, half width or wide
   * excluding emoji.
   */
  static inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) {
    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
      case U_EA_FULLWIDTH:
      case U_EA_HALFWIDTH:
        return true;
      case U_EA_WIDE:
        return HasBinaryProperty(aCh, BinaryProperty::Emoji) ? false : true;
      case U_EA_AMBIGUOUS:
      case U_EA_NARROW:
      case U_EA_NEUTRAL:
        return false;
    }
    return false;
  }

  /**
   * Check if the width of aCh is ambiguous, full width, or wide.
   */
  static inline bool IsEastAsianWidthAFW(uint32_t aCh) {
    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
      case U_EA_AMBIGUOUS:
      case U_EA_FULLWIDTH:
      case U_EA_WIDE:
        return true;
      case U_EA_HALFWIDTH:
      case U_EA_NARROW:
      case U_EA_NEUTRAL:
        return false;
    }
    return false;
  }

  /**
   * Check if the width of aCh is full width, or wide.
   */
  static inline bool IsEastAsianWidthFW(uint32_t aCh) {
    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
      case U_EA_FULLWIDTH:
      case U_EA_WIDE:
        return true;
      case U_EA_AMBIGUOUS:
      case U_EA_HALFWIDTH:
      case U_EA_NARROW:
      case U_EA_NEUTRAL:
        return false;
    }
    return false;
  }

  /**
   * Check if the CharType of aCh is math or other symbol.
   */
  static inline bool IsMathOrMusicSymbol(uint32_t aCh) {
    // Keep this function in sync with is_math_symbol in base_chars.py.
    return CharType(aCh) == GeneralCategory::Math_Symbol ||
           CharType(aCh) == GeneralCategory::Other_Symbol;
  }

  static inline Script GetScriptCode(uint32_t aCh) {
    // We can safely ignore the error code here because uscript_getScript
    // returns USCRIPT_INVALID_CODE in the event of an error.
    UErrorCode err = U_ZERO_ERROR;
    return Script(uscript_getScript(aCh, &err));
  }

  static inline bool HasScript(uint32_t aCh, Script aScript) {
    return uscript_hasScript(aCh, UScriptCode(aScript));
  }

  static inline const char* GetScriptShortName(Script aScript) {
    return uscript_getShortName(UScriptCode(aScript));
  }

  static inline int32_t GetMaxNumberOfScripts() {
    return u_getIntPropertyMaxValue(UCHAR_SCRIPT);
  }

  // The code point which has the most script extensions is 0x0965, which has 21
  // script extensions, so choose the vector size as 32 to prevent heap
  // allocation.
  static constexpr size_t kMaxScripts = 32;

  using ScriptExtensionVector = Vector<Script, kMaxScripts>;

  /**
   * Get the script extensions for the given code point, and write the script
   * extensions to aExtensions vector. If the code point has script extensions,
   * the script code (Script::COMMON or Script::INHERITED) will be excluded.
   *
   * If the code point doesn't have any script extension, then its script code
   * will be written to aExtensions vector.
   *
   * If the code point is invalid, Script::UNKNOWN will be written to
   * aExtensions vector.
   *
   * Note: aExtensions will be cleared after calling this method regardless of
   * failure.
   *
   * See [1] for the script code of the code point, [2] for the script
   * extensions.
   *
   * https://www.unicode.org/Public/UNIDATA/Scripts.txt
   * https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
   */
  static ICUResult GetExtensions(char32_t aCodePoint,
                                 ScriptExtensionVector& aExtensions) {
    // Clear the vector first.
    aExtensions.clear();

    // We cannot pass aExtensions to uscript_getScriptExtension as USCriptCode
    // takes 4 bytes, so create a local UScriptCode array to get the extensions.
    UScriptCode ext[kMaxScripts];
    UErrorCode status = U_ZERO_ERROR;
    int32_t len = uscript_getScriptExtensions(static_cast<UChar32>(aCodePoint),
                                              ext, kMaxScripts, &status);
    if (U_FAILURE(status)) {
      // kMaxScripts should be large enough to hold the maximun number of script
      // extensions.
      MOZ_DIAGNOSTIC_ASSERT(status != U_BUFFER_OVERFLOW_ERROR);
      return Err(ToICUError(status));
    }

    if (!aExtensions.reserve(len)) {
      return Err(ICUError::OutOfMemory);
    }

    for (int32_t i = 0; i < len; i++) {
      aExtensions.infallibleAppend(Script(ext[i]));
    }

    return Ok();
  }
};

}  // namespace mozilla::intl

#endif