// intl/components/src/String.h

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef intl_components_String_h_
#define intl_components_String_h_

#include "mozilla/Assertions.h"
#include "mozilla/Casting.h"
#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/intl/ICUError.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Result.h"
#include "mozilla/Span.h"

#include "unicode/uchar.h"
#include "unicode/unorm2.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "unicode/utypes.h"

namespace mozilla::intl {

/**
 * This component is a Mozilla-focused API for working with strings in
 * internationalization code.
 */
class String final {
 public:
  String() = delete;

  /**
   * Return the locale-sensitive lower case string of the input.
   */
  template <typename B>
  static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
                                                Span<const char16_t> aString,
                                                B& aBuffer) {
    if (!aBuffer.reserve(aString.size())) {
      return Err(ICUError::OutOfMemory);
    }
    return FillBufferWithICUCall(
        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
          return u_strToLower(target, length, aString.data(), aString.size(),
                              aLocale, status);
        });
  }

  /**
   * Return the locale-sensitive upper case string of the input.
   */
  template <typename B>
  static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
                                                Span<const char16_t> aString,
                                                B& aBuffer) {
    if (!aBuffer.reserve(aString.size())) {
      return Err(ICUError::OutOfMemory);
    }
    return FillBufferWithICUCall(
        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
          return u_strToUpper(target, length, aString.data(), aString.size(),
                              aLocale, status);
        });
  }

  /**
   * Normalization form constants to describe which normalization algorithm
   * should be performed.
   *
   * Also see:
   * - Unicode Standard, §2.12 Equivalent Sequences
   * - Unicode Standard, §3.11 Normalization Forms
   * - https://unicode.org/reports/tr15/
   */
  enum class NormalizationForm {
    /**
     * Normalization Form C
     */
    NFC,

    /**
     * Normalization Form D
     */
    NFD,

    /**
     * Normalization Form KC
     */
    NFKC,

    /**
     * Normalization Form KD
     */
    NFKD,
  };

  enum class AlreadyNormalized : bool { No, Yes };

  /**
   * Normalize the input string according to requested normalization form.
   *
   * Returns `AlreadyNormalized::Yes` when the string is already in normalized
   * form. The output buffer is unchanged in this case. Otherwise returns
   * `AlreadyNormalized::No` and places the normalized string into the output
   * buffer.
   */
  template <typename B>
  static Result<AlreadyNormalized, ICUError> Normalize(
      NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
    // The unorm2_getXXXInstance() methods return a shared instance which must
    // not be deleted.
    UErrorCode status = U_ZERO_ERROR;
    const UNormalizer2* normalizer;
    switch (aForm) {
      case NormalizationForm::NFC:
        normalizer = unorm2_getNFCInstance(&status);
        break;
      case NormalizationForm::NFD:
        normalizer = unorm2_getNFDInstance(&status);
        break;
      case NormalizationForm::NFKC:
        normalizer = unorm2_getNFKCInstance(&status);
        break;
      case NormalizationForm::NFKD:
        normalizer = unorm2_getNFKDInstance(&status);
        break;
    }
    if (U_FAILURE(status)) {
      return Err(ToICUError(status));
    }

    int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(),
                                                     aString.size(), &status);
    if (U_FAILURE(status)) {
      return Err(ToICUError(status));
    }

    size_t spanLength = AssertedCast<size_t>(spanLengthInt);
    MOZ_ASSERT(spanLength <= aString.size());

    // Return if the input string is already normalized.
    if (spanLength == aString.size()) {
      return AlreadyNormalized::Yes;
    }

    if (!aBuffer.reserve(aString.size())) {
      return Err(ICUError::OutOfMemory);
    }

    // Copy the already normalized prefix.
    if (spanLength > 0) {
      PodCopy(aBuffer.data(), aString.data(), spanLength);

      aBuffer.written(spanLength);
    }

    MOZ_TRY(FillBufferWithICUCall(
        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
          Span<const char16_t> remaining = aString.From(spanLength);
          return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength,
                                                 length, remaining.data(),
                                                 remaining.size(), status);
        }));

    return AlreadyNormalized::No;
  }

  /**
   * Return true if the code point has the binary property "Cased".
   */
  static bool IsCased(char32_t codePoint) {
    return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED);
  }

  /**
   * Return true if the code point has the binary property "Case_Ignorable".
   */
  static bool IsCaseIgnorable(char32_t codePoint) {
    return u_hasBinaryProperty(static_cast<UChar32>(codePoint),
                               UCHAR_CASE_IGNORABLE);
  }

  /**
   * Return the NFC pairwise composition of the two input characters, if any;
   * returns 0 (which we know is not a composed char!) if none exists.
   */
  static char32_t ComposePairNFC(char32_t a, char32_t b) {
    // unorm2_getNFCInstance returns a static instance that does not have to be
    // released here. If it fails, we just return 0 (no composition) always.
    static UErrorCode status = U_ZERO_ERROR;
    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
    if (U_FAILURE(status)) {
      return 0;
    }
    UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a),
                                    static_cast<UChar32>(b));
    return ch < 0 ? 0 : static_cast<char32_t>(ch);
  }

  /**
   * Put the "raw" (single-level) canonical decomposition of the input char, if
   * any, into the provided buffer. Canonical decomps are never more than two
   * chars in length (although full normalization may result in longer output
   * due to recursion).
   * Returns the length of the decomposition (0 if none, else 1 or 2).
   */
  static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
    // unorm2_getNFCInstance returns a static instance that does not have to be
    // released here. If it fails, we just return 0 (no decomposition) always.
    // Although we are using it to query for a decomposition, the mode of the
    // Normalizer2 is irrelevant here, so we may as well use the same singleton
    // instance as ComposePairNFC.
    static UErrorCode status = U_ZERO_ERROR;
    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
    if (U_FAILURE(status)) {
      return 0;
    }

    // Canonical decompositions are never more than two Unicode characters,
    // or a maximum of 4 utf-16 code units.
    const unsigned MAX_DECOMP_LENGTH = 4;
    UErrorCode error = U_ZERO_ERROR;
    UChar decompUtf16[MAX_DECOMP_LENGTH];
    int32_t len =
        unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
                                   decompUtf16, MAX_DECOMP_LENGTH, &error);
    if (U_FAILURE(error) || len < 0) {
      return 0;
    }
    UText text = UTEXT_INITIALIZER;
    utext_openUChars(&text, decompUtf16, len, &error);
    MOZ_ASSERT(U_SUCCESS(error));
    UChar32 ch = UTEXT_NEXT32(&text);
    len = 0;
    if (ch != U_SENTINEL) {
      decomp[0] = static_cast<char32_t>(ch);
      ++len;
      ch = UTEXT_NEXT32(&text);
      if (ch != U_SENTINEL) {
        decomp[1] = static_cast<char32_t>(ch);
        ++len;
      }
    }
    utext_close(&text);
    return len;
  }

  /**
   * Return the Unicode version, for example "13.0".
   */
  static Span<const char> GetUnicodeVersion();
};

}  // namespace mozilla::intl

#endif