intl/components/src/Collator.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef intl_components_Collator_h_
#define intl_components_Collator_h_

#ifndef JS_STANDALONE
#  include "gtest/MozGtestFriend.h"
#endif

#include "unicode/ucol.h"

#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/intl/ICUError.h"
#include "mozilla/Result.h"
#include "mozilla/Span.h"

namespace mozilla::intl {

class Collator final {
 public:
  /**
   * Construct from a raw UCollator. This is public so that the UniquePtr can
   * access it.
   */
  explicit Collator(UCollator* aCollator);

  // Do not allow copy as this class owns the ICU resource. Move is not
  // currently implemented, but a custom move operator could be created if
  // needed.
  Collator(const Collator&) = delete;
  Collator& operator=(const Collator&) = delete;

  /**
   * Attempt to initialize a new collator.
   */
  static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale);

  ~Collator();

  /**
   * Get a sort key with the provided UTF-16 string, and store the sort key into
   * the provided buffer of byte array.
   * Every sort key ends with 0x00, and the terminating 0x00 byte is counted
   * when calculating the length of buffer. For the purpose of other byte
   * values, check the "Special Byte Values" document from ICU.
   *
   * https://icu.unicode.org/design/collation/bytes
   */
  template <typename B>
  ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const {
    return FillBufferWithICUCall(
        aBuffer,
        [this, aString](uint8_t* target, int32_t length, UErrorCode* status) {
          // ucol_getSortKey doesn't use the error code to report
          // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to
          // indicate the desired length to store the key. So we update the
          // UErrorCode accordingly to let FillBufferWithICUCall resize the
          // buffer.
          int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(),
                                        static_cast<int32_t>(aString.size()),
                                        target, length);
          if (len == 0) {
            // Returns 0 means there's an internal error.
            *status = U_INTERNAL_PROGRAM_ERROR;
          } else if (len > length) {
            *status = U_BUFFER_OVERFLOW_ERROR;
          } else {
            *status = U_ZERO_ERROR;
          }
          return len;
        });
  }

  int32_t CompareStrings(Span<const char16_t> aSource,
                         Span<const char16_t> aTarget) const;

  int32_t CompareSortKeys(Span<const uint8_t> aKey1,
                          Span<const uint8_t> aKey2) const;

  /**
   * Determine how casing affects sorting. These options map to ECMA 402
   * collator options.
   *
   * https://tc39.es/ecma402/#sec-initializecollator
   */
  enum class CaseFirst {
    // Sort upper case first.
    Upper,
    // Sort lower case first.
    Lower,
    // Orders upper and lower case letters in accordance to their tertiary
    // weights.
    False,
  };

  /**
   * Which differences in the strings should lead to differences in collation
   * comparisons.
   *
   * This setting needs to be ECMA 402 compliant.
   * https://tc39.es/ecma402/#sec-collator-comparestrings
   */
  enum class Sensitivity {
    // Only strings that differ in base letters compare as unequal.
    // Examples: a ≠ b, a = á, a = A.
    Base,
    // Only strings that differ in base letters or accents and other diacritic
    // marks compare as unequal.
    // Examples: a ≠ b, a ≠ á, a = A.
    Accent,
    // Only strings that differ in base letters or case compare as unequal.
    // Examples: a ≠ b, a = á, a ≠ A.
    Case,
    // Strings that differ in base letters, accents and other diacritic marks,
    // or case compare as unequal. Other differences may also be taken into
    // consideration.
    // Examples: a ≠ b, a ≠ á, a ≠ A.
    Variant,
  };

  /**
   * These options map to ECMA 402 collator options. Make sure the defaults map
   * to the default initialized values of ECMA 402.
   *
   * https://tc39.es/ecma402/#sec-initializecollator
   */
  struct Options {
    Sensitivity sensitivity = Sensitivity::Variant;
    CaseFirst caseFirst = CaseFirst::False;
    bool ignorePunctuation = false;
    bool numeric = false;
  };

  /**
   * Change the configuraton of the options.
   */
  ICUResult SetOptions(const Options& aOptions,
                       const Maybe<Options&> aPrevOptions = Nothing());

  /**
   * Return the case first option of this collator.
   */
  Result<CaseFirst, ICUError> GetCaseFirst() const;

  /**
   * Map keywords to their BCP 47 equivalents.
   */
  static SpanResult<char> KeywordValueToBcp47Extension(const char* aKeyword,
                                                       int32_t aLength);

  enum class CommonlyUsed : bool {
    /**
     * Select all possible values, even when not commonly used by a locale.
     */
    No,

    /**
     * Only select the values which are commonly used by a locale.
     */
    Yes,
  };

  using Bcp47ExtEnumeration =
      Enumeration<char, SpanResult<char>,
                  Collator::KeywordValueToBcp47Extension>;

  /**
   * Returns an iterator of collator locale extensions in the preferred order.
   * These extensions can be used in BCP 47 locales. For instance this
   * iterator could return "phonebk" and could be appled to the German locale
   * "de" as "de-co-phonebk" for a phonebook-style collation.
   *
   * The collation extensions can be found here:
   * http://cldr.unicode.org/core-spec/#Key_Type_Definitions
   */
  static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValuesForLocale(
      const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No);

  /**
   * Returns an iterator over all possible collator locale extensions.
   * These extensions can be used in BCP 47 locales. For instance this
   * iterator could return "phonebk" and could be appled to the German locale
   * "de" as "de-co-phonebk" for a phonebook-style collation.
   *
   * The collation extensions can be found here:
   * http://cldr.unicode.org/core-spec/#Key_Type_Definitions
   */
  static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValues();

  /**
   * Returns an iterator over all supported collator locales.
   *
   * The returned strings are ICU locale identifiers and NOT BCP 47 language
   * tags.
   *
   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
   */
  static auto GetAvailableLocales() {
    return AvailableLocalesEnumeration<ucol_countAvailable,
                                       ucol_getAvailable>();
  }

 private:
  /**
   * Toggle features, or use the default setting.
   */
  enum class Feature {
    // Turn the feature off.
    On,
    // Turn the feature off.
    Off,
    // Use the default setting for the feature.
    Default,
  };

  /**
   * Attribute for handling variable elements.
   */
  enum class AlternateHandling {
    // Treats all the codepoints with non-ignorable primary weights in the
    // same way (default)
    NonIgnorable,
    // Causes codepoints with primary weights that are equal or below the
    // variable top value to be ignored on primary level and moved to the
    // quaternary level.
    Shifted,
    Default,
  };

  /**
   * The strength attribute.
   *
   * The usual strength for most locales (except Japanese) is tertiary.
   *
   * Quaternary strength is useful when combined with shifted setting for
   * alternate handling attribute and for JIS X 4061 collation, when it is used
   * to distinguish between Katakana and Hiragana. Otherwise, quaternary level
   * is affected only by the number of non-ignorable code points in the string.
   *
   * Identical strength is rarely useful, as it amounts to codepoints of the NFD
   * form of the string.
   */
  enum class Strength {
    // Primary collation strength.
    Primary,
    // Secondary collation strength.
    Secondary,
    // Tertiary collation strength.
    Tertiary,
    // Quaternary collation strength.
    Quaternary,
    // Identical collation strength.
    Identical,
    Default,
  };

  /**
   * Configure the Collation::Strength
   */
  void SetStrength(Strength strength);

  /**
   * Configure Collation::AlternateHandling.
   */
  ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling);

  /**
   * Controls whether an extra case level (positioned before the third level) is
   * generated or not.
   *
   * Contents of the case level are affected by the value of CaseFirst
   * attribute. A simple way to ignore accent differences in a string is to set
   * the strength to Primary and enable case level.
   */
  ICUResult SetCaseLevel(Feature aFeature);

  /**
   * When turned on, this attribute makes substrings of digits sort according to
   * their numeric values.
   *
   * This is a way to get '100' to sort AFTER '2'. Note that the longest digit
   * substring that can be treated as a single unit is 254 digits (not counting
   * leading zeros). If a digit substring is longer than that, the digits beyond
   * the limit will be treated as a separate digit substring.
   *
   * A "digit" in this sense is a code point with General_Category=Nd, which
   * does not include circled numbers, roman numerals, etc. Only a contiguous
   * digit substring is considered, that is, non-negative integers without
   * separators. There is no support for plus/minus signs, decimals, exponents,
   * etc.
   */
  ICUResult SetNumericCollation(Feature aFeature);

  /**
   * Controls whether the normalization check and necessary normalizations are
   * performed.
   *
   * When off (default), no normalization check is performed. The correctness of
   * the result is guaranteed only if the input data is in so-called FCD form
   * When set to on, an incremental check is performed to see whether the input
   * data is in the FCD form. If the data is not in the FCD form, incremental
   * NFD normalization is performed.
   */
  ICUResult SetNormalizationMode(Feature aFeature);

  /**
   * Configure Collation::CaseFirst.
   */
  ICUResult SetCaseFirst(CaseFirst aCaseFirst);

#ifndef JS_STANDALONE
  FRIEND_TEST(IntlCollator, SetAttributesInternal);
#endif

  ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr);
  Maybe<Sensitivity> mLastStrategy = Nothing();
};

}  // namespace mozilla::intl

#endif