1 files changed, 348 insertions, 0 deletions
diff --git a/intl/components/src/Collator.h b/intl/components/src/Collator.h
new file mode 100644
index 0000000000..655cb7b0fd
--- /dev/null
+++ b/intl/components/src/Collator.h
@@ -0,0 +1,348 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef intl_components_Collator_h_
+#define intl_components_Collator_h_
+
+#ifndef JS_STANDALONE
+#  include "gtest/MozGtestFriend.h"
+#endif
+
+#include "unicode/ucol.h"
+
+#include "mozilla/Compiler.h"
+#include "mozilla/intl/ICU4CGlue.h"
+#include "mozilla/intl/ICUError.h"
+#include "mozilla/Result.h"
+#include "mozilla/Span.h"
+
+namespace mozilla::intl {
+
+class Collator final {
+ public:
+  /**
+   * Construct from a raw UCollator. This is public so that the UniquePtr can
+   * access it.
+   */
+  explicit Collator(UCollator* aCollator);
+
+  // Do not allow copy as this class owns the ICU resource. Move is not
+  // currently implemented, but a custom move operator could be created if
+  // needed.
+  Collator(const Collator&) = delete;
+  Collator& operator=(const Collator&) = delete;
+
+  /**
+   * Attempt to initialize a new collator.
+   */
+  static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale);
+
+  ~Collator();
+
+  /**
+   * Compute the sort key for the provided UTF-16 string, and store it into the
+   * provided byte buffer.
+   * Every sort key ends with 0x00, and the terminating 0x00 byte is counted
+   * when calculating the length of the buffer. For the meaning of other byte
+   * values, check the "Special Byte Values" document from ICU.
+   *
+   * https://icu.unicode.org/design/collation/bytes
+   */
+  template <typename B>
+  ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const {
+    return FillBufferWithICUCall(
+        aBuffer,
+        [this, aString](uint8_t* target, int32_t length, UErrorCode* status) {
+          // ucol_getSortKey doesn't use the error code to report
+          // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to
+          // indicate the desired length to store the key. So we update the
+          // UErrorCode accordingly to let FillBufferWithICUCall resize the
+          // buffer.
+          int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(),
+                                        static_cast<int32_t>(aString.size()),
+                                        target, length);
+          if (len == 0) {
+            // A return value of 0 means there was an internal error.
+            *status = U_INTERNAL_PROGRAM_ERROR;
+          } else if (len > length) {
+            *status = U_BUFFER_OVERFLOW_ERROR;
+          } else {
+            *status = U_ZERO_ERROR;
+          }
+          return len;
+        });
+  }
+
+  int32_t CompareStrings(Span<const char16_t> aSource,
+                         Span<const char16_t> aTarget) const;
+
+  int32_t CompareSortKeys(Span<const uint8_t> aKey1,
+                          Span<const uint8_t> aKey2) const;
+
+  /**
+   * Determine how casing affects sorting. These options map to ECMA 402
+   * collator options.
+   *
+   * https://tc39.es/ecma402/#sec-initializecollator
+   */
+  enum class CaseFirst {
+    // Sort upper case first.
+    Upper,
+    // Sort lower case first.
+    Lower,
+    // Orders upper and lower case letters in accordance with their tertiary
+    // weights.
+    False,
+  };
+
+  /**
+   * Which differences in the strings should lead to differences in collation
+   * comparisons.
+   *
+   * This setting needs to be ECMA 402 compliant.
+   * https://tc39.es/ecma402/#sec-collator-comparestrings
+   */
+  enum class Sensitivity {
+    // Only strings that differ in base letters compare as unequal.
+    // Examples: a ≠ b, a = á, a = A.
+    Base,
+    // Only strings that differ in base letters or accents and other diacritic
+    // marks compare as unequal.
+    // Examples: a ≠ b, a ≠ á, a = A.
+    Accent,
+    // Only strings that differ in base letters or case compare as unequal.
+    // Examples: a ≠ b, a = á, a ≠ A.
+    Case,
+    // Strings that differ in base letters, accents and other diacritic marks,
+    // or case compare as unequal. Other differences may also be taken into
+    // consideration.
+    // Examples: a ≠ b, a ≠ á, a ≠ A.
+    Variant,
+  };
+
+  /**
+   * These options map to ECMA 402 collator options. Make sure the defaults map
+   * to the default initialized values of ECMA 402.
+   *
+   * https://tc39.es/ecma402/#sec-initializecollator
+   */
+  struct Options {
+    Sensitivity sensitivity = Sensitivity::Variant;
+    CaseFirst caseFirst = CaseFirst::False;
+    bool ignorePunctuation = false;
+    bool numeric = false;
+  };
+
+  /**
+   * Change the configuration of the options.
+   */
+  ICUResult SetOptions(const Options& aOptions,
+                       const Maybe<Options&> aPrevOptions = Nothing());
+
+  /**
+   * Return the case first option of this collator.
+   */
+  Result<CaseFirst, ICUError> GetCaseFirst() const;
+
+  /**
+   * Return the "ignores punctuation" option of this collator.
+   */
+  Result<bool, ICUError> GetIgnorePunctuation() const;
+
+  /**
+   * Map keywords to their BCP 47 equivalents.
+   */
+  static SpanResult<char> KeywordValueToBcp47Extension(const char* aKeyword,
+                                                       int32_t aLength);
+
+  enum class CommonlyUsed : bool {
+    /**
+     * Select all possible values, even when not commonly used by a locale.
+     */
+    No,
+
+    /**
+     * Only select the values which are commonly used by a locale.
+     */
+    Yes,
+  };
+
+  using Bcp47ExtEnumeration =
+      Enumeration<char, SpanResult<char>,
+                  Collator::KeywordValueToBcp47Extension>;
+
+  /**
+   * Returns an iterator of collator locale extensions in the preferred order.
+   * These extensions can be used in BCP 47 locales. For instance this
+   * iterator could return "phonebk" and could be applied to the German locale
+   * "de" as "de-co-phonebk" for a phonebook-style collation.
+   *
+   * The collation extensions can be found here:
+   * http://cldr.unicode.org/core-spec/#Key_Type_Definitions
+   */
+  static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValuesForLocale(
+      const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No);
+
+  /**
+   * Returns an iterator over all possible collator locale extensions.
+   * These extensions can be used in BCP 47 locales. For instance this
+   * iterator could return "phonebk" and could be applied to the German locale
+   * "de" as "de-co-phonebk" for a phonebook-style collation.
+   *
+   * The collation extensions can be found here:
+   * http://cldr.unicode.org/core-spec/#Key_Type_Definitions
+   */
+  static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValues();
+
+  /**
+   * Returns an iterator over all supported collator locales.
+   *
+   * The returned strings are ICU locale identifiers and NOT BCP 47 language
+   * tags.
+   *
+   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
+   */
+  static auto GetAvailableLocales() {
+    return AvailableLocalesEnumeration<ucol_countAvailable,
+                                       ucol_getAvailable>();
+  }
+
+ private:
+  /**
+   * Toggle features, or use the default setting.
+   */
+  enum class Feature {
+    // Turn the feature on.
+    On,
+    // Turn the feature off.
+    Off,
+    // Use the default setting for the feature.
+    Default,
+  };
+
+  static constexpr auto ToUColAttributeValue(Feature aFeature) {
+    switch (aFeature) {
+      case Collator::Feature::On:
+        return UCOL_ON;
+      case Collator::Feature::Off:
+        return UCOL_OFF;
+      case Collator::Feature::Default:
+        return UCOL_DEFAULT;
+    }
+#if MOZ_IS_GCC
+#  if !MOZ_GCC_VERSION_AT_LEAST(9, 1, 0)
+    return UCOL_DEFAULT;
+#  else
+    MOZ_CRASH("invalid collator feature");
+#  endif
+#else
+    MOZ_CRASH("invalid collator feature");
+#endif
+  }
+
+  /**
+   * Attribute for handling variable elements.
+   */
+  enum class AlternateHandling {
+    // Treats all the codepoints with non-ignorable primary weights in the
+    // same way (default)
+    NonIgnorable,
+    // Causes codepoints with primary weights that are equal or below the
+    // variable top value to be ignored on primary level and moved to the
+    // quaternary level.
+    Shifted,
+    Default,
+  };
+
+  /**
+   * The strength attribute.
+   *
+   * The usual strength for most locales (except Japanese) is tertiary.
+   *
+   * Quaternary strength is useful when combined with shifted setting for
+   * alternate handling attribute and for JIS X 4061 collation, when it is used
+   * to distinguish between Katakana and Hiragana. Otherwise, quaternary level
+   * is affected only by the number of non-ignorable code points in the string.
+   *
+   * Identical strength is rarely useful, as it amounts to codepoints of the NFD
+   * form of the string.
+   */
+  enum class Strength {
+    // Primary collation strength.
+    Primary,
+    // Secondary collation strength.
+    Secondary,
+    // Tertiary collation strength.
+    Tertiary,
+    // Quaternary collation strength.
+    Quaternary,
+    // Identical collation strength.
+    Identical,
+    Default,
+  };
+
+  /**
+   * Configure the Collator::Strength.
+   */
+  void SetStrength(Strength strength);
+
+  /**
+   * Configure Collator::AlternateHandling.
+   */
+  ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling);
+
+  /**
+   * Controls whether an extra case level (positioned before the third level) is
+   * generated or not.
+   *
+   * Contents of the case level are affected by the value of CaseFirst
+   * attribute. A simple way to ignore accent differences in a string is to set
+   * the strength to Primary and enable case level.
+   */
+  ICUResult SetCaseLevel(Feature aFeature);
+
+  /**
+   * When turned on, this attribute makes substrings of digits sort according to
+   * their numeric values.
+   *
+   * This is a way to get '100' to sort AFTER '2'. Note that the longest digit
+   * substring that can be treated as a single unit is 254 digits (not counting
+   * leading zeros). If a digit substring is longer than that, the digits beyond
+   * the limit will be treated as a separate digit substring.
+   *
+   * A "digit" in this sense is a code point with General_Category=Nd, which
+   * does not include circled numbers, roman numerals, etc. Only a contiguous
+   * digit substring is considered, that is, non-negative integers without
+   * separators. There is no support for plus/minus signs, decimals, exponents,
+   * etc.
+   */
+  ICUResult SetNumericCollation(Feature aFeature);
+
+  /**
+   * Controls whether the normalization check and necessary normalizations are
+   * performed.
+   *
+   * When off (default), no normalization check is performed. The correctness of
+   * the result is guaranteed only if the input data is in so-called FCD form.
+   * When set to on, an incremental check is performed to see whether the input
+   * data is in the FCD form. If the data is not in the FCD form, incremental
+   * NFD normalization is performed.
+   */
+  ICUResult SetNormalizationMode(Feature aFeature);
+
+  /**
+   * Configure Collator::CaseFirst.
+   */
+  ICUResult SetCaseFirst(CaseFirst aCaseFirst);
+
+#ifndef JS_STANDALONE
+  FRIEND_TEST(IntlCollator, SetAttributesInternal);
+#endif
+
+  ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr);
+  Maybe<Sensitivity> mLastStrategy = Nothing();
+};
+
+}  // namespace mozilla::intl
+
+#endif