diff options
Diffstat (limited to 'intl/components/src/Collator.h')
-rw-r--r-- | intl/components/src/Collator.h | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/intl/components/src/Collator.h b/intl/components/src/Collator.h new file mode 100644 index 0000000000..655cb7b0fd --- /dev/null +++ b/intl/components/src/Collator.h @@ -0,0 +1,348 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Collator_h_ +#define intl_components_Collator_h_ + +#ifndef JS_STANDALONE +# include "gtest/MozGtestFriend.h" +#endif + +#include "unicode/ucol.h" + +#include "mozilla/Compiler.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" + +namespace mozilla::intl { + +class Collator final { + public: + /** + * Construct from a raw UCollator. This is public so that the UniquePtr can + * access it. + */ + explicit Collator(UCollator* aCollator); + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. + Collator(const Collator&) = delete; + Collator& operator=(const Collator&) = delete; + + /** + * Attempt to initialize a new collator. + */ + static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale); + + ~Collator(); + + /** + * Get a sort key with the provided UTF-16 string, and store the sort key into + * the provided buffer of byte array. + * Every sort key ends with 0x00, and the terminating 0x00 byte is counted + * when calculating the length of buffer. For the purpose of other byte + * values, check the "Special Byte Values" document from ICU. + * + * https://icu.unicode.org/design/collation/bytes + */ + template <typename B> + ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const { + return FillBufferWithICUCall( + aBuffer, + [this, aString](uint8_t* target, int32_t length, UErrorCode* status) { + // ucol_getSortKey doesn't use the error code to report + // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to + // indicate the desired length to store the key. So we update the + // UErrorCode accordingly to let FillBufferWithICUCall resize the + // buffer. + int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(), + static_cast<int32_t>(aString.size()), + target, length); + if (len == 0) { + // Returns 0 means there's an internal error. + *status = U_INTERNAL_PROGRAM_ERROR; + } else if (len > length) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + *status = U_ZERO_ERROR; + } + return len; + }); + } + + int32_t CompareStrings(Span<const char16_t> aSource, + Span<const char16_t> aTarget) const; + + int32_t CompareSortKeys(Span<const uint8_t> aKey1, + Span<const uint8_t> aKey2) const; + + /** + * Determine how casing affects sorting. These options map to ECMA 402 + * collator options. + * + * https://tc39.es/ecma402/#sec-initializecollator + */ + enum class CaseFirst { + // Sort upper case first. + Upper, + // Sort lower case first. + Lower, + // Orders upper and lower case letters in accordance to their tertiary + // weights. + False, + }; + + /** + * Which differences in the strings should lead to differences in collation + * comparisons. + * + * This setting needs to be ECMA 402 compliant. + * https://tc39.es/ecma402/#sec-collator-comparestrings + */ + enum class Sensitivity { + // Only strings that differ in base letters compare as unequal. + // Examples: a ≠ b, a = á, a = A. + Base, + // Only strings that differ in base letters or accents and other diacritic + // marks compare as unequal. + // Examples: a ≠ b, a ≠ á, a = A. + Accent, + // Only strings that differ in base letters or case compare as unequal. + // Examples: a ≠ b, a = á, a ≠ A. + Case, + // Strings that differ in base letters, accents and other diacritic marks, + // or case compare as unequal. Other differences may also be taken into + // consideration. + // Examples: a ≠ b, a ≠ á, a ≠ A. + Variant, + }; + + /** + * These options map to ECMA 402 collator options. Make sure the defaults map + * to the default initialized values of ECMA 402. + * + * https://tc39.es/ecma402/#sec-initializecollator + */ + struct Options { + Sensitivity sensitivity = Sensitivity::Variant; + CaseFirst caseFirst = CaseFirst::False; + bool ignorePunctuation = false; + bool numeric = false; + }; + + /** + * Change the configuraton of the options. + */ + ICUResult SetOptions(const Options& aOptions, + const Maybe<Options&> aPrevOptions = Nothing()); + + /** + * Return the case first option of this collator. + */ + Result<CaseFirst, ICUError> GetCaseFirst() const; + + /** + * Return the "ignores punctuation" option of this collator. + */ + Result<bool, ICUError> GetIgnorePunctuation() const; + + /** + * Map keywords to their BCP 47 equivalents. + */ + static SpanResult<char> KeywordValueToBcp47Extension(const char* aKeyword, + int32_t aLength); + + enum class CommonlyUsed : bool { + /** + * Select all possible values, even when not commonly used by a locale. + */ + No, + + /** + * Only select the values which are commonly used by a locale. + */ + Yes, + }; + + using Bcp47ExtEnumeration = + Enumeration<char, SpanResult<char>, + Collator::KeywordValueToBcp47Extension>; + + /** + * Returns an iterator of collator locale extensions in the preferred order. + * These extensions can be used in BCP 47 locales. For instance this + * iterator could return "phonebk" and could be appled to the German locale + * "de" as "de-co-phonebk" for a phonebook-style collation. + * + * The collation extensions can be found here: + * http://cldr.unicode.org/core-spec/#Key_Type_Definitions + */ + static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValuesForLocale( + const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No); + + /** + * Returns an iterator over all possible collator locale extensions. + * These extensions can be used in BCP 47 locales. For instance this + * iterator could return "phonebk" and could be appled to the German locale + * "de" as "de-co-phonebk" for a phonebook-style collation. + * + * The collation extensions can be found here: + * http://cldr.unicode.org/core-spec/#Key_Type_Definitions + */ + static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValues(); + + /** + * Returns an iterator over all supported collator locales. + * + * The returned strings are ICU locale identifiers and NOT BCP 47 language + * tags. + * + * Also see <https://unicode-org.github.io/icu/userguide/locale>. + */ + static auto GetAvailableLocales() { + return AvailableLocalesEnumeration<ucol_countAvailable, + ucol_getAvailable>(); + } + + private: + /** + * Toggle features, or use the default setting. + */ + enum class Feature { + // Turn the feature off. + On, + // Turn the feature off. + Off, + // Use the default setting for the feature. + Default, + }; + + static constexpr auto ToUColAttributeValue(Feature aFeature) { + switch (aFeature) { + case Collator::Feature::On: + return UCOL_ON; + case Collator::Feature::Off: + return UCOL_OFF; + case Collator::Feature::Default: + return UCOL_DEFAULT; + } +#if MOZ_IS_GCC +# if !MOZ_GCC_VERSION_AT_LEAST(9, 1, 0) + return UCOL_DEFAULT; +# else + MOZ_CRASH("invalid collator feature"); +# endif +#else + MOZ_CRASH("invalid collator feature"); +#endif + } + + /** + * Attribute for handling variable elements. + */ + enum class AlternateHandling { + // Treats all the codepoints with non-ignorable primary weights in the + // same way (default) + NonIgnorable, + // Causes codepoints with primary weights that are equal or below the + // variable top value to be ignored on primary level and moved to the + // quaternary level. + Shifted, + Default, + }; + + /** + * The strength attribute. + * + * The usual strength for most locales (except Japanese) is tertiary. + * + * Quaternary strength is useful when combined with shifted setting for + * alternate handling attribute and for JIS X 4061 collation, when it is used + * to distinguish between Katakana and Hiragana. Otherwise, quaternary level + * is affected only by the number of non-ignorable code points in the string. + * + * Identical strength is rarely useful, as it amounts to codepoints of the NFD + * form of the string. + */ + enum class Strength { + // Primary collation strength. + Primary, + // Secondary collation strength. + Secondary, + // Tertiary collation strength. + Tertiary, + // Quaternary collation strength. + Quaternary, + // Identical collation strength. + Identical, + Default, + }; + + /** + * Configure the Collation::Strength + */ + void SetStrength(Strength strength); + + /** + * Configure Collation::AlternateHandling. + */ + ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling); + + /** + * Controls whether an extra case level (positioned before the third level) is + * generated or not. + * + * Contents of the case level are affected by the value of CaseFirst + * attribute. A simple way to ignore accent differences in a string is to set + * the strength to Primary and enable case level. + */ + ICUResult SetCaseLevel(Feature aFeature); + + /** + * When turned on, this attribute makes substrings of digits sort according to + * their numeric values. + * + * This is a way to get '100' to sort AFTER '2'. Note that the longest digit + * substring that can be treated as a single unit is 254 digits (not counting + * leading zeros). If a digit substring is longer than that, the digits beyond + * the limit will be treated as a separate digit substring. + * + * A "digit" in this sense is a code point with General_Category=Nd, which + * does not include circled numbers, roman numerals, etc. Only a contiguous + * digit substring is considered, that is, non-negative integers without + * separators. There is no support for plus/minus signs, decimals, exponents, + * etc. + */ + ICUResult SetNumericCollation(Feature aFeature); + + /** + * Controls whether the normalization check and necessary normalizations are + * performed. + * + * When off (default), no normalization check is performed. The correctness of + * the result is guaranteed only if the input data is in so-called FCD form + * When set to on, an incremental check is performed to see whether the input + * data is in the FCD form. If the data is not in the FCD form, incremental + * NFD normalization is performed. + */ + ICUResult SetNormalizationMode(Feature aFeature); + + /** + * Configure Collation::CaseFirst. + */ + ICUResult SetCaseFirst(CaseFirst aCaseFirst); + +#ifndef JS_STANDALONE + FRIEND_TEST(IntlCollator, SetAttributesInternal); +#endif + + ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr); + Maybe<Sensitivity> mLastStrategy = Nothing(); +}; + +} // namespace mozilla::intl + +#endif |