/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* Structured representation of Unicode locale IDs used with Intl functions. */ #ifndef intl_components_Locale_h #define intl_components_Locale_h #include "mozilla/Assertions.h" #include "mozilla/intl/ICUError.h" #include "mozilla/intl/ICU4CGlue.h" #include "mozilla/Maybe.h" #include "mozilla/Span.h" #include "mozilla/TextUtils.h" #include "mozilla/TypedEnumBits.h" #include "mozilla/Variant.h" #include "mozilla/Vector.h" #include "mozilla/Result.h" #include #include #include #include #include #include "unicode/uloc.h" namespace mozilla::intl { /** * Return true if |language| is a valid language subtag. */ template bool IsStructurallyValidLanguageTag(mozilla::Span aLanguage); /** * Return true if |script| is a valid script subtag. */ template bool IsStructurallyValidScriptTag(mozilla::Span aScript); /** * Return true if |region| is a valid region subtag. */ template bool IsStructurallyValidRegionTag(mozilla::Span aRegion); #ifdef DEBUG /** * Return true if |variant| is a valid variant subtag. */ bool IsStructurallyValidVariantTag(mozilla::Span aVariant); /** * Return true if |extension| is a valid Unicode extension subtag. */ bool IsStructurallyValidUnicodeExtensionTag( mozilla::Span aExtension); /** * Return true if |privateUse| is a valid private-use subtag. */ bool IsStructurallyValidPrivateUseTag(mozilla::Span aPrivateUse); #endif template char AsciiToLowerCase(CharT aChar) { MOZ_ASSERT(mozilla::IsAscii(aChar)); return mozilla::IsAsciiUppercaseAlpha(aChar) ? (aChar + 0x20) : aChar; } template char AsciiToUpperCase(CharT aChar) { MOZ_ASSERT(mozilla::IsAscii(aChar)); return mozilla::IsAsciiLowercaseAlpha(aChar) ? (aChar - 0x20) : aChar; } template void AsciiToLowerCase(CharT* aChars, size_t aLength, char* aDest) { char (&fn)(CharT) = AsciiToLowerCase; std::transform(aChars, aChars + aLength, aDest, fn); } template void AsciiToUpperCase(CharT* aChars, size_t aLength, char* aDest) { char (&fn)(CharT) = AsciiToUpperCase; std::transform(aChars, aChars + aLength, aDest, fn); } template void AsciiToTitleCase(CharT* aChars, size_t aLength, char* aDest) { if (aLength > 0) { AsciiToUpperCase(aChars, 1, aDest); AsciiToLowerCase(aChars + 1, aLength - 1, aDest + 1); } } // Constants for language subtag lengths. namespace LanguageTagLimits { // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; static constexpr size_t LanguageLength = 8; // unicode_script_subtag = alpha{4} ; static constexpr size_t ScriptLength = 4; // unicode_region_subtag = (alpha{2} | digit{3}) ; static constexpr size_t RegionLength = 3; static constexpr size_t AlphaRegionLength = 2; static constexpr size_t DigitRegionLength = 3; // key = alphanum alpha ; static constexpr size_t UnicodeKeyLength = 2; // tkey = alpha digit ; static constexpr size_t TransformKeyLength = 2; } // namespace LanguageTagLimits // Fixed size language subtag which is stored inline in Locale. template class LanguageTagSubtag final { uint8_t mLength = 0; char mChars[SubtagLength] = {}; // zero initialize public: LanguageTagSubtag() = default; LanguageTagSubtag(const LanguageTagSubtag& aOther) { std::copy_n(aOther.mChars, SubtagLength, mChars); mLength = aOther.mLength; } LanguageTagSubtag& operator=(const LanguageTagSubtag& aOther) { std::copy_n(aOther.mChars, SubtagLength, mChars); mLength = aOther.mLength; return *this; } size_t Length() const { return mLength; } bool Missing() const { return mLength == 0; } bool Present() const { return mLength > 0; } mozilla::Span Span() const { return {mChars, mLength}; } template void Set(mozilla::Span str) { MOZ_ASSERT(str.size() <= SubtagLength); std::copy_n(str.data(), str.size(), mChars); mLength = str.size(); } // The toXYZCase() methods are using |SubtagLength| instead of |length()|, // because current compilers (tested GCC and Clang) can't infer the maximum // string length - even when using hints like |std::min| - and instead are // emitting SIMD optimized code. Using a fixed sized length avoids emitting // the SIMD code. (Emitting SIMD code doesn't make sense here, because the // SIMD code only kicks in for long strings.) A fixed length will // additionally ensure the compiler unrolls the loop in the case conversion // code. void ToLowerCase() { AsciiToLowerCase(mChars, SubtagLength, mChars); } void ToUpperCase() { AsciiToUpperCase(mChars, SubtagLength, mChars); } void ToTitleCase() { AsciiToTitleCase(mChars, SubtagLength, mChars); } template bool EqualTo(const char (&str)[N]) const { static_assert(N - 1 <= SubtagLength, "subtag literals must not exceed the maximum subtag length"); return mLength == N - 1 && memcmp(mChars, str, N - 1) == 0; } }; using LanguageSubtag = LanguageTagSubtag; using ScriptSubtag = LanguageTagSubtag; using RegionSubtag = LanguageTagSubtag; using Latin1Char = unsigned char; using UniqueChars = UniquePtr; /** * Object representing a Unicode BCP 47 locale identifier. * * All subtags are already in canonicalized case. */ class MOZ_STACK_CLASS Locale final { LanguageSubtag mLanguage = {}; ScriptSubtag mScript = {}; RegionSubtag mRegion = {}; using VariantsVector = Vector; using ExtensionsVector = Vector; VariantsVector mVariants; ExtensionsVector mExtensions; UniqueChars mPrivateUse = nullptr; friend class LocaleParser; public: enum class CanonicalizationError : uint8_t { DuplicateVariant, InternalError, OutOfMemory, }; private: Result CanonicalizeUnicodeExtension( UniqueChars& unicodeExtension); Result CanonicalizeTransformExtension( UniqueChars& transformExtension); public: static bool LanguageMapping(LanguageSubtag& aLanguage); static bool ComplexLanguageMapping(const LanguageSubtag& aLanguage); private: static bool ScriptMapping(ScriptSubtag& aScript); static bool RegionMapping(RegionSubtag& aRegion); static bool ComplexRegionMapping(const RegionSubtag& aRegion); void PerformComplexLanguageMappings(); void PerformComplexRegionMappings(); [[nodiscard]] bool PerformVariantMappings(); [[nodiscard]] bool UpdateLegacyMappings(); static bool SignLanguageMapping(LanguageSubtag& aLanguage, const RegionSubtag& aRegion); static const char* ReplaceTransformExtensionType( mozilla::Span aKey, mozilla::Span aType); public: /** * Given a Unicode key and type, return the null-terminated preferred * replacement for that type if there is one, or null if there is none, e.g. * in effect * |ReplaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"| * and * |ReplaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|. */ static const char* ReplaceUnicodeExtensionType( mozilla::Span aKey, mozilla::Span aType); public: Locale() = default; Locale(const Locale&) = delete; Locale& operator=(const Locale&) = delete; Locale(Locale&&) = default; Locale& operator=(Locale&&) = default; template class SubtagIterator { using Iter = decltype(std::declval().begin()); Iter mIter; public: explicit SubtagIterator(Iter iter) : mIter(iter) {} // std::iterator traits. using iterator_category = std::input_iterator_tag; using value_type = Span; using difference_type = ptrdiff_t; using pointer = value_type*; using reference = value_type&; SubtagIterator& operator++() { mIter++; return *this; } SubtagIterator operator++(int) { SubtagIterator result = *this; ++(*this); return result; } bool operator==(const SubtagIterator& aOther) const { return mIter == aOther.mIter; } bool operator!=(const SubtagIterator& aOther) const { return !(*this == aOther); } value_type operator*() const { return MakeStringSpan(mIter->get()); } }; template class SubtagEnumeration { using Vec = Vector; const Vec& mVector; public: explicit SubtagEnumeration(const Vec& aVector) : mVector(aVector) {} size_t length() const { return mVector.length(); } bool empty() const { return mVector.empty(); } auto begin() const { return SubtagIterator(mVector.begin()); } auto end() const { return SubtagIterator(mVector.end()); } Span operator[](size_t aIndex) const { return MakeStringSpan(mVector[aIndex].get()); } }; const LanguageSubtag& Language() const { return mLanguage; } const ScriptSubtag& Script() const { return mScript; } const RegionSubtag& Region() const { return mRegion; } auto Variants() const { return SubtagEnumeration(mVariants); } auto Extensions() const { return SubtagEnumeration(mExtensions); } Maybe> PrivateUse() const { if (const char* p = mPrivateUse.get()) { return Some(MakeStringSpan(p)); } return Nothing(); } /** * Return the Unicode extension subtag or Nothing if not present. */ Maybe> GetUnicodeExtension() const; private: ptrdiff_t UnicodeExtensionIndex() const; public: /** * Set the language subtag. The input must be a valid language subtag. */ template void SetLanguage(const char (&aLanguage)[N]) { mozilla::Span span(aLanguage, N - 1); MOZ_ASSERT(IsStructurallyValidLanguageTag(span)); mLanguage.Set(span); } /** * Set the language subtag. The input must be a valid language subtag. */ void SetLanguage(const LanguageSubtag& aLanguage) { MOZ_ASSERT(IsStructurallyValidLanguageTag(aLanguage.Span())); mLanguage.Set(aLanguage.Span()); } /** * Set the script subtag. The input must be a valid script subtag. */ template void SetScript(const char (&aScript)[N]) { mozilla::Span span(aScript, N - 1); MOZ_ASSERT(IsStructurallyValidScriptTag(span)); mScript.Set(span); } /** * Set the script subtag. The input must be a valid script subtag or the empty * string. */ void SetScript(const ScriptSubtag& aScript) { MOZ_ASSERT(aScript.Missing() || IsStructurallyValidScriptTag(aScript.Span())); mScript.Set(aScript.Span()); } /** * Set the region subtag. The input must be a valid region subtag. */ template void SetRegion(const char (&aRegion)[N]) { mozilla::Span span(aRegion, N - 1); MOZ_ASSERT(IsStructurallyValidRegionTag(span)); mRegion.Set(span); } /** * Set the region subtag. The input must be a valid region subtag or the empty * empty string. */ void SetRegion(const RegionSubtag& aRegion) { MOZ_ASSERT(aRegion.Missing() || IsStructurallyValidRegionTag(aRegion.Span())); mRegion.Set(aRegion.Span()); } /** * Removes all variant subtags. */ void ClearVariants() { mVariants.clearAndFree(); } /** * Set the Unicode extension subtag. The input must be a valid Unicode * extension subtag. */ ICUResult SetUnicodeExtension(Span aExtension); /** * Remove any Unicode extension subtag if present. */ void ClearUnicodeExtension(); /** Canonicalize the base-name (language, script, region, variant) subtags. */ Result CanonicalizeBaseName(); /** * Canonicalize all extension subtags. */ Result CanonicalizeExtensions(); /** * Canonicalizes the given structurally valid Unicode BCP 47 locale * identifier, including regularized case of subtags. For example, the * locale Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, * where * * Zh ; 2*3ALPHA * -haNS ; ["-" script] * -bu ; ["-" region] * -variant2 ; *("-" variant) * -Variant1 * -u-ca-chinese ; *("-" extension) * -t-Zh-laTN * -x-PRIVATE ; ["-" privateuse] * * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private * * Spec: ECMAScript Internationalization API Specification, 6.2.3. */ Result Canonicalize() { MOZ_TRY(CanonicalizeBaseName()); return CanonicalizeExtensions(); } /** * Fill the buffer with a string representation of the locale. */ template ICUResult ToString(B& aBuffer) const { static_assert(std::is_same_v); size_t capacity = ToStringCapacity(); // Attempt to reserve needed capacity if (!aBuffer.reserve(capacity)) { return Err(ICUError::OutOfMemory); } size_t offset = ToStringAppend(aBuffer.data()); MOZ_ASSERT(capacity == offset); aBuffer.written(offset); return Ok(); } /** * Add likely-subtags to the locale. * * Spec: */ ICUResult AddLikelySubtags(); /** * Remove likely-subtags from the locale. * * Spec: */ ICUResult RemoveLikelySubtags(); /** * Returns the default locale as an ICU locale identifier. The returned string * is NOT a valid BCP 47 locale! * * Also see . */ static const char* GetDefaultLocale() { return uloc_getDefault(); } /** * Returns an iterator over all supported locales. * * The returned strings are ICU locale identifiers and NOT BCP 47 language * tags. * * Also see . */ static auto GetAvailableLocales() { return AvailableLocalesEnumeration(); } private: static UniqueChars DuplicateStringToUniqueChars(const char* aStr); static UniqueChars DuplicateStringToUniqueChars(Span aStr); size_t ToStringCapacity() const; size_t ToStringAppend(char* aBuffer) const; }; /** * Parser for Unicode BCP 47 locale identifiers. * * */ class MOZ_STACK_CLASS LocaleParser final { public: enum class ParserError : uint8_t { // Input was not parseable as a locale, subtag or extension. NotParseable, // Unable to allocate memory for the parser to operate. OutOfMemory, }; // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|. enum class TokenKind : uint8_t { None = 0b000, Alpha = 0b001, Digit = 0b010, AlphaDigit = 0b011, Error = 0b100 }; private: class Token final { size_t mIndex; size_t mLength; TokenKind mKind; public: Token(TokenKind aKind, size_t aIndex, size_t aLength) : mIndex(aIndex), mLength(aLength), mKind(aKind) {} TokenKind Kind() const { return mKind; } size_t Index() const { return mIndex; } size_t Length() const { return mLength; } bool IsError() const { return mKind == TokenKind::Error; } bool IsNone() const { return mKind == TokenKind::None; } bool IsAlpha() const { return mKind == TokenKind::Alpha; } bool IsDigit() const { return mKind == TokenKind::Digit; } bool IsAlphaDigit() const { return mKind == TokenKind::AlphaDigit; } }; const char* mLocale; size_t mLength; size_t mIndex = 0; explicit LocaleParser(Span aLocale) : mLocale(aLocale.data()), mLength(aLocale.size()) {} char CharAt(size_t aIndex) const { return mLocale[aIndex]; } // Copy the token characters into |subtag|. template void CopyChars(const Token& aTok, LanguageTagSubtag& aSubtag) const { aSubtag.Set(mozilla::Span(mLocale + aTok.Index(), aTok.Length())); } // Create a string copy of |length| characters starting at |index|. UniqueChars Chars(size_t aIndex, size_t aLength) const; // Create a string copy of the token characters. UniqueChars Chars(const Token& aTok) const { return Chars(aTok.Index(), aTok.Length()); } UniqueChars Extension(const Token& aStart, const Token& aEnd) const { MOZ_ASSERT(aStart.Index() < aEnd.Index()); size_t length = aEnd.Index() - 1 - aStart.Index(); return Chars(aStart.Index(), length); } Token NextToken(); // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; // // Four character language subtags are not allowed in Unicode BCP 47 locale // identifiers. Also see the comparison to Unicode CLDR locale identifiers in // . bool IsLanguage(const Token& aTok) const { return aTok.IsAlpha() && ((2 <= aTok.Length() && aTok.Length() <= 3) || (5 <= aTok.Length() && aTok.Length() <= 8)); } // unicode_script_subtag = alpha{4} ; bool IsScript(const Token& aTok) const { return aTok.IsAlpha() && aTok.Length() == 4; } // unicode_region_subtag = (alpha{2} | digit{3}) ; bool IsRegion(const Token& aTok) const { return (aTok.IsAlpha() && aTok.Length() == 2) || (aTok.IsDigit() && aTok.Length() == 3); } // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; bool IsVariant(const Token& aTok) const { return (5 <= aTok.Length() && aTok.Length() <= 8) || (aTok.Length() == 4 && mozilla::IsAsciiDigit(CharAt(aTok.Index()))); } // Returns the code unit of the first character at the given singleton token. // Always returns the lower case form of an alphabetical character. char SingletonKey(const Token& aTok) const { MOZ_ASSERT(aTok.Length() == 1); return AsciiToLowerCase(CharAt(aTok.Index())); } // extensions = unicode_locale_extensions | // transformed_extensions | // other_extensions ; // // unicode_locale_extensions = sep [uU] ((sep keyword)+ | // (sep attribute)+ (sep keyword)*) ; // // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | // (sep tfield)+) ; // // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; bool IsExtensionStart(const Token& aTok) const { return aTok.Length() == 1 && SingletonKey(aTok) != 'x'; } // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; bool IsOtherExtensionPart(const Token& aTok) const { return 2 <= aTok.Length() && aTok.Length() <= 8; } // unicode_locale_extensions = sep [uU] ((sep keyword)+ | // (sep attribute)+ (sep keyword)*) ; // keyword = key (sep type)? ; bool IsUnicodeExtensionPart(const Token& aTok) const { return IsUnicodeExtensionKey(aTok) || IsUnicodeExtensionType(aTok) || IsUnicodeExtensionAttribute(aTok); } // attribute = alphanum{3,8} ; bool IsUnicodeExtensionAttribute(const Token& aTok) const { return 3 <= aTok.Length() && aTok.Length() <= 8; } // key = alphanum alpha ; bool IsUnicodeExtensionKey(const Token& aTok) const { return aTok.Length() == 2 && mozilla::IsAsciiAlpha(CharAt(aTok.Index() + 1)); } // type = alphanum{3,8} (sep alphanum{3,8})* ; bool IsUnicodeExtensionType(const Token& aTok) const { return 3 <= aTok.Length() && aTok.Length() <= 8; } // tkey = alpha digit ; bool IsTransformExtensionKey(const Token& aTok) const { return aTok.Length() == 2 && mozilla::IsAsciiAlpha(CharAt(aTok.Index())) && mozilla::IsAsciiDigit(CharAt(aTok.Index() + 1)); } // tvalue = (sep alphanum{3,8})+ ; bool IsTransformExtensionPart(const Token& aTok) const { return 3 <= aTok.Length() && aTok.Length() <= 8; } // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; bool IsPrivateUseStart(const Token& aTok) const { return aTok.Length() == 1 && SingletonKey(aTok) == 'x'; } // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; bool IsPrivateUsePart(const Token& aTok) const { return 1 <= aTok.Length() && aTok.Length() <= 8; } // Helper function for use in |ParseBaseName| and // |ParseTlangInTransformExtension|. Do not use this directly! static Result InternalParseBaseName( LocaleParser& aLocaleParser, Locale& aTag, Token& aTok); // Parse the `unicode_language_id` production, i.e. the // language/script/region/variants portion of a locale, into |aTag|. // |aTok| must be the current token. static Result ParseBaseName(LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { return InternalParseBaseName(aLocaleParser, aTag, aTok); } // Parse the `tlang` production within a parsed 't' transform extension. // The precise requirements for "previously parsed" are: // // * the input begins from current token |tok| with a valid `tlang` // * the `tlang` is wholly lowercase (*not* canonical case) // * variant subtags in the `tlang` may contain duplicates and be // unordered // // Return an error on internal failure. Otherwise, return a success value. If // there was no `tlang`, then |tag.language().missing()|. But if there was a // `tlang`, then |tag| is filled with subtags exactly as they appeared in the // parse input. static Result ParseTlangInTransformExtension( LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { MOZ_ASSERT(aLocaleParser.IsLanguage(aTok)); return InternalParseBaseName(aLocaleParser, aTag, aTok); } friend class Locale; class Range final { size_t mBegin; size_t mLength; public: Range(size_t aBegin, size_t aLength) : mBegin(aBegin), mLength(aLength) {} size_t Begin() const { return mBegin; } size_t Length() const { return mLength; } }; using TFieldVector = Vector; using AttributesVector = Vector; using KeywordsVector = Vector; // Parse |extension|, which must be a validated, fully lowercase // `transformed_extensions` subtag, and fill |tag| and |fields| from the // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent // with |extension|. static Result ParseTransformExtension( mozilla::Span aExtension, Locale& aTag, TFieldVector& aFields); // Parse |extension|, which must be a validated, fully lowercase // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords| // from the `attribute` and `keyword` components. static Result ParseUnicodeExtension( mozilla::Span aExtension, AttributesVector& aAttributes, KeywordsVector& aKeywords); public: // Parse the input string as a locale. // // NOTE: |aTag| must be a new, empty Locale. static Result TryParse(Span aLocale, Locale& aTag); // Parse the input string as the base-name parts (language, script, region, // variants) of a locale. // // NOTE: |aTag| must be a new, empty Locale. static Result TryParseBaseName(Span aLocale, Locale& aTag); // Return Ok() iff |extension| can be parsed as a Unicode extension subtag. static Result CanParseUnicodeExtension( Span aExtension); // Return Ok() iff |unicodeType| can be parsed as a Unicode extension type. static Result CanParseUnicodeExtensionType( Span aUnicodeType); }; MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LocaleParser::TokenKind) } // namespace mozilla::intl #endif /* intl_components_Locale_h */