summaryrefslogtreecommitdiffstats
path: root/intl/components/src/Locale.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/components/src/Locale.h')
-rw-r--r--intl/components/src/Locale.h773
1 files changed, 773 insertions, 0 deletions
diff --git a/intl/components/src/Locale.h b/intl/components/src/Locale.h
new file mode 100644
index 0000000000..1f4e06f543
--- /dev/null
+++ b/intl/components/src/Locale.h
@@ -0,0 +1,773 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Structured representation of Unicode locale IDs used with Intl functions. */
+
+#ifndef intl_components_Locale_h
+#define intl_components_Locale_h
+
+#include "mozilla/Assertions.h"
+#include "mozilla/intl/ICUError.h"
+#include "mozilla/intl/ICU4CGlue.h"
+#include "mozilla/Maybe.h"
+#include "mozilla/Span.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/Try.h"
+#include "mozilla/TypedEnumBits.h"
+#include "mozilla/Variant.h"
+#include "mozilla/Vector.h"
+
+#include <algorithm>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <utility>
+
+#include "unicode/uloc.h"
+
+namespace mozilla::intl {
+
+/**
+ * Return true if |aLanguage| is a valid language subtag.
+ *
+ * Only declared here; the definition lives outside of this header.
+ */
+template <typename CharT>
+bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> aLanguage);
+
+/**
+ * Return true if |aScript| is a valid script subtag.
+ *
+ * Only declared here; the definition lives outside of this header.
+ */
+template <typename CharT>
+bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> aScript);
+
+/**
+ * Return true if |aRegion| is a valid region subtag.
+ *
+ * Only declared here; the definition lives outside of this header.
+ */
+template <typename CharT>
+bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> aRegion);
+
+// The following validators are only declared in DEBUG builds, so they must
+// only be used from code which is itself DEBUG-only (e.g. assertions).
+#ifdef DEBUG
+/**
+ * Return true if |aVariant| is a valid variant subtag.
+ */
+bool IsStructurallyValidVariantTag(mozilla::Span<const char> aVariant);
+
+/**
+ * Return true if |aExtension| is a valid Unicode extension subtag.
+ */
+bool IsStructurallyValidUnicodeExtensionTag(
+ mozilla::Span<const char> aExtension);
+
+/**
+ * Return true if |aPrivateUse| is a valid private-use subtag.
+ */
+bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> aPrivateUse);
+
+#endif
+
+/**
+ * Return the lower case form of the ASCII character |aChar|.
+ *
+ * Upper case ASCII letters are shifted by 0x20 to their lower case
+ * counterpart, every other character is returned unchanged. The input must be
+ * ASCII (asserted).
+ */
+template <typename CharT>
+char AsciiToLowerCase(CharT aChar) {
+  MOZ_ASSERT(mozilla::IsAscii(aChar));
+  if (mozilla::IsAsciiUppercaseAlpha(aChar)) {
+    return aChar + 0x20;
+  }
+  return aChar;
+}
+
+/**
+ * Return the upper case form of the ASCII character |aChar|.
+ *
+ * Lower case ASCII letters are shifted by 0x20 to their upper case
+ * counterpart, every other character is returned unchanged. The input must be
+ * ASCII (asserted).
+ */
+template <typename CharT>
+char AsciiToUpperCase(CharT aChar) {
+  MOZ_ASSERT(mozilla::IsAscii(aChar));
+  if (mozilla::IsAsciiLowercaseAlpha(aChar)) {
+    return aChar - 0x20;
+  }
+  return aChar;
+}
+
+/**
+ * Convert the |aLength| ASCII characters starting at |aChars| to lower case
+ * and write the result to |aDest|. |aDest| must provide room for |aLength|
+ * characters.
+ */
+template <typename CharT>
+void AsciiToLowerCase(CharT* aChars, size_t aLength, char* aDest) {
+  // A single argument call always selects the per-character overload.
+  auto toLower = [](CharT aChar) -> char { return AsciiToLowerCase(aChar); };
+  CharT* const end = aChars + aLength;
+  std::transform(aChars, end, aDest, toLower);
+}
+
+/**
+ * Convert the |aLength| ASCII characters starting at |aChars| to upper case
+ * and write the result to |aDest|. |aDest| must provide room for |aLength|
+ * characters.
+ */
+template <typename CharT>
+void AsciiToUpperCase(CharT* aChars, size_t aLength, char* aDest) {
+  // A single argument call always selects the per-character overload.
+  auto toUpper = [](CharT aChar) -> char { return AsciiToUpperCase(aChar); };
+  CharT* const end = aChars + aLength;
+  std::transform(aChars, end, aDest, toUpper);
+}
+
+/**
+ * Convert the |aLength| ASCII characters starting at |aChars| to title case,
+ * i.e. the first character is upper-cased and all remaining characters are
+ * lower-cased, and write the result to |aDest|. Nothing is written when
+ * |aLength| is zero.
+ */
+template <typename CharT>
+void AsciiToTitleCase(CharT* aChars, size_t aLength, char* aDest) {
+  if (aLength == 0) {
+    return;
+  }
+
+  // Leading character.
+  AsciiToUpperCase(aChars, 1, aDest);
+
+  // Everything after the leading character.
+  AsciiToLowerCase(aChars + 1, aLength - 1, aDest + 1);
+}
+
+// Maximum lengths, in characters, of the fixed size subtags and keys of a
+// Unicode BCP 47 locale identifier. The grammar productions are quoted next to
+// each constant.
+namespace LanguageTagLimits {
+
+// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+static constexpr size_t LanguageLength = 8;
+
+// unicode_script_subtag = alpha{4} ;
+static constexpr size_t ScriptLength = 4;
+
+// unicode_region_subtag = (alpha{2} | digit{3}) ;
+static constexpr size_t AlphaRegionLength = 2;
+static constexpr size_t DigitRegionLength = 3;
+
+// A region subtag must be able to hold the longer one of both forms.
+static constexpr size_t RegionLength =
+    std::max(AlphaRegionLength, DigitRegionLength);
+
+// key = alphanum alpha ;
+static constexpr size_t UnicodeKeyLength = 2;
+
+// tkey = alpha digit ;
+static constexpr size_t TransformKeyLength = 2;
+
+}  // namespace LanguageTagLimits
+
+// Fixed size language subtag which is stored inline in Locale.
+//
+// Up to |SubtagLength| characters are kept in an inline, zero-initialized
+// buffer together with the number of characters currently in use. The buffer
+// is not null-terminated (a subtag of maximum length fills it completely), so
+// always go through Span() or Length() to find the end of the subtag.
+template <size_t SubtagLength>
+class LanguageTagSubtag final {
+  static_assert(SubtagLength <= UINT8_MAX,
+                "subtag length must be representable in |mLength|");
+
+  uint8_t mLength = 0;
+  char mChars[SubtagLength] = {};  // zero initialize
+
+ public:
+  LanguageTagSubtag() = default;
+
+  // Copying is a plain member-wise copy of the length and the complete
+  // character buffer, so the compiler generated operations are sufficient.
+  LanguageTagSubtag(const LanguageTagSubtag& aOther) = default;
+  LanguageTagSubtag& operator=(const LanguageTagSubtag& aOther) = default;
+
+  // Number of characters currently stored in this subtag.
+  size_t Length() const { return mLength; }
+
+  // True if no characters are stored, i.e. the subtag is absent.
+  bool Missing() const { return mLength == 0; }
+
+  // True if at least one character is stored.
+  bool Present() const { return mLength > 0; }
+
+  // View over the stored characters. The view is not null-terminated.
+  mozilla::Span<const char> Span() const { return {mChars, mLength}; }
+
+  // Replace the stored characters with the contents of |str|, which must not
+  // be longer than |SubtagLength| (asserted). Characters in the buffer beyond
+  // the new length keep their previous contents.
+  template <typename CharT>
+  void Set(mozilla::Span<const CharT> str) {
+    MOZ_ASSERT(str.size() <= SubtagLength);
+    std::copy_n(str.data(), str.size(), mChars);
+    // The assertion above and the static_assert on |SubtagLength| guarantee
+    // that the size fits into |mLength|.
+    mLength = static_cast<uint8_t>(str.size());
+  }
+
+  // The To{Lower,Upper,Title}Case() methods are using |SubtagLength| instead
+  // of |Length()|, because current compilers (tested GCC and Clang) can't
+  // infer the maximum string length - even when using hints like |std::min| -
+  // and instead are emitting SIMD optimized code. Using a fixed sized length
+  // avoids emitting the SIMD code. (Emitting SIMD code doesn't make sense
+  // here, because the SIMD code only kicks in for long strings.) A fixed
+  // length will additionally ensure the compiler unrolls the loop in the case
+  // conversion code.
+
+  void ToLowerCase() { AsciiToLowerCase(mChars, SubtagLength, mChars); }
+
+  void ToUpperCase() { AsciiToUpperCase(mChars, SubtagLength, mChars); }
+
+  void ToTitleCase() { AsciiToTitleCase(mChars, SubtagLength, mChars); }
+
+  // Return true if the stored characters are equal to the string literal
+  // |str|. |N| includes the null-terminator of the literal, which is not part
+  // of the comparison.
+  template <size_t N>
+  bool EqualTo(const char (&str)[N]) const {
+    static_assert(N - 1 <= SubtagLength,
+                  "subtag literals must not exceed the maximum subtag length");
+
+    return mLength == N - 1 && memcmp(mChars, str, N - 1) == 0;
+  }
+};
+
+// Inline storage types for the language, script and region subtags, each
+// sized to the maximum length given in LanguageTagLimits.
+using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
+using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
+using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
+
+// Character type alias for Latin-1 encoded input.
+using Latin1Char = unsigned char;
+// Owning pointer to a character array.
+// NOTE(review): "mozilla/UniquePtr.h" is not directly included by this header;
+// presumably |UniquePtr| is provided transitively by another include - confirm.
+using UniqueChars = UniquePtr<char[]>;
+
+/**
+ * Object representing a Unicode BCP 47 locale identifier.
+ *
+ * All subtags are already in canonicalized case.
+ *
+ * The language, script and region subtags are stored inline. Variant and
+ * extension subtags as well as the private-use subtag are stored as owned
+ * |UniqueChars| strings. Locale objects can be moved, but not copied.
+ */
+class MOZ_STACK_CLASS Locale final {
+ LanguageSubtag mLanguage = {};
+ ScriptSubtag mScript = {};
+ RegionSubtag mRegion = {};
+
+ using VariantsVector = Vector<UniqueChars, 2>;
+ using ExtensionsVector = Vector<UniqueChars, 2>;
+
+ VariantsVector mVariants;
+ ExtensionsVector mExtensions;
+ UniqueChars mPrivateUse = nullptr;
+
+ // Grants LocaleParser access to the private members of this class.
+ friend class LocaleParser;
+
+ public:
+ // Errors reported by the Canonicalize*() methods.
+ enum class CanonicalizationError : uint8_t {
+ DuplicateVariant,
+ InternalError,
+ OutOfMemory,
+ };
+
+ private:
+ Result<Ok, CanonicalizationError> CanonicalizeUnicodeExtension(
+ UniqueChars& unicodeExtension);
+
+ Result<Ok, CanonicalizationError> CanonicalizeTransformExtension(
+ UniqueChars& transformExtension);
+
+ public:
+ static bool LanguageMapping(LanguageSubtag& aLanguage);
+ static bool ComplexLanguageMapping(const LanguageSubtag& aLanguage);
+
+ private:
+ static bool ScriptMapping(ScriptSubtag& aScript);
+ static bool RegionMapping(RegionSubtag& aRegion);
+ static bool ComplexRegionMapping(const RegionSubtag& aRegion);
+
+ void PerformComplexLanguageMappings();
+ void PerformComplexRegionMappings();
+ [[nodiscard]] bool PerformVariantMappings();
+
+ [[nodiscard]] bool UpdateLegacyMappings();
+
+ static bool SignLanguageMapping(LanguageSubtag& aLanguage,
+ const RegionSubtag& aRegion);
+
+ static const char* ReplaceTransformExtensionType(
+ mozilla::Span<const char> aKey, mozilla::Span<const char> aType);
+
+ public:
+ /**
+ * Given a Unicode key and type, return the null-terminated preferred
+ * replacement for that type if there is one, or null if there is none, e.g.
+ * in effect
+ * |ReplaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
+ * and
+ * |ReplaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
+ */
+ static const char* ReplaceUnicodeExtensionType(
+ mozilla::Span<const char> aKey, mozilla::Span<const char> aType);
+
+ public:
+ // Locale is move-only: copying is disabled, moving uses the compiler
+ // generated operations.
+ Locale() = default;
+ Locale(const Locale&) = delete;
+ Locale& operator=(const Locale&) = delete;
+ Locale(Locale&&) = default;
+ Locale& operator=(Locale&&) = default;
+
+ /**
+ * Iterator adaptor over the elements of |Vec|, which yields every element as
+ * a |Span<const char>| created through |MakeStringSpan|.
+ *
+ * NOTE(review): |operator*| returns |value_type| by value, whereas the
+ * |reference| and |pointer| traits below name |value_type&| and
+ * |value_type*|.
+ */
+ template <class Vec>
+ class SubtagIterator {
+ using Iter = decltype(std::declval<const Vec>().begin());
+
+ Iter mIter;
+
+ public:
+ explicit SubtagIterator(Iter iter) : mIter(iter) {}
+
+ // std::iterator traits.
+ using iterator_category = std::input_iterator_tag;
+ using value_type = Span<const char>;
+ using difference_type = ptrdiff_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+
+ // Pre-increment: advance to the next element.
+ SubtagIterator& operator++() {
+ mIter++;
+ return *this;
+ }
+
+ // Post-increment: advance, but return the iterator's previous state.
+ SubtagIterator operator++(int) {
+ SubtagIterator result = *this;
+ ++(*this);
+ return result;
+ }
+
+ bool operator==(const SubtagIterator& aOther) const {
+ return mIter == aOther.mIter;
+ }
+
+ bool operator!=(const SubtagIterator& aOther) const {
+ return !(*this == aOther);
+ }
+
+ value_type operator*() const { return MakeStringSpan(mIter->get()); }
+ };
+
+ /**
+ * Read-only enumeration over a |Vector<UniqueChars, N>|, exposing the
+ * elements as |Span<const char>|.
+ *
+ * Only a reference to the vector is stored, so the enumeration must not be
+ * used after the vector has been destroyed.
+ */
+ template <size_t N>
+ class SubtagEnumeration {
+ using Vec = Vector<UniqueChars, N>;
+
+ const Vec& mVector;
+
+ public:
+ explicit SubtagEnumeration(const Vec& aVector) : mVector(aVector) {}
+
+ size_t length() const { return mVector.length(); }
+ bool empty() const { return mVector.empty(); }
+
+ auto begin() const { return SubtagIterator<Vec>(mVector.begin()); }
+ auto end() const { return SubtagIterator<Vec>(mVector.end()); }
+
+ Span<const char> operator[](size_t aIndex) const {
+ return MakeStringSpan(mVector[aIndex].get());
+ }
+ };
+
+ // Accessors for the individual subtags. Variants() and Extensions() return
+ // enumerations which reference the vectors stored in this Locale.
+ const LanguageSubtag& Language() const { return mLanguage; }
+ const ScriptSubtag& Script() const { return mScript; }
+ const RegionSubtag& Region() const { return mRegion; }
+ auto Variants() const { return SubtagEnumeration(mVariants); }
+ auto Extensions() const { return SubtagEnumeration(mExtensions); }
+
+ /**
+ * Return the private-use subtag or Nothing if no private-use subtag is
+ * stored.
+ */
+ Maybe<Span<const char>> PrivateUse() const {
+ if (const char* p = mPrivateUse.get()) {
+ return Some(MakeStringSpan(p));
+ }
+ return Nothing();
+ }
+
+ /**
+ * Return the Unicode extension subtag or Nothing if not present.
+ */
+ Maybe<Span<const char>> GetUnicodeExtension() const;
+
+ private:
+ ptrdiff_t UnicodeExtensionIndex() const;
+
+ public:
+ /**
+ * Set the language subtag. The input must be a valid language subtag.
+ *
+ * |N| includes the null-terminator of the string literal.
+ */
+ template <size_t N>
+ void SetLanguage(const char (&aLanguage)[N]) {
+ mozilla::Span<const char> span(aLanguage, N - 1);
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
+ mLanguage.Set(span);
+ }
+
+ /**
+ * Set the language subtag. The input must be a valid language subtag.
+ */
+ void SetLanguage(const LanguageSubtag& aLanguage) {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(aLanguage.Span()));
+ mLanguage.Set(aLanguage.Span());
+ }
+
+ /**
+ * Set the script subtag. The input must be a valid script subtag.
+ *
+ * |N| includes the null-terminator of the string literal.
+ */
+ template <size_t N>
+ void SetScript(const char (&aScript)[N]) {
+ mozilla::Span<const char> span(aScript, N - 1);
+ MOZ_ASSERT(IsStructurallyValidScriptTag(span));
+ mScript.Set(span);
+ }
+
+ /**
+ * Set the script subtag. The input must be a valid script subtag or the empty
+ * string.
+ */
+ void SetScript(const ScriptSubtag& aScript) {
+ MOZ_ASSERT(aScript.Missing() ||
+ IsStructurallyValidScriptTag(aScript.Span()));
+ mScript.Set(aScript.Span());
+ }
+
+ /**
+ * Set the region subtag. The input must be a valid region subtag.
+ *
+ * |N| includes the null-terminator of the string literal.
+ */
+ template <size_t N>
+ void SetRegion(const char (&aRegion)[N]) {
+ mozilla::Span<const char> span(aRegion, N - 1);
+ MOZ_ASSERT(IsStructurallyValidRegionTag(span));
+ mRegion.Set(span);
+ }
+
+ /**
+ * Set the region subtag. The input must be a valid region subtag or the
+ * empty string.
+ */
+ void SetRegion(const RegionSubtag& aRegion) {
+ MOZ_ASSERT(aRegion.Missing() ||
+ IsStructurallyValidRegionTag(aRegion.Span()));
+ mRegion.Set(aRegion.Span());
+ }
+
+ /**
+ * Removes all variant subtags.
+ */
+ void ClearVariants() { mVariants.clearAndFree(); }
+
+ /**
+ * Set the Unicode extension subtag. The input must be a valid Unicode
+ * extension subtag.
+ */
+ ICUResult SetUnicodeExtension(Span<const char> aExtension);
+
+ /**
+ * Remove any Unicode extension subtag if present.
+ */
+ void ClearUnicodeExtension();
+
+ /** Canonicalize the base-name (language, script, region, variant) subtags. */
+ Result<Ok, CanonicalizationError> CanonicalizeBaseName();
+
+ /**
+ * Canonicalize all extension subtags.
+ */
+ Result<Ok, CanonicalizationError> CanonicalizeExtensions();
+
+ /**
+ * Canonicalizes the given structurally valid Unicode BCP 47 locale
+ * identifier, including regularized case of subtags. For example, the
+ * locale Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
+ * where
+ *
+ * Zh ; 2*3ALPHA
+ * -haNS ; ["-" script]
+ * -bu ; ["-" region]
+ * -variant2 ; *("-" variant)
+ * -Variant1
+ * -u-ca-chinese ; *("-" extension)
+ * -t-Zh-laTN
+ * -x-PRIVATE ; ["-" privateuse]
+ *
+ * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
+ *
+ * Spec: ECMAScript Internationalization API Specification, 6.2.3.
+ */
+ Result<Ok, CanonicalizationError> Canonicalize() {
+ // Extensions are only canonicalized when the base name succeeded.
+ MOZ_TRY(CanonicalizeBaseName());
+ return CanonicalizeExtensions();
+ }
+
+ /**
+ * Fill the buffer with a string representation of the locale.
+ *
+ * |B| must provide a |CharType| member type equal to |char| as well as the
+ * |reserve()|, |data()| and |written()| methods used below. Returns
+ * |ICUError::OutOfMemory| if reserving the required capacity fails.
+ */
+ template <typename B>
+ ICUResult ToString(B& aBuffer) const {
+ static_assert(std::is_same_v<typename B::CharType, char>);
+
+ size_t capacity = ToStringCapacity();
+
+ // Attempt to reserve needed capacity
+ if (!aBuffer.reserve(capacity)) {
+ return Err(ICUError::OutOfMemory);
+ }
+
+ size_t offset = ToStringAppend(aBuffer.data());
+
+ // The number of characters written must match the precomputed capacity.
+ MOZ_ASSERT(capacity == offset);
+ aBuffer.written(offset);
+
+ return Ok();
+ }
+
+ /**
+ * Add likely-subtags to the locale.
+ *
+ * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
+ */
+ ICUResult AddLikelySubtags();
+
+ /**
+ * Remove likely-subtags from the locale.
+ *
+ * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
+ */
+ ICUResult RemoveLikelySubtags();
+
+ /**
+ * Returns the default locale as an ICU locale identifier. The returned string
+ * is NOT a valid BCP 47 locale!
+ *
+ * Also see <https://unicode-org.github.io/icu/userguide/locale>.
+ */
+ static const char* GetDefaultLocale() { return uloc_getDefault(); }
+
+ /**
+ * Returns an iterator over all supported locales.
+ *
+ * The returned strings are ICU locale identifiers and NOT BCP 47 language
+ * tags.
+ *
+ * Also see <https://unicode-org.github.io/icu/userguide/locale>.
+ */
+ static auto GetAvailableLocales() {
+ return AvailableLocalesEnumeration<uloc_countAvailable,
+ uloc_getAvailable>();
+ }
+
+ private:
+ static UniqueChars DuplicateStringToUniqueChars(const char* aStr);
+ static UniqueChars DuplicateStringToUniqueChars(Span<const char> aStr);
+ // Helpers for ToString(): ToStringCapacity() provides the buffer size to
+ // reserve and ToStringAppend() returns the number of characters written.
+ size_t ToStringCapacity() const;
+ size_t ToStringAppend(char* aBuffer) const;
+};
+
+/**
+ * Parser for Unicode BCP 47 locale identifiers.
+ *
+ * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
+ *
+ * The constructor is private; the public interface consists of the static
+ * functions at the end of the class.
+ */
+class MOZ_STACK_CLASS LocaleParser final {
+ public:
+ enum class ParserError : uint8_t {
+ // Input was not parseable as a locale, subtag or extension.
+ NotParseable,
+ // Unable to allocate memory for the parser to operate.
+ OutOfMemory,
+ };
+
+ // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
+ //
+ // The kinds are bit flags: |AlphaDigit| is the combination of the |Alpha|
+ // and |Digit| bits.
+ enum class TokenKind : uint8_t {
+ None = 0b000,
+ Alpha = 0b001,
+ Digit = 0b010,
+ AlphaDigit = 0b011,
+ Error = 0b100
+ };
+
+ private:
+ // A token is described by its kind together with the index and length of
+ // its characters within the parser input.
+ class Token final {
+ size_t mIndex;
+ size_t mLength;
+ TokenKind mKind;
+
+ public:
+ Token(TokenKind aKind, size_t aIndex, size_t aLength)
+ : mIndex(aIndex), mLength(aLength), mKind(aKind) {}
+
+ TokenKind Kind() const { return mKind; }
+ size_t Index() const { return mIndex; }
+ size_t Length() const { return mLength; }
+
+ // The predicates below test for an exact kind, e.g. IsAlpha() is false for
+ // an |AlphaDigit| token.
+ bool IsError() const { return mKind == TokenKind::Error; }
+ bool IsNone() const { return mKind == TokenKind::None; }
+ bool IsAlpha() const { return mKind == TokenKind::Alpha; }
+ bool IsDigit() const { return mKind == TokenKind::Digit; }
+ bool IsAlphaDigit() const { return mKind == TokenKind::AlphaDigit; }
+ };
+
+ // The parser input (not owned), its length, and the current position.
+ const char* mLocale;
+ size_t mLength;
+ size_t mIndex = 0;
+
+ explicit LocaleParser(Span<const char> aLocale)
+ : mLocale(aLocale.data()), mLength(aLocale.size()) {}
+
+ // Return the input character at |aIndex|. No bounds check is performed.
+ char CharAt(size_t aIndex) const { return mLocale[aIndex]; }
+
+ // Copy the token characters into |aSubtag|.
+ template <size_t N>
+ void CopyChars(const Token& aTok, LanguageTagSubtag<N>& aSubtag) const {
+ aSubtag.Set(mozilla::Span(mLocale + aTok.Index(), aTok.Length()));
+ }
+
+ // Create a string copy of |aLength| characters starting at |aIndex|.
+ UniqueChars Chars(size_t aIndex, size_t aLength) const;
+
+ // Create a string copy of the token characters.
+ UniqueChars Chars(const Token& aTok) const {
+ return Chars(aTok.Index(), aTok.Length());
+ }
+
+ // Create a string copy of the characters starting at |aStart| and ending one
+ // character before the start of |aEnd|. |aStart| must precede |aEnd|.
+ UniqueChars Extension(const Token& aStart, const Token& aEnd) const {
+ MOZ_ASSERT(aStart.Index() < aEnd.Index());
+
+ // The character directly in front of |aEnd| is excluded; presumably this
+ // is the subtag separator - confirm against NextToken().
+ size_t length = aEnd.Index() - 1 - aStart.Index();
+ return Chars(aStart.Index(), length);
+ }
+
+ Token NextToken();
+
+ // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+ //
+ // Four character language subtags are not allowed in Unicode BCP 47 locale
+ // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
+ // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
+ bool IsLanguage(const Token& aTok) const {
+ return aTok.IsAlpha() && ((2 <= aTok.Length() && aTok.Length() <= 3) ||
+ (5 <= aTok.Length() && aTok.Length() <= 8));
+ }
+
+ // unicode_script_subtag = alpha{4} ;
+ bool IsScript(const Token& aTok) const {
+ return aTok.IsAlpha() && aTok.Length() == 4;
+ }
+
+ // unicode_region_subtag = (alpha{2} | digit{3}) ;
+ bool IsRegion(const Token& aTok) const {
+ return (aTok.IsAlpha() && aTok.Length() == 2) ||
+ (aTok.IsDigit() && aTok.Length() == 3);
+ }
+
+ // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
+ //
+ // This and the following predicates only inspect the token length (and
+ // selected characters), not the token kind.
+ // NOTE(review): presumably NextToken() only produces alphanumeric tokens -
+ // confirm.
+ bool IsVariant(const Token& aTok) const {
+ return (5 <= aTok.Length() && aTok.Length() <= 8) ||
+ (aTok.Length() == 4 && mozilla::IsAsciiDigit(CharAt(aTok.Index())));
+ }
+
+ // Returns the code unit of the first character at the given singleton token.
+ // Always returns the lower case form of an alphabetical character.
+ char SingletonKey(const Token& aTok) const {
+ MOZ_ASSERT(aTok.Length() == 1);
+ return AsciiToLowerCase(CharAt(aTok.Index()));
+ }
+
+ // extensions = unicode_locale_extensions |
+ // transformed_extensions |
+ // other_extensions ;
+ //
+ // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
+ // (sep attribute)+ (sep keyword)*) ;
+ //
+ // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
+ // (sep tfield)+) ;
+ //
+ // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+ //
+ // Any singleton other than 'x' (case-insensitive) starts an extension.
+ bool IsExtensionStart(const Token& aTok) const {
+ return aTok.Length() == 1 && SingletonKey(aTok) != 'x';
+ }
+
+ // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+ bool IsOtherExtensionPart(const Token& aTok) const {
+ return 2 <= aTok.Length() && aTok.Length() <= 8;
+ }
+
+ // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
+ // (sep attribute)+ (sep keyword)*) ;
+ // keyword = key (sep type)? ;
+ bool IsUnicodeExtensionPart(const Token& aTok) const {
+ return IsUnicodeExtensionKey(aTok) || IsUnicodeExtensionType(aTok) ||
+ IsUnicodeExtensionAttribute(aTok);
+ }
+
+ // attribute = alphanum{3,8} ;
+ bool IsUnicodeExtensionAttribute(const Token& aTok) const {
+ return 3 <= aTok.Length() && aTok.Length() <= 8;
+ }
+
+ // key = alphanum alpha ;
+ //
+ // Only the second character is inspected explicitly.
+ bool IsUnicodeExtensionKey(const Token& aTok) const {
+ return aTok.Length() == 2 &&
+ mozilla::IsAsciiAlpha(CharAt(aTok.Index() + 1));
+ }
+
+ // type = alphanum{3,8} (sep alphanum{3,8})* ;
+ //
+ // Tests a single subtag of the type.
+ bool IsUnicodeExtensionType(const Token& aTok) const {
+ return 3 <= aTok.Length() && aTok.Length() <= 8;
+ }
+
+ // tkey = alpha digit ;
+ bool IsTransformExtensionKey(const Token& aTok) const {
+ return aTok.Length() == 2 && mozilla::IsAsciiAlpha(CharAt(aTok.Index())) &&
+ mozilla::IsAsciiDigit(CharAt(aTok.Index() + 1));
+ }
+
+ // tvalue = (sep alphanum{3,8})+ ;
+ //
+ // Tests a single subtag of the value.
+ bool IsTransformExtensionPart(const Token& aTok) const {
+ return 3 <= aTok.Length() && aTok.Length() <= 8;
+ }
+
+ // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+ bool IsPrivateUseStart(const Token& aTok) const {
+ return aTok.Length() == 1 && SingletonKey(aTok) == 'x';
+ }
+
+ // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+ bool IsPrivateUsePart(const Token& aTok) const {
+ return 1 <= aTok.Length() && aTok.Length() <= 8;
+ }
+
+ // Helper function for use in |ParseBaseName| and
+ // |ParseTlangInTransformExtension|. Do not use this directly!
+ static Result<Ok, ParserError> InternalParseBaseName(
+ LocaleParser& aLocaleParser, Locale& aTag, Token& aTok);
+
+ // Parse the `unicode_language_id` production, i.e. the
+ // language/script/region/variants portion of a locale, into |aTag|.
+ // |aTok| must be the current token.
+ static Result<Ok, ParserError> ParseBaseName(LocaleParser& aLocaleParser,
+ Locale& aTag, Token& aTok) {
+ return InternalParseBaseName(aLocaleParser, aTag, aTok);
+ }
+
+ // Parse the `tlang` production within a parsed 't' transform extension.
+ // The precise requirements for "previously parsed" are:
+ //
+ // * the input begins from current token |aTok| with a valid `tlang`
+ // * the `tlang` is wholly lowercase (*not* canonical case)
+ // * variant subtags in the `tlang` may contain duplicates and be
+ // unordered
+ //
+ // Return an error on internal failure. Otherwise, return a success value. If
+ // there was no `tlang`, then |aTag.Language().Missing()|. But if there was a
+ // `tlang`, then |aTag| is filled with subtags exactly as they appeared in the
+ // parse input.
+ static Result<Ok, ParserError> ParseTlangInTransformExtension(
+ LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) {
+ MOZ_ASSERT(aLocaleParser.IsLanguage(aTok));
+ return InternalParseBaseName(aLocaleParser, aTag, aTok);
+ }
+
+ friend class Locale;
+
+ // Position and length of a component within a parsed extension string.
+ class Range final {
+ size_t mBegin;
+ size_t mLength;
+
+ public:
+ Range(size_t aBegin, size_t aLength) : mBegin(aBegin), mLength(aLength) {}
+
+ size_t Begin() const { return mBegin; }
+ size_t Length() const { return mLength; }
+ };
+
+ using TFieldVector = Vector<Range, 8>;
+ using AttributesVector = Vector<Range, 8>;
+ using KeywordsVector = Vector<Range, 8>;
+
+ // Parse |aExtension|, which must be a validated, fully lowercase
+ // `transformed_extensions` subtag, and fill |aTag| and |aFields| from the
+ // `tlang` and `tfield` components. Data in |aTag| is lowercase, consistent
+ // with |aExtension|.
+ static Result<Ok, ParserError> ParseTransformExtension(
+ mozilla::Span<const char> aExtension, Locale& aTag,
+ TFieldVector& aFields);
+
+ // Parse |aExtension|, which must be a validated, fully lowercase
+ // `unicode_locale_extensions` subtag, and fill |aAttributes| and |aKeywords|
+ // from the `attribute` and `keyword` components.
+ static Result<Ok, ParserError> ParseUnicodeExtension(
+ mozilla::Span<const char> aExtension, AttributesVector& aAttributes,
+ KeywordsVector& aKeywords);
+
+ public:
+ // Parse the input string as a locale.
+ //
+ // NOTE: |aTag| must be a new, empty Locale.
+ static Result<Ok, ParserError> TryParse(Span<const char> aLocale,
+ Locale& aTag);
+
+ // Parse the input string as the base-name parts (language, script, region,
+ // variants) of a locale.
+ //
+ // NOTE: |aTag| must be a new, empty Locale.
+ static Result<Ok, ParserError> TryParseBaseName(Span<const char> aLocale,
+ Locale& aTag);
+
+ // Return Ok() iff |aExtension| can be parsed as a Unicode extension subtag.
+ static Result<Ok, ParserError> CanParseUnicodeExtension(
+ Span<const char> aExtension);
+
+ // Return Ok() iff |aUnicodeType| can be parsed as a Unicode extension type.
+ static Result<Ok, ParserError> CanParseUnicodeExtensionType(
+ Span<const char> aUnicodeType);
+};
+
+// Enable bitwise operators for the |TokenKind| bit flags. The macro is used at
+// namespace scope, which is why |TokenKind| is a public member of
+// LocaleParser.
+MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LocaleParser::TokenKind)
+
+} // namespace mozilla::intl
+
+#endif /* intl_components_Locale_h */