/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/intl/Locale.h"

#include "mozilla/Assertions.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/MathAlgorithms.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Variant.h"

#include "ICU4CGlue.h"

// NOTE(review): the next eight directives carry no header name in this copy
// of the file; the angle-bracketed names appear to have been lost.
// TODO: restore them from the canonical source.
#include
#include
#include
#include
#include
#include
#include
#include

#include "unicode/uloc.h"
#include "unicode/utypes.h"

namespace mozilla::intl {

using namespace intl::LanguageTagLimits;

// NOTE(review): throughout this file the text between angle brackets
// (template parameter lists, Span element types, explicit template arguments)
// appears to be missing. The bodies below use |CharT|, so these are presumably
// templates over the character type -- confirm against the canonical source.

// Returns true when |aLanguage| is 2-3 or 5-8 characters long and every
// character satisfies IsAsciiAlpha.
template bool IsStructurallyValidLanguageTag(Span aLanguage) {
  // unicode_language_subtag = alpha{2,3} | alpha{5,8};
  size_t length = aLanguage.size();
  const CharT* str = aLanguage.data();
  return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) &&
         std::all_of(str, str + length, IsAsciiAlpha);
}

// Explicit instantiations.
// NOTE(review): the three lines are textually identical here; presumably each
// named a different character type -- TODO confirm.
template bool IsStructurallyValidLanguageTag(Span aLanguage);
template bool IsStructurallyValidLanguageTag(Span aLanguage);
template bool IsStructurallyValidLanguageTag(Span aLanguage);

// Returns true when |aScript| is exactly 4 characters long and every character
// satisfies IsAsciiAlpha.
template bool IsStructurallyValidScriptTag(Span aScript) {
  // unicode_script_subtag = alpha{4} ;
  size_t length = aScript.size();
  const CharT* str = aScript.data();
  return length == 4 && std::all_of(str, str + length, IsAsciiAlpha);
}

// Explicit instantiations (see the review note above about identical lines).
template bool IsStructurallyValidScriptTag(Span aScript);
template bool IsStructurallyValidScriptTag(Span aScript);
template bool IsStructurallyValidScriptTag(Span aScript);

// Returns true when |aRegion| is either 2 characters that all satisfy
// IsAsciiAlpha, or 3 characters that all satisfy IsAsciiDigit.
template bool IsStructurallyValidRegionTag(Span aRegion) {
  // unicode_region_subtag = (alpha{2} | digit{3}) ;
  size_t length = aRegion.size();
  const CharT* str = aRegion.data();
  return (length == 2 && std::all_of(str, str + length, IsAsciiAlpha)) ||
         (length == 3 && std::all_of(str, str + length, IsAsciiDigit));
}

// Explicit instantiations (see the review note above about identical lines).
template bool IsStructurallyValidRegionTag(Span aRegion);
template bool IsStructurallyValidRegionTag(Span aRegion);
template bool
IsStructurallyValidRegionTag(Span aRegion);

#ifdef DEBUG
// Debug-only check for a single variant subtag: either 5-8 characters, or
// exactly 4 characters starting with a digit; in both cases every character
// must satisfy IsAsciiAlphanumeric.
bool IsStructurallyValidVariantTag(Span aVariant) {
  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
  size_t length = aVariant.size();
  const char* str = aVariant.data();
  return ((5 <= length && length <= 8) ||
          (length == 4 && IsAsciiDigit(str[0]))) &&
         std::all_of(str, str + length, IsAsciiAlphanumeric);
}

// Debug-only check that delegates to LocaleParser::CanParseUnicodeExtension
// and reports whether it succeeded.
bool IsStructurallyValidUnicodeExtensionTag(Span aExtension) {
  return LocaleParser::CanParseUnicodeExtension(aExtension).isOk();
}

// Debug-only check for a complete extension sequence: an alphanumeric
// singleton other than 'x'/'X', followed by '-', followed by one or more
// '-'-separated subtags of 2-8 alphanumeric characters each.
static bool IsStructurallyValidExtensionTag(Span aExtension) {
  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  // NB: Allow any extension, including Unicode and Transform here, because
  // this function is only used for an assertion.

  size_t length = aExtension.size();
  const char* str = aExtension.data();
  const char* const end = aExtension.data() + length;
  // Anything this short cannot hold a singleton, a separator and a subtag.
  if (length <= 2) {
    return false;
  }
  if (!IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') {
    return false;
  }
  // Step over the singleton, then require and step over the separator.
  str++;
  if (*str++ != '-') {
    return false;
  }
  // Validate each '-'-delimited subtag in turn; |sep| is null for the last one.
  // NOTE(review): the target type of the reinterpret_cast below is missing in
  // this copy; presumably |const char*| to match |sep| -- TODO confirm.
  while (true) {
    const char* sep =
        reinterpret_cast(memchr(str, '-', end - str));
    size_t len = (sep ? sep : end) - str;
    if (len < 2 || len > 8 ||
        !std::all_of(str, str + len, IsAsciiAlphanumeric)) {
      return false;
    }
    if (!sep) {
      return true;
    }
    str = sep + 1;
  }
}

// Debug-only check for a private-use sequence: 'x' or 'X', followed by '-',
// followed by one or more '-'-separated subtags of 1-8 alphanumeric
// characters each.
bool IsStructurallyValidPrivateUseTag(Span aPrivateUse) {
  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;

  size_t length = aPrivateUse.size();
  const char* str = aPrivateUse.data();
  const char* const end = aPrivateUse.data() + length;
  // Anything this short cannot hold the singleton, a separator and a subtag.
  if (length <= 2) {
    return false;
  }
  if (str[0] != 'x' && str[0] != 'X') {
    return false;
  }
  // Step over the singleton, then require and step over the separator.
  str++;
  if (*str++ != '-') {
    return false;
  }
  // Validate each '-'-delimited subtag in turn; |sep| is null for the last one.
  while (true) {
    const char* sep =
        reinterpret_cast(memchr(str, '-', end - str));
    size_t len = (sep ?
sep : end) - str;
    // Private-use subtags may be a single character long, hence |len == 0|
    // rather than a minimum of two.
    if (len == 0 || len > 8 ||
        !std::all_of(str, str + len, IsAsciiAlphanumeric)) {
      return false;
    }
    // No further separator: the subtag just validated was the last one.
    if (!sep) {
      return true;
    }
    str = sep + 1;
  }
}
#endif

// Returns the index in |mExtensions| of the first extension whose first
// character is 'u' or 'U', or -1 if there is none.
ptrdiff_t Locale::UnicodeExtensionIndex() const {
  // The extension subtags aren't necessarily sorted, so we can't use binary
  // search here.
  auto p = std::find_if(
      mExtensions.begin(), mExtensions.end(),
      [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; });
  if (p != mExtensions.end()) {
    return std::distance(mExtensions.begin(), p);
  }
  return -1;
}

// Returns a span over the stored Unicode extension string, or Nothing() when
// UnicodeExtensionIndex() finds none. The span refers to storage owned by
// |mExtensions|.
// NOTE(review): the return type reads |Maybe>| in this copy; its template
// arguments appear to be missing -- TODO restore.
Maybe> Locale::GetUnicodeExtension() const {
  ptrdiff_t index = UnicodeExtensionIndex();
  if (index >= 0) {
    return Some(MakeStringSpan(mExtensions[index].get()));
  }
  return Nothing();
}

// Stores a copy of |aExtension| as the Unicode extension: overwrites the
// existing entry when there is one, otherwise appends a new entry.
// Returns Err(ICUError::OutOfMemory) when appending fails.
ICUResult Locale::SetUnicodeExtension(Span aExtension) {
  MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(aExtension));

  // NOTE(review): the duplicate is not null-checked before being stored;
  // presumably the allocation in DuplicateStringToUniqueChars cannot return
  // null -- TODO confirm.
  auto duplicated = DuplicateStringToUniqueChars(aExtension);

  // Replace the existing Unicode extension subtag or append a new one.
  ptrdiff_t index = UnicodeExtensionIndex();
  if (index >= 0) {
    mExtensions[index] = std::move(duplicated);
    return Ok();
  }
  if (!mExtensions.append(std::move(duplicated))) {
    return Err(ICUError::OutOfMemory);
  }
  return Ok();
}

// Removes the Unicode extension from |mExtensions| if one is present;
// otherwise does nothing.
void Locale::ClearUnicodeExtension() {
  ptrdiff_t index = UnicodeExtensionIndex();
  if (index >= 0) {
    mExtensions.erase(mExtensions.begin() + index);
  }
}

// Sorts the strings owned by |aSubtags| into ascending strcmp() order.
// Returns false only when the temporary scratch vector cannot be sized;
// otherwise returns true.
// NOTE(review): the template parameter list and the Vector arguments appear to
// be missing in this copy -- TODO restore.
template static bool SortAlphabetically(Vector& aSubtags) {
  size_t length = aSubtags.length();

  // Zero or one element lists are already sorted.
  if (length < 2) {
    return true;
  }

  // Handle two element lists inline.
if (length == 2) {
    if (strcmp(aSubtags[0].get(), aSubtags[1].get()) > 0) {
      aSubtags[0].swap(aSubtags[1]);
    }
    return true;
  }

  // General case: sort a scratch vector of the raw string pointers and then
  // hand the pointers back in sorted order.
  // NOTE(review): the Vector template arguments appear to be missing in this
  // copy; the elements are assigned from release() and compared as
  // |const char*| below -- TODO restore.
  Vector scratch;
  if (!scratch.resizeUninitialized(length)) {
    return false;
  }
  // Ownership is released into |scratch| here and re-acquired after the sort,
  // so nothing between these two loops may return early.
  for (size_t i = 0; i < length; i++) {
    scratch[i] = aSubtags[i].release();
  }

  std::stable_sort(
      scratch.begin(), scratch.end(),
      [](const char* a, const char* b) { return strcmp(a, b) < 0; });

  for (size_t i = 0; i < length; i++) {
    aSubtags[i] = UniqueChars(scratch[i]);
  }
  return true;
}

// Canonicalizes the language, script, region and variant subtags in place:
// normalizes their case, sorts the variants, rejects duplicate variants, and
// then applies the deprecated-subtag mappings. Returns
// CanonicalizationError::OutOfMemory or ::DuplicateVariant on failure.
// NOTE(review): the Result template arguments appear to be missing in this
// copy -- TODO restore.
Result Locale::CanonicalizeBaseName() {
  // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to
  // canonicalize the syntax by normalizing the case and ordering all subtags.
  // The canonical syntax form is specified in UTS 35, 3.2.1.

  // Language codes need to be in lower case. "JA" -> "ja"
  mLanguage.ToLowerCase();
  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));

  // The first character of a script code needs to be capitalized.
  // "hans" -> "Hans"
  mScript.ToTitleCase();
  MOZ_ASSERT(Script().Missing() ||
             IsStructurallyValidScriptTag(Script().Span()));

  // Region codes need to be in upper case. "bu" -> "BU"
  mRegion.ToUpperCase();
  MOZ_ASSERT(Region().Missing() ||
             IsStructurallyValidRegionTag(Region().Span()));

  // The canonical case for variant subtags is lowercase.
  for (UniqueChars& variant : mVariants) {
    char* variantChars = variant.get();
    size_t variantLength = strlen(variantChars);
    // Lower-cased in place: source and destination are the same buffer.
    AsciiToLowerCase(variantChars, variantLength, variantChars);

    MOZ_ASSERT(IsStructurallyValidVariantTag({variantChars, variantLength}));
  }

  // Extensions and privateuse subtags are case normalized in the
  // |CanonicalizeExtensions| method.

  // The second step in UTS 35, 3.2.1, is to order all subtags.

  if (mVariants.length() > 1) {
    // 1. Any variants are in alphabetical order.
    if (!SortAlphabetically(mVariants)) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    // Reject the Locale identifier if a duplicate variant was found, e.g.
    // "en-variant-Variant".
const UniqueChars* duplicate = std::adjacent_find(
        mVariants.begin(), mVariants.end(), [](const auto& a, const auto& b) {
          return strcmp(a.get(), b.get()) == 0;
        });
    // The variants were sorted just above, so equal strings are adjacent.
    if (duplicate != mVariants.end()) {
      return Err(CanonicalizationError::DuplicateVariant);
    }
  }

  // 2. Any extensions are in alphabetical order by their singleton.
  // 3. All attributes are sorted in alphabetical order.
  // 4. All keywords and tfields are sorted by alphabetical order of their keys,
  //    within their respective extensions.
  // 5. Any type or tfield value "true" is removed.
  // - A subsequent call to CanonicalizeExtensions() will perform these steps.

  // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier
  // into its canonical form per UTS 35, 3.2.1.

  // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their
  // canonical forms.
  // - A subsequent call to CanonicalizeExtensions() will perform this step.

  // 2. Replace aliases in the unicode_language_id and tlang (if any).
  // - tlang is handled in CanonicalizeExtensions().

  // Replace deprecated language, region, and variant subtags with their
  // preferred mappings.

  if (!UpdateLegacyMappings()) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Replace deprecated language subtags with their preferred values.
  // The complex mapping is only consulted when the simple one did not apply.
  if (!LanguageMapping(mLanguage) && ComplexLanguageMapping(mLanguage)) {
    PerformComplexLanguageMappings();
  }

  // Replace deprecated script subtags with their preferred values.
  if (Script().Present()) {
    ScriptMapping(mScript);
  }

  // Replace deprecated region subtags with their preferred values.
  if (Region().Present()) {
    // As for languages: try the simple mapping first, then the complex one.
    if (!RegionMapping(mRegion) && ComplexRegionMapping(mRegion)) {
      PerformComplexRegionMappings();
    }
  }

  // Replace deprecated variant subtags with their preferred values.
  if (!PerformVariantMappings()) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // No extension replacements are currently present.
  // Private use sequences are left as is.

  // 3. Replace aliases in special key values.
// - A subsequent call to canonicalizeExtensions() will perform this step. return Ok(); } #ifdef DEBUG static bool IsAsciiLowercaseAlphanumericOrDash(Span aSpan) { const char* ptr = aSpan.data(); size_t length = aSpan.size(); return std::all_of(ptr, ptr + length, [](auto c) { return IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-'; }); } #endif Result Locale::CanonicalizeExtensions() { // The canonical case for all extension subtags is lowercase. for (UniqueChars& extension : mExtensions) { char* extensionChars = extension.get(); size_t extensionLength = strlen(extensionChars); AsciiToLowerCase(extensionChars, extensionLength, extensionChars); MOZ_ASSERT( IsStructurallyValidExtensionTag({extensionChars, extensionLength})); } // Any extensions are in alphabetical order by their singleton. // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" if (!SortAlphabetically(mExtensions)) { return Err(CanonicalizationError::OutOfMemory); } for (UniqueChars& extension : mExtensions) { if (extension[0] == 'u') { MOZ_TRY(CanonicalizeUnicodeExtension(extension)); } else if (extension[0] == 't') { MOZ_TRY(CanonicalizeTransformExtension(extension)); } MOZ_ASSERT( IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get()))); } // The canonical case for privateuse subtags is lowercase. if (char* privateuse = mPrivateUse.get()) { size_t privateuseLength = strlen(privateuse); AsciiToLowerCase(privateuse, privateuseLength, privateuse); MOZ_ASSERT( IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); } return Ok(); } template static inline bool AppendSpan(Vector& vector, Span aSpan) { return vector.append(aSpan.data(), aSpan.size()); } /** * CanonicalizeUnicodeExtension( attributes, keywords ) * * Canonical syntax per * : * * - All attributes and keywords are in lowercase. * - Note: The parser already converted keywords to lowercase. * - All attributes are sorted in alphabetical order. * - All keywords are sorted by alphabetical order of their keys. 
* - Any type value "true" is removed.
 *
 * Canonical form:
 * - All keys and types use the canonical form (from the name attribute;
 *   see Section 3.6.4 U Extension Data Files).
 *
 * On success |aUnicodeExtension| holds the canonical string; it is only
 * reallocated when canonicalization actually changed it.
 */
// NOTE(review): the Result and Span template arguments appear to be missing in
// this copy -- TODO restore.
Result Locale::CanonicalizeUnicodeExtension(
    UniqueChars& aUnicodeExtension) {
  Span extension = MakeStringSpan(aUnicodeExtension.get());
  MOZ_ASSERT(extension[0] == 'u');
  MOZ_ASSERT(extension[1] == '-');
  MOZ_ASSERT(IsStructurallyValidExtensionTag(extension));

  LocaleParser::AttributesVector attributes;
  LocaleParser::KeywordsVector keywords;

  using Attribute = LocaleParser::AttributesVector::ElementType;
  using Keyword = LocaleParser::KeywordsVector::ElementType;

  // The parser records each attribute and keyword as a (Begin, Length) range
  // into |extension| rather than copying the text.
  if (LocaleParser::ParseUnicodeExtension(extension, attributes, keywords)
          .isErr()) {
    MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag");
    return Err(CanonicalizationError::InternalError);
  }

  // Orders attributes by their full text.
  auto attributesLess = [extension](const Attribute& a, const Attribute& b) {
    auto astr = extension.Subspan(a.Begin(), a.Length());
    auto bstr = extension.Subspan(b.Begin(), b.Length());
    return astr < bstr;
  };

  // All attributes are sorted in alphabetical order.
  if (attributes.length() > 1) {
    std::stable_sort(attributes.begin(), attributes.end(), attributesLess);
  }

  // Orders keywords by their key only, i.e. the first |UnicodeKeyLength|
  // characters; the type value does not take part in the comparison.
  auto keywordsLess = [extension](const Keyword& a, const Keyword& b) {
    auto astr = extension.Subspan(a.Begin(), UnicodeKeyLength);
    auto bstr = extension.Subspan(b.Begin(), UnicodeKeyLength);
    return astr < bstr;
  };

  // All keywords are sorted by alphabetical order of keys.
  if (keywords.length() > 1) {
    // Using a stable sort algorithm guarantees that two keywords using the
    // same key are never reordered. That means for example
    // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to
    // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs
    // before "nu-latn".
    // This is required so that deduplication below preserves the first keyword
    // for a given key and discards the rest.
std::stable_sort(keywords.begin(), keywords.end(), keywordsLess);
  }

  // |sb| accumulates the canonical extension string, starting with the
  // singleton.
  // NOTE(review): the Vector template arguments appear to be missing in this
  // copy -- TODO restore.
  Vector sb;
  if (!sb.append('u')) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Append all Unicode extension attributes.
  for (size_t i = 0; i < attributes.length(); i++) {
    const auto& attribute = attributes[i];
    auto span = extension.Subspan(attribute.Begin(), attribute.Length());

    // Skip duplicate attributes.
    // The attributes are sorted, so a duplicate always equals its predecessor.
    if (i > 0) {
      const auto& lastAttribute = attributes[i - 1];
      if (span ==
          extension.Subspan(lastAttribute.Begin(), lastAttribute.Length())) {
        continue;
      }
      MOZ_ASSERT(attributesLess(lastAttribute, attribute));
    }

    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }
    if (!AppendSpan(sb, span)) {
      return Err(CanonicalizationError::OutOfMemory);
    }
  }

  // Length of a key plus the '-' separating it from its type value.
  static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1;

  // NOTE(review): the Span template arguments appear to be missing in this
  // copy -- TODO restore.
  using StringSpan = Span;

  static constexpr StringSpan True = MakeStringSpan("true");

  // Append all Unicode extension keywords.
  for (size_t i = 0; i < keywords.length(); i++) {
    const auto& keyword = keywords[i];

    // Skip duplicate keywords.
    // Only the keys are compared, so a later keyword with an already seen key
    // is dropped regardless of its type value.
    if (i > 0) {
      const auto& lastKeyword = keywords[i - 1];
      if (extension.Subspan(keyword.Begin(), UnicodeKeyLength) ==
          extension.Subspan(lastKeyword.Begin(), UnicodeKeyLength)) {
        continue;
      }
      MOZ_ASSERT(keywordsLess(lastKeyword, keyword));
    }

    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    StringSpan span = extension.Subspan(keyword.Begin(), keyword.Length());
    if (span.size() == UnicodeKeyLength) {
      // Keyword without type value.
      if (!AppendSpan(sb, span)) {
        return Err(CanonicalizationError::OutOfMemory);
      }
    } else {
      // Split the keyword into its key and the type following the separator.
      StringSpan key = span.To(UnicodeKeyLength);
      StringSpan type = span.From(UnicodeKeyWithSepLength);

      // Search if there's a replacement for the current Unicode keyword.
      if (const char* replacement = ReplaceUnicodeExtensionType(key, type)) {
        StringSpan repl = MakeStringSpan(replacement);
        if (repl == True) {
          // Elide the type "true" if present in the replacement.
if (!AppendSpan(sb, key)) {
            return Err(CanonicalizationError::OutOfMemory);
          }
        } else {
          // Otherwise append the Unicode key (including the separator) and the
          // replaced type.
          if (!AppendSpan(sb, span.To(UnicodeKeyWithSepLength))) {
            return Err(CanonicalizationError::OutOfMemory);
          }
          if (!AppendSpan(sb, repl)) {
            return Err(CanonicalizationError::OutOfMemory);
          }
        }
      } else {
        // No replacement exists for this key/type pair.
        if (type == True) {
          // Elide the Unicode extension type "true".
          if (!AppendSpan(sb, key)) {
            return Err(CanonicalizationError::OutOfMemory);
          }
        } else {
          // Otherwise append the complete Unicode extension keyword.
          if (!AppendSpan(sb, span)) {
            return Err(CanonicalizationError::OutOfMemory);
          }
        }
      }
    }
  }

  // We can keep the previous extension when canonicalization didn't modify it.
  // NOTE(review): the target type of the static_cast below is missing in this
  // copy; presumably a Span over |sb| comparable with |extension| -- TODO
  // confirm.
  if (static_cast>(sb) != extension) {
    // Otherwise replace the previous extension with the canonical extension.
    UniqueChars canonical = DuplicateStringToUniqueChars(sb);
    if (!canonical) {
      return Err(CanonicalizationError::OutOfMemory);
    }
    aUnicodeExtension = std::move(canonical);
  }

  return Ok();
}

// Appends the string form of |aTag| to |aBuffer|: the language, then the
// script, region, variants, extensions and private-use sequence where present,
// each preceded by '-'. Returns false as soon as an append fails.
// NOTE(review): the template parameter list appears to be missing in this
// copy; |Buffer| is presumably the template parameter -- TODO confirm.
template static bool LocaleToString(const Locale& aTag, Buffer& aBuffer) {
  // Appends a subtag object that exposes its text through Span().
  auto appendSubtag = [&aBuffer](const auto& subtag) {
    auto span = subtag.Span();
    MOZ_ASSERT(!span.empty());
    return aBuffer.append(span.data(), span.size());
  };

  // Appends a subtag that is already available as a span.
  auto appendSubtagSpan = [&aBuffer](Span subtag) {
    MOZ_ASSERT(!subtag.empty());
    return aBuffer.append(subtag.data(), subtag.size());
  };

  // Appends every subtag of a list, each preceded by '-'.
  auto appendSubtags = [&aBuffer, &appendSubtagSpan](const auto& subtags) {
    for (const auto& subtag : subtags) {
      if (!aBuffer.append('-') || !appendSubtagSpan(subtag)) {
        return false;
      }
    }
    return true;
  };

  // Append the language subtag.
  if (!appendSubtag(aTag.Language())) {
    return false;
  }

  // Append the script subtag if present.
  if (aTag.Script().Present()) {
    if (!aBuffer.append('-') || !appendSubtag(aTag.Script())) {
      return false;
    }
  }

  // Append the region subtag if present.
if (aTag.Region().Present()) {
    if (!aBuffer.append('-') || !appendSubtag(aTag.Region())) {
      return false;
    }
  }

  // Append the variant subtags if present.
  if (!appendSubtags(aTag.Variants())) {
    return false;
  }

  // Append the extensions subtags if present.
  if (!appendSubtags(aTag.Extensions())) {
    return false;
  }

  // Append the private-use subtag if present.
  if (auto privateuse = aTag.PrivateUse()) {
    if (!aBuffer.append('-') || !appendSubtagSpan(privateuse.value())) {
      return false;
    }
  }

  return true;
}

/**
 * CanonicalizeTransformExtension
 *
 * Canonical form per the Transform extension specification (the reference
 * link is missing in this copy of the file):
 *
 * - These subtags are all in lowercase (that is the canonical casing for these
 *   subtags), [...].
 *
 * And per UTS 35, 3.2.1 (the reference link is missing in this copy of the
 * file):
 *
 * - All keywords and tfields are sorted by alphabetical order of their keys,
 *   within their respective extensions.
 *
 * On success |aTransformExtension| holds the canonical string; it is only
 * reallocated when canonicalization actually changed it.
 */
// NOTE(review): the Result and Span template arguments appear to be missing in
// this copy -- TODO restore.
Result
Locale::CanonicalizeTransformExtension(UniqueChars& aTransformExtension) {
  Span extension = MakeStringSpan(aTransformExtension.get());
  MOZ_ASSERT(extension[0] == 't');
  MOZ_ASSERT(extension[1] == '-');
  MOZ_ASSERT(IsStructurallyValidExtensionTag(extension));

  // |tag| receives the tlang part; |fields| receives (Begin, Length) ranges
  // into |extension| for the tfields.
  Locale tag;
  LocaleParser::TFieldVector fields;

  using TField = LocaleParser::TFieldVector::ElementType;

  if (LocaleParser::ParseTransformExtension(extension, tag, fields).isErr()) {
    MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag");
    return Err(CanonicalizationError::InternalError);
  }

  // Orders tfields by their key only, i.e. the first |TransformKeyLength|
  // characters.
  auto tfieldLess = [extension](const TField& a, const TField& b) {
    auto astr = extension.Subspan(a.Begin(), TransformKeyLength);
    auto bstr = extension.Subspan(b.Begin(), TransformKeyLength);
    return astr < bstr;
  };

  // All tfields are sorted by alphabetical order of their keys.
  if (fields.length() > 1) {
    std::stable_sort(fields.begin(), fields.end(), tfieldLess);
  }

  // |sb| accumulates the canonical extension string, starting with the
  // singleton.
  // NOTE(review): the Vector template arguments appear to be missing in this
  // copy -- TODO restore.
  Vector sb;
  if (!sb.append('t')) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Append the language subtag if present.
  //
  // Replace aliases in tlang per the specification (the reference link is
  // missing in this copy of the file).
if (tag.Language().Present()) {
    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    MOZ_TRY(tag.CanonicalizeBaseName());

    // The canonical case for Transform extensions is lowercase per the
    // specification (the reference link is missing in this copy of the file).
    // Convert the two subtags which don't use lowercase for their canonical
    // syntax.
    tag.mScript.ToLowerCase();
    tag.mRegion.ToLowerCase();

    if (!LocaleToString(tag, sb)) {
      return Err(CanonicalizationError::OutOfMemory);
    }
  }

  // Length of a tkey plus the '-' separating it from its tvalue.
  static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1;

  // NOTE(review): the Span template arguments appear to be missing in this
  // copy -- TODO restore.
  using StringSpan = Span;

  // Append all fields.
  //
  // UTS 35, 3.2.1 specifies:
  // - Any type or tfield value "true" is removed.
  //
  // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore
  // this apparently invalid part of the UTS 35 specification and simply
  // append all `tfield` subtags.
  for (const auto& field : fields) {
    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    // Split the field into its key and the value following the separator.
    StringSpan span = extension.Subspan(field.Begin(), field.Length());
    StringSpan key = span.To(TransformKeyLength);
    StringSpan value = span.From(TransformKeyWithSepLength);

    // Search if there's a replacement for the current transform keyword.
    if (const char* replacement = ReplaceTransformExtensionType(key, value)) {
      // Keep the key (including the separator) and append the replaced value.
      if (!AppendSpan(sb, span.To(TransformKeyWithSepLength))) {
        return Err(CanonicalizationError::OutOfMemory);
      }
      if (!AppendSpan(sb, MakeStringSpan(replacement))) {
        return Err(CanonicalizationError::OutOfMemory);
      }
    } else {
      if (!AppendSpan(sb, span)) {
        return Err(CanonicalizationError::OutOfMemory);
      }
    }
  }

  // We can keep the previous extension when canonicalization didn't modify it.
  // NOTE(review): the target type of the static_cast below is missing in this
  // copy; presumably a Span over |sb| comparable with |extension| -- TODO
  // confirm.
  if (static_cast>(sb) != extension) {
    // Otherwise replace the previous extension with the canonical extension.
    UniqueChars canonical = DuplicateStringToUniqueChars(sb);
    if (!canonical) {
      return Err(CanonicalizationError::OutOfMemory);
    }
    aTransformExtension = std::move(canonical);
  }

  return Ok();
}

// Zero-terminated ICU Locale ID.
// NOTE(review): the Vector template arguments appear to be missing in this
// copy; later code reads |LocaleId::InlineLength| and appends |char| values,
// so this is presumably a char vector with inline storage -- TODO confirm.
using LocaleId = Vector;

// Selects which likely-subtags operation to perform.
enum class LikelySubtags : bool { Add, Remove };

// Return true iff the locale is already maximized resp. minimized.
static bool HasLikelySubtags(LikelySubtags aLikelySubtags, const Locale& aTag) {
  // The locale is already maximized if the language, script, and region
  // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are
  // used.
  if (aLikelySubtags == LikelySubtags::Add) {
    return !aTag.Language().EqualTo("und") &&
           (aTag.Script().Present() && !aTag.Script().EqualTo("Zzzz")) &&
           (aTag.Region().Present() && !aTag.Region().EqualTo("ZZ"));
  }

  // The locale is already minimized if it only contains a language
  // subtag whose value is not the placeholder value "und".
  return !aTag.Language().EqualTo("und") && aTag.Script().Missing() &&
         aTag.Region().Missing();
}

// Create an ICU locale ID from the given locale.
//
// Writes the language, then the script and region where present, joined by
// '_', into the empty |aLocale| and terminates it with a NUL character.
// Variants, extensions and private-use subtags are not written.
// Returns false as soon as an append fails.
static bool CreateLocaleForLikelySubtags(const Locale& aTag,
                                         LocaleId& aLocale) {
  MOZ_ASSERT(aLocale.length() == 0);

  // Appends a subtag object that exposes its text through Span().
  auto appendSubtag = [&aLocale](const auto& subtag) {
    auto span = subtag.Span();
    MOZ_ASSERT(!span.empty());
    return aLocale.append(span.data(), span.size());
  };

  // Append the language subtag.
  if (!appendSubtag(aTag.Language())) {
    return false;
  }

  // Append the script subtag if present.
  if (aTag.Script().Present()) {
    if (!aLocale.append('_') || !appendSubtag(aTag.Script())) {
      return false;
    }
  }

  // Append the region subtag if present.
  if (aTag.Region().Present()) {
    if (!aLocale.append('_') || !appendSubtag(aTag.Region())) {
      return false;
    }
  }

  // Zero-terminated for use with ICU.
return aLocale.append('\0');
}

// Maps a parser error onto the ICUError reported to callers: NotParseable
// becomes InternalError, OutOfMemory stays OutOfMemory.
static ICUError ParserErrorToICUError(LocaleParser::ParserError aErr) {
  using ParserError = LocaleParser::ParserError;

  switch (aErr) {
    case ParserError::NotParseable:
      return ICUError::InternalError;
    case ParserError::OutOfMemory:
      return ICUError::OutOfMemory;
  }
  // Only reached for a value outside the enumerators handled above.
  MOZ_CRASH("Unexpected parser error");
}

// Maps a canonicalization error onto the ICUError reported to callers:
// DuplicateVariant and InternalError become InternalError, OutOfMemory stays
// OutOfMemory.
static ICUError CanonicalizationErrorToICUError(
    Locale::CanonicalizationError aErr) {
  using CanonicalizationError = Locale::CanonicalizationError;

  switch (aErr) {
    case CanonicalizationError::DuplicateVariant:
    case CanonicalizationError::InternalError:
      return ICUError::InternalError;
    case CanonicalizationError::OutOfMemory:
      return ICUError::OutOfMemory;
  }
  // Only reached for a value outside the enumerators handled above.
  MOZ_CRASH("Unexpected canonicalization error");
}

// Assign the language, script, and region subtags from an ICU locale ID.
//
// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to
// retrieve these subtags, but unfortunately these functions are rather slow, so
// we use our own implementation.
//
// Note that |aLocaleId| is modified in place: '_' separators are replaced by
// '-' and "und" may be inserted at the front.
static ICUResult AssignFromLocaleId(LocaleId& aLocaleId, Locale& aTag) {
  // Replace the ICU locale ID separator.
  std::replace(aLocaleId.begin(), aLocaleId.end(), '_', '-');

  // ICU replaces "und" with the empty string, which means "und" becomes "" and
  // "und-Latn" becomes "-Latn". Handle this case separately.
  if (aLocaleId.empty() || aLocaleId[0] == '-') {
    static constexpr auto und = MakeStringSpan("und");
    constexpr size_t length = und.size();

    // Insert "und" in front of the locale ID.
if (!aLocaleId.growBy(length)) { return Err(ICUError::OutOfMemory); } memmove(aLocaleId.begin() + length, aLocaleId.begin(), aLocaleId.length()); memmove(aLocaleId.begin(), und.data(), length); } // Retrieve the language, script, and region subtags from the locale ID Locale localeTag; MOZ_TRY(LocaleParser::TryParseBaseName(aLocaleId, localeTag) .mapErr(ParserErrorToICUError)); aTag.SetLanguage(localeTag.Language()); aTag.SetScript(localeTag.Script()); aTag.SetRegion(localeTag.Region()); return Ok(); } template static ICUResult CallLikelySubtags(const LocaleId& aLocaleId, LocaleId& aResult) { // Locale ID must be zero-terminated before passing it to ICU. MOZ_ASSERT(aLocaleId.back() == '\0'); MOZ_ASSERT(aResult.length() == 0); // Ensure there's enough room for the result. MOZ_ALWAYS_TRUE(aResult.resize(LocaleId::InlineLength)); return FillBufferWithICUCall( aResult, [&aLocaleId](char* chars, int32_t size, UErrorCode* status) { return likelySubtagsFn(aLocaleId.begin(), chars, size, status); }); } // The canonical way to compute the Unicode BCP 47 locale identifier with likely // subtags is as follows: // // 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU // locale ID. // 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. // 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into // a Unicode BCP 47 locale identifier. // // Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow // and we know, by construction, that the input Unicode BCP 47 locale identifier // only contains valid language, script, and region subtags, we can avoid both // calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and // AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of // |Intl.Locale.prototype.maximize|.) static ICUResult LikelySubtags(LikelySubtags aLikelySubtags, Locale& aTag) { // Return early if the input is already maximized/minimized. 
if (HasLikelySubtags(aLikelySubtags, aTag)) {
    return Ok();
  }

  // Create the locale ID for the input argument.
  LocaleId locale;
  if (!CreateLocaleForLikelySubtags(aTag, locale)) {
    return Err(ICUError::OutOfMemory);
  }

  // Either add or remove likely subtags to/from the locale ID.
  // NOTE(review): both branches read identically in this copy; the explicit
  // template argument selecting the ICU function appears to be missing from
  // each call -- TODO restore.
  LocaleId localeLikelySubtags;
  if (aLikelySubtags == LikelySubtags::Add) {
    MOZ_TRY(
        CallLikelySubtags(locale, localeLikelySubtags));
  } else {
    MOZ_TRY(
        CallLikelySubtags(locale, localeLikelySubtags));
  }

  // Assign the language, script, and region subtags from the locale ID.
  MOZ_TRY(AssignFromLocaleId(localeLikelySubtags, aTag));

  // Update mappings in case ICU returned a non-canonical locale.
  MOZ_TRY(aTag.CanonicalizeBaseName().mapErr(CanonicalizationErrorToICUError));

  return Ok();
}

// Adds likely subtags to this locale in place (LikelySubtags::Add).
ICUResult Locale::AddLikelySubtags() {
  return LikelySubtags(LikelySubtags::Add, *this);
}

// Removes likely subtags from this locale in place (LikelySubtags::Remove).
ICUResult Locale::RemoveLikelySubtags() {
  return LikelySubtags(LikelySubtags::Remove, *this);
}

// Returns a newly allocated copy of the zero-terminated string |aStr|,
// including its terminator.
// NOTE(review): the MakeUnique template argument appears to be missing in this
// copy; presumably a char array -- TODO confirm.
UniqueChars Locale::DuplicateStringToUniqueChars(const char* aStr) {
  // |length| counts the terminating NUL, so memcpy copies it as well.
  size_t length = strlen(aStr) + 1;
  auto duplicate = MakeUnique(length);
  memcpy(duplicate.get(), aStr, length);
  return duplicate;
}

// Returns a newly allocated, zero-terminated copy of the characters in |aStr|.
// The span itself need not be zero-terminated.
// NOTE(review): the Span and MakeUnique template arguments appear to be
// missing in this copy -- TODO restore.
UniqueChars Locale::DuplicateStringToUniqueChars(Span aStr) {
  size_t length = aStr.size();
  // One extra character for the terminator written below.
  auto duplicate = MakeUnique(length + 1);
  memcpy(duplicate.get(), aStr.data(), length);
  duplicate[length] = '\0';
  return duplicate;
}

// Returns the number of characters ToStringAppend() writes for this locale:
// the length of every present subtag plus one '-' before each subtag other
// than the language. No terminator is counted.
size_t Locale::ToStringCapacity() const {
  // This is a bit awkward, the buffer class currently does not support
  // being resized, so we need to calculate the required size up front and
  // reserve it all at once.
auto lengthSubtag = [](const auto& subtag) { auto span = subtag.Span(); MOZ_ASSERT(!span.empty()); return span.size(); }; auto lengthSubtagZ = [](const char* subtag) { size_t length = strlen(subtag); MOZ_ASSERT(length > 0); return length; }; auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) { size_t length = 0; for (const auto& subtag : subtags) { length += lengthSubtagZ(subtag.get()) + 1; } return length; }; // First calculate required capacity size_t capacity = 0; capacity += lengthSubtag(mLanguage); if (mScript.Present()) { capacity += lengthSubtag(mScript) + 1; } if (mRegion.Present()) { capacity += lengthSubtag(mRegion) + 1; } capacity += lengthSubtagsZ(mVariants); capacity += lengthSubtagsZ(mExtensions); if (mPrivateUse.get()) { capacity += lengthSubtagZ(mPrivateUse.get()) + 1; } return capacity; } size_t Locale::ToStringAppend(char* aBuffer) const { // Current write position inside buffer. size_t offset = 0; auto appendHyphen = [&offset, &aBuffer]() { aBuffer[offset] = '-'; offset += 1; }; auto appendSubtag = [&offset, &aBuffer](const auto& subtag) { auto span = subtag.Span(); memcpy(aBuffer + offset, span.data(), span.size()); offset += span.size(); }; auto appendSubtagZ = [&offset, &aBuffer](const char* subtag) { size_t length = strlen(subtag); memcpy(aBuffer + offset, subtag, length); offset += length; }; auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) { for (const auto& subtag : subtags) { appendHyphen(); appendSubtagZ(subtag.get()); } }; // Append the language subtag. appendSubtag(mLanguage); // Append the script subtag if present. if (mScript.Present()) { appendHyphen(); appendSubtag(mScript); } // Append the region subtag if present. if (mRegion.Present()) { appendHyphen(); appendSubtag(mRegion); } // Append the variant subtags if present. appendSubtagsZ(mVariants); // Append the extensions subtags if present. appendSubtagsZ(mExtensions); // Append the private-use subtag if present. 
if (mPrivateUse.get()) { appendHyphen(); appendSubtagZ(mPrivateUse.get()); } return offset; } LocaleParser::Token LocaleParser::NextToken() { MOZ_ASSERT(mIndex <= mLength + 1, "called after 'None' token was read"); TokenKind kind = TokenKind::None; size_t tokenLength = 0; for (size_t i = mIndex; i < mLength; i++) { // UTS 35, section 3.1. // alpha = [A-Z a-z] ; // digit = [0-9] ; char c = CharAt(i); if (IsAsciiAlpha(c)) { kind |= TokenKind::Alpha; } else if (IsAsciiDigit(c)) { kind |= TokenKind::Digit; } else if (c == '-' && i > mIndex && i + 1 < mLength) { break; } else { return {TokenKind::Error, 0, 0}; } tokenLength += 1; } Token token{kind, mIndex, tokenLength}; mIndex += tokenLength + 1; return token; } UniqueChars LocaleParser::Chars(size_t aIndex, size_t aLength) const { // Add +1 to null-terminate the string. auto chars = MakeUnique(aLength + 1); char* dest = chars.get(); std::copy_n(mLocale + aIndex, aLength, dest); dest[aLength] = '\0'; return chars; } // Parse the `unicode_language_id` production. // // unicode_language_id = unicode_language_subtag // (sep unicode_script_subtag)? // (sep unicode_region_subtag)? // (sep unicode_variant_subtag)* ; // // sep = "-" // // Note: Unicode CLDR locale identifier backward compatibility extensions // removed from `unicode_language_id`. // // |tok| is the current token from |ts|. // // All subtags will be added unaltered to |tag|, without canonicalizing their // case or, in the case of variant subtags, detecting and rejecting duplicate // variants. Users must subsequently |CanonicalizeBaseName| to perform these // actions. // // Do not use this function directly: use |ParseBaseName| or // |ParseTlangFromTransformExtension| instead. Result LocaleParser::InternalParseBaseName( LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { if (aLocaleParser.IsLanguage(aTok)) { aLocaleParser.CopyChars(aTok, aTag.mLanguage); aTok = aLocaleParser.NextToken(); } else { // The language subtag is mandatory. 
return Err(ParserError::NotParseable); } if (aLocaleParser.IsScript(aTok)) { aLocaleParser.CopyChars(aTok, aTag.mScript); aTok = aLocaleParser.NextToken(); } if (aLocaleParser.IsRegion(aTok)) { aLocaleParser.CopyChars(aTok, aTag.mRegion); aTok = aLocaleParser.NextToken(); } auto& variants = aTag.mVariants; MOZ_ASSERT(variants.length() == 0); while (aLocaleParser.IsVariant(aTok)) { auto variant = aLocaleParser.Chars(aTok); if (!variants.append(std::move(variant))) { return Err(ParserError::OutOfMemory); } aTok = aLocaleParser.NextToken(); } return Ok(); } Result LocaleParser::TryParse( mozilla::Span aLocale, Locale& aTag) { // |aTag| must be a new, empty Locale. MOZ_ASSERT(aTag.Language().Missing()); MOZ_ASSERT(aTag.Script().Missing()); MOZ_ASSERT(aTag.Region().Missing()); MOZ_ASSERT(aTag.Variants().empty()); MOZ_ASSERT(aTag.Extensions().empty()); MOZ_ASSERT(aTag.PrivateUse().isNothing()); // unicode_locale_id = unicode_language_id // extensions* // pu_extensions? ; LocaleParser ts(aLocale); Token tok = ts.NextToken(); MOZ_TRY(ParseBaseName(ts, aTag, tok)); // extensions = unicode_locale_extensions // | transformed_extensions // | other_extensions ; // Bit set of seen singletons. uint64_t seenSingletons = 0; auto& extensions = aTag.mExtensions; while (ts.IsExtensionStart(tok)) { char singleton = ts.SingletonKey(tok); // Reject the input if a duplicate singleton was found. uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1); if (seenSingletons & hash) { return Err(ParserError::NotParseable); } seenSingletons |= hash; Token start = tok; tok = ts.NextToken(); // We'll check for missing non-singleton subtags after this block by // comparing |startValue| with the then-current position. 
size_t startValue = tok.Index(); if (singleton == 'u') { while (ts.IsUnicodeExtensionPart(tok)) { tok = ts.NextToken(); } } else if (singleton == 't') { // transformed_extensions = sep [tT] // ((sep tlang (sep tfield)*) // | (sep tfield)+) ; // tlang = unicode_language_subtag // (sep unicode_script_subtag)? // (sep unicode_region_subtag)? // (sep unicode_variant_subtag)* ; if (ts.IsLanguage(tok)) { tok = ts.NextToken(); if (ts.IsScript(tok)) { tok = ts.NextToken(); } if (ts.IsRegion(tok)) { tok = ts.NextToken(); } while (ts.IsVariant(tok)) { tok = ts.NextToken(); } } // tfield = tkey tvalue; while (ts.IsTransformExtensionKey(tok)) { tok = ts.NextToken(); size_t startTValue = tok.Index(); while (ts.IsTransformExtensionPart(tok)) { tok = ts.NextToken(); } // `tfield` requires at least one `tvalue`. if (tok.Index() <= startTValue) { return Err(ParserError::NotParseable); } } } else { while (ts.IsOtherExtensionPart(tok)) { tok = ts.NextToken(); } } // Singletons must be followed by a non-singleton subtag, "en-a-b" is not // allowed. if (tok.Index() <= startValue) { return Err(ParserError::NotParseable); } UniqueChars extension = ts.Extension(start, tok); if (!extensions.append(std::move(extension))) { return Err(ParserError::OutOfMemory); } } // Trailing `pu_extension` component of the `unicode_locale_id` production. if (ts.IsPrivateUseStart(tok)) { Token start = tok; tok = ts.NextToken(); size_t startValue = tok.Index(); while (ts.IsPrivateUsePart(tok)) { tok = ts.NextToken(); } // There must be at least one subtag after the "-x-". if (tok.Index() <= startValue) { return Err(ParserError::NotParseable); } UniqueChars privateUse = ts.Extension(start, tok); aTag.mPrivateUse = std::move(privateUse); } if (!tok.IsNone()) { return Err(ParserError::NotParseable); } return Ok(); } Result LocaleParser::TryParseBaseName( Span aLocale, Locale& aTag) { // |aTag| must be a new, empty Locale. 
MOZ_ASSERT(aTag.Language().Missing()); MOZ_ASSERT(aTag.Script().Missing()); MOZ_ASSERT(aTag.Region().Missing()); MOZ_ASSERT(aTag.Variants().empty()); MOZ_ASSERT(aTag.Extensions().empty()); MOZ_ASSERT(aTag.PrivateUse().isNothing()); LocaleParser ts(aLocale); Token tok = ts.NextToken(); MOZ_TRY(ParseBaseName(ts, aTag, tok)); if (!tok.IsNone()) { return Err(ParserError::NotParseable); } return Ok(); } // Parse |aExtension|, which must be a valid `transformed_extensions` subtag, // and fill |aTag| and |aFields| from the `tlang` and `tfield` components. Result LocaleParser::ParseTransformExtension( Span aExtension, Locale& aTag, TFieldVector& aFields) { LocaleParser ts(aExtension); Token tok = ts.NextToken(); if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 't') { return Err(ParserError::NotParseable); } tok = ts.NextToken(); if (tok.IsNone()) { return Err(ParserError::NotParseable); } if (ts.IsLanguage(tok)) { // We're parsing a possible `tlang` in a known-valid transform extension, so // use the special-purpose function that takes advantage of this to compute // lowercased |tag| contents in an optimal manner. MOZ_TRY(ParseTlangInTransformExtension(ts, aTag, tok)); // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end // of the transform extension. MOZ_ASSERT(ts.IsTransformExtensionKey(tok) || tok.IsNone()); } else { // If there's no `tlang` subtag, at least one `tfield` must be present. MOZ_ASSERT(ts.IsTransformExtensionKey(tok)); } // Trailing `tfield` subtags. (Any other trailing subtags are an error, // because we're guaranteed to only see a valid tranform extension here.) while (ts.IsTransformExtensionKey(tok)) { size_t begin = tok.Index(); tok = ts.NextToken(); size_t startTValue = tok.Index(); while (ts.IsTransformExtensionPart(tok)) { tok = ts.NextToken(); } // `tfield` requires at least one `tvalue`. 
if (tok.Index() <= startTValue) { return Err(ParserError::NotParseable); } size_t length = tok.Index() - 1 - begin; if (!aFields.emplaceBack(begin, length)) { return Err(ParserError::OutOfMemory); } } if (!tok.IsNone()) { return Err(ParserError::NotParseable); } return Ok(); } // Parse |aExtension|, which must be a valid `unicode_locale_extensions` subtag, // and fill |aAttributes| and |aKeywords| from the `attribute` and `keyword` // components. Result LocaleParser::ParseUnicodeExtension( Span aExtension, AttributesVector& aAttributes, KeywordsVector& aKeywords) { LocaleParser ts(aExtension); Token tok = ts.NextToken(); // unicode_locale_extensions = sep [uU] ((sep keyword)+ | // (sep attribute)+ (sep keyword)*) ; if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { return Err(ParserError::NotParseable); } tok = ts.NextToken(); if (tok.IsNone()) { return Err(ParserError::NotParseable); } while (ts.IsUnicodeExtensionAttribute(tok)) { if (!aAttributes.emplaceBack(tok.Index(), tok.Length())) { return Err(ParserError::OutOfMemory); } tok = ts.NextToken(); } // keyword = key (sep type)? 
; while (ts.IsUnicodeExtensionKey(tok)) { size_t begin = tok.Index(); tok = ts.NextToken(); while (ts.IsUnicodeExtensionType(tok)) { tok = ts.NextToken(); } if (tok.IsError()) { return Err(ParserError::NotParseable); } size_t length = tok.Index() - 1 - begin; if (!aKeywords.emplaceBack(begin, length)) { return Err(ParserError::OutOfMemory); } } if (!tok.IsNone()) { return Err(ParserError::NotParseable); } return Ok(); } Result LocaleParser::CanParseUnicodeExtension( Span aExtension) { LocaleParser ts(aExtension); Token tok = ts.NextToken(); // unicode_locale_extensions = sep [uU] ((sep keyword)+ | // (sep attribute)+ (sep keyword)*) ; if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { return Err(ParserError::NotParseable); } tok = ts.NextToken(); if (tok.IsNone()) { return Err(ParserError::NotParseable); } while (ts.IsUnicodeExtensionAttribute(tok)) { tok = ts.NextToken(); } // keyword = key (sep type)? ; while (ts.IsUnicodeExtensionKey(tok)) { tok = ts.NextToken(); while (ts.IsUnicodeExtensionType(tok)) { tok = ts.NextToken(); } if (tok.IsError()) { return Err(ParserError::NotParseable); } } if (!tok.IsNone()) { return Err(ParserError::OutOfMemory); } return Ok(); } Result LocaleParser::CanParseUnicodeExtensionType(Span aUnicodeType) { MOZ_ASSERT(!aUnicodeType.empty(), "caller must exclude empty strings"); LocaleParser ts(aUnicodeType); Token tok = ts.NextToken(); while (ts.IsUnicodeExtensionType(tok)) { tok = ts.NextToken(); } if (!tok.IsNone()) { return Err(ParserError::NotParseable); } return Ok(); } } // namespace mozilla::intl