diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /intl/components/src/Locale.cpp | |
parent | Initial commit. (diff) | |
download | thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/components/src/Locale.cpp')
-rw-r--r-- | intl/components/src/Locale.cpp | 1471 |
1 files changed, 1471 insertions, 0 deletions
diff --git a/intl/components/src/Locale.cpp b/intl/components/src/Locale.cpp new file mode 100644 index 0000000000..9a043518cf --- /dev/null +++ b/intl/components/src/Locale.cpp @@ -0,0 +1,1471 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/Locale.h" + +#include "mozilla/Assertions.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" +#include "mozilla/Variant.h" + +#include "ICU4CGlue.h" + +#include <algorithm> +#include <iterator> +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <string.h> +#include <type_traits> +#include <utility> + +#include "unicode/uloc.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +using namespace intl::LanguageTagLimits; + +template <typename CharT> +bool IsStructurallyValidLanguageTag(Span<const CharT> aLanguage) { + // unicode_language_subtag = alpha{2,3} | alpha{5,8}; + size_t length = aLanguage.size(); + const CharT* str = aLanguage.data(); + return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && + std::all_of(str, str + length, IsAsciiAlpha<CharT>); +} + +template bool IsStructurallyValidLanguageTag(Span<const char> aLanguage); +template bool IsStructurallyValidLanguageTag(Span<const Latin1Char> aLanguage); +template bool IsStructurallyValidLanguageTag(Span<const char16_t> aLanguage); + +template <typename CharT> +bool IsStructurallyValidScriptTag(Span<const CharT> aScript) { + // unicode_script_subtag = alpha{4} ; + size_t length = aScript.size(); + const CharT* str = aScript.data(); + return length == 4 && std::all_of(str, str + length, IsAsciiAlpha<CharT>); +} + +template bool IsStructurallyValidScriptTag(Span<const char> aScript); +template bool IsStructurallyValidScriptTag(Span<const Latin1Char> aScript); +template bool IsStructurallyValidScriptTag(Span<const char16_t> aScript); + +template <typename CharT> +bool IsStructurallyValidRegionTag(Span<const CharT> aRegion) { + // unicode_region_subtag = (alpha{2} | digit{3}) ; + size_t length = aRegion.size(); + const CharT* str = aRegion.data(); + return (length == 2 && std::all_of(str, str + length, IsAsciiAlpha<CharT>)) || + (length == 3 && std::all_of(str, str + length, IsAsciiDigit<CharT>)); +} + +template bool IsStructurallyValidRegionTag(Span<const char> aRegion); +template bool IsStructurallyValidRegionTag(Span<const Latin1Char> aRegion); +template bool IsStructurallyValidRegionTag(Span<const char16_t> aRegion); + +#ifdef DEBUG +bool IsStructurallyValidVariantTag(Span<const char> aVariant) { + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; + size_t length = aVariant.size(); + const char* str = aVariant.data(); + return ((5 <= length && length <= 8) || + (length == 4 && IsAsciiDigit(str[0]))) && + std::all_of(str, str + length, IsAsciiAlphanumeric<char>); +} + +bool IsStructurallyValidUnicodeExtensionTag(Span<const char> aExtension) { + return LocaleParser::CanParseUnicodeExtension(aExtension).isOk(); +} + +static bool IsStructurallyValidExtensionTag(Span<const char> aExtension) { + // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + // NB: Allow any extension, including Unicode and Transform here, because + // this function is only used for an assertion. + + size_t length = aExtension.size(); + const char* str = aExtension.data(); + const char* const end = aExtension.data() + length; + if (length <= 2) { + return false; + } + if (!IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = + reinterpret_cast<const char*>(memchr(str, '-', end - str)); + size_t len = (sep ? sep : end) - str; + if (len < 2 || len > 8 || + !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} + +bool IsStructurallyValidPrivateUseTag(Span<const char> aPrivateUse) { + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + + size_t length = aPrivateUse.size(); + const char* str = aPrivateUse.data(); + const char* const end = aPrivateUse.data() + length; + if (length <= 2) { + return false; + } + if (str[0] != 'x' && str[0] != 'X') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = + reinterpret_cast<const char*>(memchr(str, '-', end - str)); + size_t len = (sep ? sep : end) - str; + if (len == 0 || len > 8 || + !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} +#endif + +ptrdiff_t Locale::UnicodeExtensionIndex() const { + // The extension subtags aren't necessarily sorted, so we can't use binary + // search here. + auto p = std::find_if( + mExtensions.begin(), mExtensions.end(), + [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; }); + if (p != mExtensions.end()) { + return std::distance(mExtensions.begin(), p); + } + return -1; +} + +Maybe<Span<const char>> Locale::GetUnicodeExtension() const { + ptrdiff_t index = UnicodeExtensionIndex(); + if (index >= 0) { + return Some(MakeStringSpan(mExtensions[index].get())); + } + return Nothing(); +} + +ICUResult Locale::SetUnicodeExtension(Span<const char> aExtension) { + MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(aExtension)); + + auto duplicated = DuplicateStringToUniqueChars(aExtension); + + // Replace the existing Unicode extension subtag or append a new one. + ptrdiff_t index = UnicodeExtensionIndex(); + if (index >= 0) { + mExtensions[index] = std::move(duplicated); + return Ok(); + } + if (!mExtensions.append(std::move(duplicated))) { + return Err(ICUError::OutOfMemory); + } + return Ok(); +} + +void Locale::ClearUnicodeExtension() { + ptrdiff_t index = UnicodeExtensionIndex(); + if (index >= 0) { + mExtensions.erase(mExtensions.begin() + index); + } +} + +template <size_t InitialCapacity> +static bool SortAlphabetically(Vector<UniqueChars, InitialCapacity>& aSubtags) { + size_t length = aSubtags.length(); + + // Zero or one element lists are already sorted. + if (length < 2) { + return true; + } + + // Handle two element lists inline. + if (length == 2) { + if (strcmp(aSubtags[0].get(), aSubtags[1].get()) > 0) { + aSubtags[0].swap(aSubtags[1]); + } + return true; + } + + Vector<char*, 8> scratch; + if (!scratch.resizeUninitialized(length)) { + return false; + } + for (size_t i = 0; i < length; i++) { + scratch[i] = aSubtags[i].release(); + } + + std::stable_sort( + scratch.begin(), scratch.end(), + [](const char* a, const char* b) { return strcmp(a, b) < 0; }); + + for (size_t i = 0; i < length; i++) { + aSubtags[i] = UniqueChars(scratch[i]); + } + return true; +} + +Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeBaseName() { + // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to + // canonicalize the syntax by normalizing the case and ordering all subtags. + // The canonical syntax form is specified in UTS 35, 3.2.1. + + // Language codes need to be in lower case. "JA" -> "ja" + mLanguage.ToLowerCase(); + MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); + + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + mScript.ToTitleCase(); + MOZ_ASSERT(Script().Missing() || + IsStructurallyValidScriptTag(Script().Span())); + + // Region codes need to be in upper case. "bu" -> "BU" + mRegion.ToUpperCase(); + MOZ_ASSERT(Region().Missing() || + IsStructurallyValidRegionTag(Region().Span())); + + // The canonical case for variant subtags is lowercase. + for (UniqueChars& variant : mVariants) { + char* variantChars = variant.get(); + size_t variantLength = strlen(variantChars); + AsciiToLowerCase(variantChars, variantLength, variantChars); + + MOZ_ASSERT(IsStructurallyValidVariantTag({variantChars, variantLength})); + } + + // Extensions and privateuse subtags are case normalized in the + // |canonicalizeExtensions| method. + + // The second step in UTS 35, 3.2.1, is to order all subtags. + + if (mVariants.length() > 1) { + // 1. Any variants are in alphabetical order. + if (!SortAlphabetically(mVariants)) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Reject the Locale identifier if a duplicate variant was found, e.g. + // "en-variant-Variant". + const UniqueChars* duplicate = std::adjacent_find( + mVariants.begin(), mVariants.end(), [](const auto& a, const auto& b) { + return strcmp(a.get(), b.get()) == 0; + }); + if (duplicate != mVariants.end()) { + return Err(CanonicalizationError::DuplicateVariant); + } + } + + // 2. Any extensions are in alphabetical order by their singleton. + // 3. All attributes are sorted in alphabetical order. + // 4. All keywords and tfields are sorted by alphabetical order of their keys, + // within their respective extensions. + // 5. Any type or tfield value "true" is removed. + // - A subsequent call to canonicalizeExtensions() will perform these steps. + + // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier + // into its canonical form per UTS 3.2.1. + + // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their + // canonical forms. + // - A subsequent call to canonicalizeExtensions() will perform this step. + + // 2. Replace aliases in the unicode_language_id and tlang (if any). + // - tlang is handled in canonicalizeExtensions(). + + // Replace deprecated language, region, and variant subtags with their + // preferred mappings. + + if (!UpdateLegacyMappings()) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Replace deprecated language subtags with their preferred values. + if (!LanguageMapping(mLanguage) && ComplexLanguageMapping(mLanguage)) { + PerformComplexLanguageMappings(); + } + + // Replace deprecated script subtags with their preferred values. + if (Script().Present()) { + ScriptMapping(mScript); + } + + // Replace deprecated region subtags with their preferred values. + if (Region().Present()) { + if (!RegionMapping(mRegion) && ComplexRegionMapping(mRegion)) { + PerformComplexRegionMappings(); + } + } + + // Replace deprecated variant subtags with their preferred values. + if (!PerformVariantMappings()) { + return Err(CanonicalizationError::OutOfMemory); + } + + // No extension replacements are currently present. + // Private use sequences are left as is. + + // 3. Replace aliases in special key values. + // - A subsequent call to canonicalizeExtensions() will perform this step. + + return Ok(); +} + +#ifdef DEBUG +static bool IsAsciiLowercaseAlphanumericOrDash(Span<const char> aSpan) { + const char* ptr = aSpan.data(); + size_t length = aSpan.size(); + return std::all_of(ptr, ptr + length, [](auto c) { + return IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-'; + }); +} +#endif + +Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeExtensions() { + // The canonical case for all extension subtags is lowercase. + for (UniqueChars& extension : mExtensions) { + char* extensionChars = extension.get(); + size_t extensionLength = strlen(extensionChars); + AsciiToLowerCase(extensionChars, extensionLength, extensionChars); + + MOZ_ASSERT( + IsStructurallyValidExtensionTag({extensionChars, extensionLength})); + } + + // Any extensions are in alphabetical order by their singleton. + // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" + if (!SortAlphabetically(mExtensions)) { + return Err(CanonicalizationError::OutOfMemory); + } + + for (UniqueChars& extension : mExtensions) { + if (extension[0] == 'u') { + MOZ_TRY(CanonicalizeUnicodeExtension(extension)); + } else if (extension[0] == 't') { + MOZ_TRY(CanonicalizeTransformExtension(extension)); + } + + MOZ_ASSERT( + IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get()))); + } + + // The canonical case for privateuse subtags is lowercase. + if (char* privateuse = mPrivateUse.get()) { + size_t privateuseLength = strlen(privateuse); + AsciiToLowerCase(privateuse, privateuseLength, privateuse); + + MOZ_ASSERT( + IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); + } + return Ok(); +} + +template <size_t N> +static inline bool AppendSpan(Vector<char, N>& vector, Span<const char> aSpan) { + return vector.append(aSpan.data(), aSpan.size()); +} + +/** + * CanonicalizeUnicodeExtension( attributes, keywords ) + * + * Canonical syntax per + * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All attributes and keywords are in lowercase. + * - Note: The parser already converted keywords to lowercase. + * - All attributes are sorted in alphabetical order. + * - All keywords are sorted by alphabetical order of their keys. + * - Any type value "true" is removed. + * + * Canonical form: + * - All keys and types use the canonical form (from the name attribute; + * see Section 3.6.4 U Extension Data Files). + */ +Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeUnicodeExtension( + UniqueChars& aUnicodeExtension) { + Span<const char> extension = MakeStringSpan(aUnicodeExtension.get()); + MOZ_ASSERT(extension[0] == 'u'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT(IsStructurallyValidExtensionTag(extension)); + + LocaleParser::AttributesVector attributes; + LocaleParser::KeywordsVector keywords; + + using Attribute = LocaleParser::AttributesVector::ElementType; + using Keyword = LocaleParser::KeywordsVector::ElementType; + + if (LocaleParser::ParseUnicodeExtension(extension, attributes, keywords) + .isErr()) { + MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag"); + return Err(CanonicalizationError::InternalError); + } + + auto attributesLess = [extension](const Attribute& a, const Attribute& b) { + auto astr = extension.Subspan(a.Begin(), a.Length()); + auto bstr = extension.Subspan(b.Begin(), b.Length()); + return astr < bstr; + }; + + // All attributes are sorted in alphabetical order. + if (attributes.length() > 1) { + std::stable_sort(attributes.begin(), attributes.end(), attributesLess); + } + + auto keywordsLess = [extension](const Keyword& a, const Keyword& b) { + auto astr = extension.Subspan(a.Begin(), UnicodeKeyLength); + auto bstr = extension.Subspan(b.Begin(), UnicodeKeyLength); + return astr < bstr; + }; + + // All keywords are sorted by alphabetical order of keys. + if (keywords.length() > 1) { + // Using a stable sort algorithm, guarantees that two keywords using the + // same key are never reordered. That means for example + // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to + // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs + // before "nu-latn". + // This is required so that deduplication below preserves the first keyword + // for a given key and discards the rest. + std::stable_sort(keywords.begin(), keywords.end(), keywordsLess); + } + + Vector<char, 32> sb; + if (!sb.append('u')) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Append all Unicode extension attributes. + for (size_t i = 0; i < attributes.length(); i++) { + const auto& attribute = attributes[i]; + auto span = extension.Subspan(attribute.Begin(), attribute.Length()); + + // Skip duplicate attributes. + if (i > 0) { + const auto& lastAttribute = attributes[i - 1]; + if (span == + extension.Subspan(lastAttribute.Begin(), lastAttribute.Length())) { + continue; + } + MOZ_ASSERT(attributesLess(lastAttribute, attribute)); + } + + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + + static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; + + using StringSpan = Span<const char>; + + static constexpr StringSpan True = MakeStringSpan("true"); + + // Append all Unicode extension keywords. + for (size_t i = 0; i < keywords.length(); i++) { + const auto& keyword = keywords[i]; + + // Skip duplicate keywords. + if (i > 0) { + const auto& lastKeyword = keywords[i - 1]; + if (extension.Subspan(keyword.Begin(), UnicodeKeyLength) == + extension.Subspan(lastKeyword.Begin(), UnicodeKeyLength)) { + continue; + } + MOZ_ASSERT(keywordsLess(lastKeyword, keyword)); + } + + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + + StringSpan span = extension.Subspan(keyword.Begin(), keyword.Length()); + if (span.size() == UnicodeKeyLength) { + // Keyword without type value. + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + StringSpan key = span.To(UnicodeKeyLength); + StringSpan type = span.From(UnicodeKeyWithSepLength); + + // Search if there's a replacement for the current Unicode keyword. + if (const char* replacement = ReplaceUnicodeExtensionType(key, type)) { + StringSpan repl = MakeStringSpan(replacement); + if (repl == True) { + // Elide the type "true" if present in the replacement. + if (!AppendSpan(sb, key)) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + // Otherwise append the Unicode key (including the separator) and the + // replaced type. + if (!AppendSpan(sb, span.To(UnicodeKeyWithSepLength))) { + return Err(CanonicalizationError::OutOfMemory); + } + if (!AppendSpan(sb, repl)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + } else { + if (type == True) { + // Elide the Unicode extension type "true". + if (!AppendSpan(sb, key)) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + // Otherwise append the complete Unicode extension keyword. + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (static_cast<Span<const char>>(sb) != extension) { + // Otherwise replace the previous extension with the canonical extension. + UniqueChars canonical = DuplicateStringToUniqueChars(sb); + if (!canonical) { + return Err(CanonicalizationError::OutOfMemory); + } + aUnicodeExtension = std::move(canonical); + } + + return Ok(); +} + +template <class Buffer> +static bool LocaleToString(const Locale& aTag, Buffer& aBuffer) { + auto appendSubtag = [&aBuffer](const auto& subtag) { + auto span = subtag.Span(); + MOZ_ASSERT(!span.empty()); + return aBuffer.append(span.data(), span.size()); + }; + + auto appendSubtagSpan = [&aBuffer](Span<const char> subtag) { + MOZ_ASSERT(!subtag.empty()); + return aBuffer.append(subtag.data(), subtag.size()); + }; + + auto appendSubtags = [&aBuffer, &appendSubtagSpan](const auto& subtags) { + for (const auto& subtag : subtags) { + if (!aBuffer.append('-') || !appendSubtagSpan(subtag)) { + return false; + } + } + return true; + }; + + // Append the language subtag. + if (!appendSubtag(aTag.Language())) { + return false; + } + + // Append the script subtag if present. + if (aTag.Script().Present()) { + if (!aBuffer.append('-') || !appendSubtag(aTag.Script())) { + return false; + } + } + + // Append the region subtag if present. + if (aTag.Region().Present()) { + if (!aBuffer.append('-') || !appendSubtag(aTag.Region())) { + return false; + } + } + + // Append the variant subtags if present. + if (!appendSubtags(aTag.Variants())) { + return false; + } + + // Append the extensions subtags if present. + if (!appendSubtags(aTag.Extensions())) { + return false; + } + + // Append the private-use subtag if present. + if (auto privateuse = aTag.PrivateUse()) { + if (!aBuffer.append('-') || !appendSubtagSpan(privateuse.value())) { + return false; + } + } + + return true; +} + +/** + * CanonicalizeTransformExtension + * + * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: + * + * - These subtags are all in lowercase (that is the canonical casing for these + * subtags), [...]. + * + * And per + * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All keywords and tfields are sorted by alphabetical order of their keys, + * within their respective extensions. + */ +Result<Ok, Locale::CanonicalizationError> +Locale::CanonicalizeTransformExtension(UniqueChars& aTransformExtension) { + Span<const char> extension = MakeStringSpan(aTransformExtension.get()); + MOZ_ASSERT(extension[0] == 't'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT(IsStructurallyValidExtensionTag(extension)); + + Locale tag; + LocaleParser::TFieldVector fields; + + using TField = LocaleParser::TFieldVector::ElementType; + + if (LocaleParser::ParseTransformExtension(extension, tag, fields).isErr()) { + MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag"); + return Err(CanonicalizationError::InternalError); + } + + auto tfieldLess = [extension](const TField& a, const TField& b) { + auto astr = extension.Subspan(a.Begin(), TransformKeyLength); + auto bstr = extension.Subspan(b.Begin(), TransformKeyLength); + return astr < bstr; + }; + + // All tfields are sorted by alphabetical order of their keys. + if (fields.length() > 1) { + std::stable_sort(fields.begin(), fields.end(), tfieldLess); + } + + Vector<char, 32> sb; + if (!sb.append('t')) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Append the language subtag if present. + // + // Replace aliases in tlang per + // <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>. + if (tag.Language().Present()) { + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + + MOZ_TRY(tag.CanonicalizeBaseName()); + + // The canonical case for Transform extensions is lowercase per + // <https://unicode.org/reports/tr35/#BCP47_T_Extension>. Convert the two + // subtags which don't use lowercase for their canonical syntax. + tag.mScript.ToLowerCase(); + tag.mRegion.ToLowerCase(); + + if (!LocaleToString(tag, sb)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + + static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1; + + using StringSpan = Span<const char>; + + // Append all fields. + // + // UTS 35, 3.2.1 specifies: + // - Any type or tfield value "true" is removed. + // + // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore + // this apparently invalid part of the UTS 35 specification and simply + // append all `tfield` subtags. + for (const auto& field : fields) { + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + + StringSpan span = extension.Subspan(field.Begin(), field.Length()); + StringSpan key = span.To(TransformKeyLength); + StringSpan value = span.From(TransformKeyWithSepLength); + + // Search if there's a replacement for the current transform keyword. + if (const char* replacement = ReplaceTransformExtensionType(key, value)) { + if (!AppendSpan(sb, span.To(TransformKeyWithSepLength))) { + return Err(CanonicalizationError::OutOfMemory); + } + if (!AppendSpan(sb, MakeStringSpan(replacement))) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (static_cast<Span<const char>>(sb) != extension) { + // Otherwise replace the previous extension with the canonical extension. + UniqueChars canonical = DuplicateStringToUniqueChars(sb); + if (!canonical) { + return Err(CanonicalizationError::OutOfMemory); + } + aTransformExtension = std::move(canonical); + } + + return Ok(); +} + +// Zero-terminated ICU Locale ID. +using LocaleId = + Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>; + +enum class LikelySubtags : bool { Add, Remove }; + +// Return true iff the locale is already maximized resp. minimized. +static bool HasLikelySubtags(LikelySubtags aLikelySubtags, const Locale& aTag) { + // The locale is already maximized if the language, script, and region + // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are + // used. + if (aLikelySubtags == LikelySubtags::Add) { + return !aTag.Language().EqualTo("und") && + (aTag.Script().Present() && !aTag.Script().EqualTo("Zzzz")) && + (aTag.Region().Present() && !aTag.Region().EqualTo("ZZ")); + } + + // The locale is already minimized if it only contains a language + // subtag whose value is not the placeholder value "und". + return !aTag.Language().EqualTo("und") && aTag.Script().Missing() && + aTag.Region().Missing(); +} + +// Create an ICU locale ID from the given locale. +static bool CreateLocaleForLikelySubtags(const Locale& aTag, + LocaleId& aLocale) { + MOZ_ASSERT(aLocale.length() == 0); + + auto appendSubtag = [&aLocale](const auto& subtag) { + auto span = subtag.Span(); + MOZ_ASSERT(!span.empty()); + return aLocale.append(span.data(), span.size()); + }; + + // Append the language subtag. + if (!appendSubtag(aTag.Language())) { + return false; + } + + // Append the script subtag if present. + if (aTag.Script().Present()) { + if (!aLocale.append('_') || !appendSubtag(aTag.Script())) { + return false; + } + } + + // Append the region subtag if present. + if (aTag.Region().Present()) { + if (!aLocale.append('_') || !appendSubtag(aTag.Region())) { + return false; + } + } + + // Zero-terminated for use with ICU. + return aLocale.append('\0'); +} + +static ICUError ParserErrorToICUError(LocaleParser::ParserError aErr) { + using ParserError = LocaleParser::ParserError; + + switch (aErr) { + case ParserError::NotParseable: + return ICUError::InternalError; + case ParserError::OutOfMemory: + return ICUError::OutOfMemory; + } + MOZ_CRASH("Unexpected parser error"); +} + +static ICUError CanonicalizationErrorToICUError( + Locale::CanonicalizationError aErr) { + using CanonicalizationError = Locale::CanonicalizationError; + + switch (aErr) { + case CanonicalizationError::DuplicateVariant: + case CanonicalizationError::InternalError: + return ICUError::InternalError; + case CanonicalizationError::OutOfMemory: + return ICUError::OutOfMemory; + } + MOZ_CRASH("Unexpected canonicalization error"); +} + +// Assign the language, script, and region subtags from an ICU locale ID. +// +// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to +// retrieve these subtags, but unfortunately these functions are rather slow, so +// we use our own implementation. +static ICUResult AssignFromLocaleId(LocaleId& aLocaleId, Locale& aTag) { + // Replace the ICU locale ID separator. + std::replace(aLocaleId.begin(), aLocaleId.end(), '_', '-'); + + // ICU replaces "und" with the empty string, which means "und" becomes "" and + // "und-Latn" becomes "-Latn". Handle this case separately. + if (aLocaleId.empty() || aLocaleId[0] == '-') { + static constexpr auto und = MakeStringSpan("und"); + constexpr size_t length = und.size(); + + // Insert "und" in front of the locale ID. + if (!aLocaleId.growBy(length)) { + return Err(ICUError::OutOfMemory); + } + memmove(aLocaleId.begin() + length, aLocaleId.begin(), aLocaleId.length()); + memmove(aLocaleId.begin(), und.data(), length); + } + + // Retrieve the language, script, and region subtags from the locale ID + Locale localeTag; + MOZ_TRY(LocaleParser::TryParseBaseName(aLocaleId, localeTag) + .mapErr(ParserErrorToICUError)); + + aTag.SetLanguage(localeTag.Language()); + aTag.SetScript(localeTag.Script()); + aTag.SetRegion(localeTag.Region()); + + return Ok(); +} + +template <decltype(uloc_addLikelySubtags) likelySubtagsFn> +static ICUResult CallLikelySubtags(const LocaleId& aLocaleId, + LocaleId& aResult) { + // Locale ID must be zero-terminated before passing it to ICU. + MOZ_ASSERT(aLocaleId.back() == '\0'); + MOZ_ASSERT(aResult.length() == 0); + + // Ensure there's enough room for the result. + MOZ_ALWAYS_TRUE(aResult.resize(LocaleId::InlineLength)); + + return FillBufferWithICUCall( + aResult, [&aLocaleId](char* chars, int32_t size, UErrorCode* status) { + return likelySubtagsFn(aLocaleId.begin(), chars, size, status); + }); +} + +// The canonical way to compute the Unicode BCP 47 locale identifier with likely +// subtags is as follows: +// +// 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU +// locale ID. +// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. +// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into +// a Unicode BCP 47 locale identifier. +// +// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow +// and we know, by construction, that the input Unicode BCP 47 locale identifier +// only contains valid language, script, and region subtags, we can avoid both +// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and +// AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of +// |Intl.Locale.prototype.maximize|.) +static ICUResult LikelySubtags(LikelySubtags aLikelySubtags, Locale& aTag) { + // Return early if the input is already maximized/minimized. + if (HasLikelySubtags(aLikelySubtags, aTag)) { + return Ok(); + } + + // Create the locale ID for the input argument. + LocaleId locale; + if (!CreateLocaleForLikelySubtags(aTag, locale)) { + return Err(ICUError::OutOfMemory); + } + + // Either add or remove likely subtags to/from the locale ID. + LocaleId localeLikelySubtags; + if (aLikelySubtags == LikelySubtags::Add) { + MOZ_TRY( + CallLikelySubtags<uloc_addLikelySubtags>(locale, localeLikelySubtags)); + } else { + MOZ_TRY( + CallLikelySubtags<uloc_minimizeSubtags>(locale, localeLikelySubtags)); + } + + // Assign the language, script, and region subtags from the locale ID. + MOZ_TRY(AssignFromLocaleId(localeLikelySubtags, aTag)); + + // Update mappings in case ICU returned a non-canonical locale. + MOZ_TRY(aTag.CanonicalizeBaseName().mapErr(CanonicalizationErrorToICUError)); + + return Ok(); +} + +ICUResult Locale::AddLikelySubtags() { + return LikelySubtags(LikelySubtags::Add, *this); +} + +ICUResult Locale::RemoveLikelySubtags() { + return LikelySubtags(LikelySubtags::Remove, *this); +} + +UniqueChars Locale::DuplicateStringToUniqueChars(const char* aStr) { + size_t length = strlen(aStr) + 1; + auto duplicate = MakeUnique<char[]>(length); + memcpy(duplicate.get(), aStr, length); + return duplicate; +} + +UniqueChars Locale::DuplicateStringToUniqueChars(Span<const char> aStr) { + size_t length = aStr.size(); + auto duplicate = MakeUnique<char[]>(length + 1); + memcpy(duplicate.get(), aStr.data(), length); + duplicate[length] = '\0'; + return duplicate; +} + +size_t Locale::ToStringCapacity() const { + // This is a bit awkward, the buffer class currently does not support + // being resized, so we need to calculate the required size up front and + // reserve it all at once. + auto lengthSubtag = [](const auto& subtag) { + auto span = subtag.Span(); + MOZ_ASSERT(!span.empty()); + return span.size(); + }; + + auto lengthSubtagZ = [](const char* subtag) { + size_t length = strlen(subtag); + MOZ_ASSERT(length > 0); + return length; + }; + + auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) { + size_t length = 0; + for (const auto& subtag : subtags) { + length += lengthSubtagZ(subtag.get()) + 1; + } + return length; + }; + + // First calculate required capacity + size_t capacity = 0; + + capacity += lengthSubtag(mLanguage); + + if (mScript.Present()) { + capacity += lengthSubtag(mScript) + 1; + } + + if (mRegion.Present()) { + capacity += lengthSubtag(mRegion) + 1; + } + + capacity += lengthSubtagsZ(mVariants); + + capacity += lengthSubtagsZ(mExtensions); + + if (mPrivateUse.get()) { + capacity += lengthSubtagZ(mPrivateUse.get()) + 1; + } + + return capacity; +} + +size_t Locale::ToStringAppend(char* aBuffer) const { + // Current write position inside buffer. + size_t offset = 0; + + auto appendHyphen = [&offset, &aBuffer]() { + aBuffer[offset] = '-'; + offset += 1; + }; + + auto appendSubtag = [&offset, &aBuffer](const auto& subtag) { + auto span = subtag.Span(); + memcpy(aBuffer + offset, span.data(), span.size()); + offset += span.size(); + }; + + auto appendSubtagZ = [&offset, &aBuffer](const char* subtag) { + size_t length = strlen(subtag); + memcpy(aBuffer + offset, subtag, length); + offset += length; + }; + + auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) { + for (const auto& subtag : subtags) { + appendHyphen(); + appendSubtagZ(subtag.get()); + } + }; + + // Append the language subtag. + appendSubtag(mLanguage); + + // Append the script subtag if present. + if (mScript.Present()) { + appendHyphen(); + appendSubtag(mScript); + } + + // Append the region subtag if present. + if (mRegion.Present()) { + appendHyphen(); + appendSubtag(mRegion); + } + + // Append the variant subtags if present. + appendSubtagsZ(mVariants); + + // Append the extensions subtags if present. + appendSubtagsZ(mExtensions); + + // Append the private-use subtag if present. + if (mPrivateUse.get()) { + appendHyphen(); + appendSubtagZ(mPrivateUse.get()); + } + + return offset; +} + +LocaleParser::Token LocaleParser::NextToken() { + MOZ_ASSERT(mIndex <= mLength + 1, "called after 'None' token was read"); + + TokenKind kind = TokenKind::None; + size_t tokenLength = 0; + for (size_t i = mIndex; i < mLength; i++) { + // UTS 35, section 3.1. + // alpha = [A-Z a-z] ; + // digit = [0-9] ; + char c = CharAt(i); + if (IsAsciiAlpha(c)) { + kind |= TokenKind::Alpha; + } else if (IsAsciiDigit(c)) { + kind |= TokenKind::Digit; + } else if (c == '-' && i > mIndex && i + 1 < mLength) { + break; + } else { + return {TokenKind::Error, 0, 0}; + } + tokenLength += 1; + } + + Token token{kind, mIndex, tokenLength}; + mIndex += tokenLength + 1; + return token; +} + +UniqueChars LocaleParser::Chars(size_t aIndex, size_t aLength) const { + // Add +1 to null-terminate the string. + auto chars = MakeUnique<char[]>(aLength + 1); + char* dest = chars.get(); + std::copy_n(mLocale + aIndex, aLength, dest); + dest[aLength] = '\0'; + return chars; +} + +// Parse the `unicode_language_id` production. +// +// unicode_language_id = unicode_language_subtag +// (sep unicode_script_subtag)? +// (sep unicode_region_subtag)? +// (sep unicode_variant_subtag)* ; +// +// sep = "-" +// +// Note: Unicode CLDR locale identifier backward compatibility extensions +// removed from `unicode_language_id`. +// +// |tok| is the current token from |ts|. +// +// All subtags will be added unaltered to |tag|, without canonicalizing their +// case or, in the case of variant subtags, detecting and rejecting duplicate +// variants. Users must subsequently |CanonicalizeBaseName| to perform these +// actions. +// +// Do not use this function directly: use |ParseBaseName| or +// |ParseTlangFromTransformExtension| instead. +Result<Ok, LocaleParser::ParserError> LocaleParser::InternalParseBaseName( + LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { + if (aLocaleParser.IsLanguage(aTok)) { + aLocaleParser.CopyChars(aTok, aTag.mLanguage); + + aTok = aLocaleParser.NextToken(); + } else { + // The language subtag is mandatory. + return Err(ParserError::NotParseable); + } + + if (aLocaleParser.IsScript(aTok)) { + aLocaleParser.CopyChars(aTok, aTag.mScript); + + aTok = aLocaleParser.NextToken(); + } + + if (aLocaleParser.IsRegion(aTok)) { + aLocaleParser.CopyChars(aTok, aTag.mRegion); + + aTok = aLocaleParser.NextToken(); + } + + auto& variants = aTag.mVariants; + MOZ_ASSERT(variants.length() == 0); + while (aLocaleParser.IsVariant(aTok)) { + auto variant = aLocaleParser.Chars(aTok); + if (!variants.append(std::move(variant))) { + return Err(ParserError::OutOfMemory); + } + + aTok = aLocaleParser.NextToken(); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> LocaleParser::TryParse( + mozilla::Span<const char> aLocale, Locale& aTag) { + // |aTag| must be a new, empty Locale. + MOZ_ASSERT(aTag.Language().Missing()); + MOZ_ASSERT(aTag.Script().Missing()); + MOZ_ASSERT(aTag.Region().Missing()); + MOZ_ASSERT(aTag.Variants().empty()); + MOZ_ASSERT(aTag.Extensions().empty()); + MOZ_ASSERT(aTag.PrivateUse().isNothing()); + + // unicode_locale_id = unicode_language_id + // extensions* + // pu_extensions? ; + + LocaleParser ts(aLocale); + Token tok = ts.NextToken(); + + MOZ_TRY(ParseBaseName(ts, aTag, tok)); + + // extensions = unicode_locale_extensions + // | transformed_extensions + // | other_extensions ; + + // Bit set of seen singletons. + uint64_t seenSingletons = 0; + + auto& extensions = aTag.mExtensions; + while (ts.IsExtensionStart(tok)) { + char singleton = ts.SingletonKey(tok); + + // Reject the input if a duplicate singleton was found. + uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1); + if (seenSingletons & hash) { + return Err(ParserError::NotParseable); + } + seenSingletons |= hash; + + Token start = tok; + tok = ts.NextToken(); + + // We'll check for missing non-singleton subtags after this block by + // comparing |startValue| with the then-current position. + size_t startValue = tok.Index(); + + if (singleton == 'u') { + while (ts.IsUnicodeExtensionPart(tok)) { + tok = ts.NextToken(); + } + } else if (singleton == 't') { + // transformed_extensions = sep [tT] + // ((sep tlang (sep tfield)*) + // | (sep tfield)+) ; + + // tlang = unicode_language_subtag + // (sep unicode_script_subtag)? + // (sep unicode_region_subtag)? + // (sep unicode_variant_subtag)* ; + if (ts.IsLanguage(tok)) { + tok = ts.NextToken(); + + if (ts.IsScript(tok)) { + tok = ts.NextToken(); + } + + if (ts.IsRegion(tok)) { + tok = ts.NextToken(); + } + + while (ts.IsVariant(tok)) { + tok = ts.NextToken(); + } + } + + // tfield = tkey tvalue; + while (ts.IsTransformExtensionKey(tok)) { + tok = ts.NextToken(); + + size_t startTValue = tok.Index(); + while (ts.IsTransformExtensionPart(tok)) { + tok = ts.NextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.Index() <= startTValue) { + return Err(ParserError::NotParseable); + } + } + } else { + while (ts.IsOtherExtensionPart(tok)) { + tok = ts.NextToken(); + } + } + + // Singletons must be followed by a non-singleton subtag, "en-a-b" is not + // allowed. + if (tok.Index() <= startValue) { + return Err(ParserError::NotParseable); + } + + UniqueChars extension = ts.Extension(start, tok); + if (!extensions.append(std::move(extension))) { + return Err(ParserError::OutOfMemory); + } + } + + // Trailing `pu_extension` component of the `unicode_locale_id` production. + if (ts.IsPrivateUseStart(tok)) { + Token start = tok; + tok = ts.NextToken(); + + size_t startValue = tok.Index(); + while (ts.IsPrivateUsePart(tok)) { + tok = ts.NextToken(); + } + + // There must be at least one subtag after the "-x-". + if (tok.Index() <= startValue) { + return Err(ParserError::NotParseable); + } + + UniqueChars privateUse = ts.Extension(start, tok); + aTag.mPrivateUse = std::move(privateUse); + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> LocaleParser::TryParseBaseName( + Span<const char> aLocale, Locale& aTag) { + // |aTag| must be a new, empty Locale. + MOZ_ASSERT(aTag.Language().Missing()); + MOZ_ASSERT(aTag.Script().Missing()); + MOZ_ASSERT(aTag.Region().Missing()); + MOZ_ASSERT(aTag.Variants().empty()); + MOZ_ASSERT(aTag.Extensions().empty()); + MOZ_ASSERT(aTag.PrivateUse().isNothing()); + + LocaleParser ts(aLocale); + Token tok = ts.NextToken(); + + MOZ_TRY(ParseBaseName(ts, aTag, tok)); + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +// Parse |aExtension|, which must be a valid `transformed_extensions` subtag, +// and fill |aTag| and |aFields| from the `tlang` and `tfield` components. +Result<Ok, LocaleParser::ParserError> LocaleParser::ParseTransformExtension( + Span<const char> aExtension, Locale& aTag, TFieldVector& aFields) { + LocaleParser ts(aExtension); + Token tok = ts.NextToken(); + + if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 't') { + return Err(ParserError::NotParseable); + } + + tok = ts.NextToken(); + + if (tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + if (ts.IsLanguage(tok)) { + // We're parsing a possible `tlang` in a known-valid transform extension, so + // use the special-purpose function that takes advantage of this to compute + // lowercased |tag| contents in an optimal manner. + MOZ_TRY(ParseTlangInTransformExtension(ts, aTag, tok)); + + // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end + // of the transform extension. + MOZ_ASSERT(ts.IsTransformExtensionKey(tok) || tok.IsNone()); + } else { + // If there's no `tlang` subtag, at least one `tfield` must be present. + MOZ_ASSERT(ts.IsTransformExtensionKey(tok)); + } + + // Trailing `tfield` subtags. (Any other trailing subtags are an error, + // because we're guaranteed to only see a valid tranform extension here.) + while (ts.IsTransformExtensionKey(tok)) { + size_t begin = tok.Index(); + tok = ts.NextToken(); + + size_t startTValue = tok.Index(); + while (ts.IsTransformExtensionPart(tok)) { + tok = ts.NextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.Index() <= startTValue) { + return Err(ParserError::NotParseable); + } + + size_t length = tok.Index() - 1 - begin; + if (!aFields.emplaceBack(begin, length)) { + return Err(ParserError::OutOfMemory); + } + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +// Parse |aExtension|, which must be a valid `unicode_locale_extensions` subtag, +// and fill |aAttributes| and |aKeywords| from the `attribute` and `keyword` +// components. +Result<Ok, LocaleParser::ParserError> LocaleParser::ParseUnicodeExtension( + Span<const char> aExtension, AttributesVector& aAttributes, + KeywordsVector& aKeywords) { + LocaleParser ts(aExtension); + Token tok = ts.NextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { + return Err(ParserError::NotParseable); + } + + tok = ts.NextToken(); + + if (tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + while (ts.IsUnicodeExtensionAttribute(tok)) { + if (!aAttributes.emplaceBack(tok.Index(), tok.Length())) { + return Err(ParserError::OutOfMemory); + } + + tok = ts.NextToken(); + } + + // keyword = key (sep type)? ; + while (ts.IsUnicodeExtensionKey(tok)) { + size_t begin = tok.Index(); + tok = ts.NextToken(); + + while (ts.IsUnicodeExtensionType(tok)) { + tok = ts.NextToken(); + } + + if (tok.IsError()) { + return Err(ParserError::NotParseable); + } + + size_t length = tok.Index() - 1 - begin; + if (!aKeywords.emplaceBack(begin, length)) { + return Err(ParserError::OutOfMemory); + } + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> LocaleParser::CanParseUnicodeExtension( + Span<const char> aExtension) { + LocaleParser ts(aExtension); + Token tok = ts.NextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { + return Err(ParserError::NotParseable); + } + + tok = ts.NextToken(); + + if (tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + while (ts.IsUnicodeExtensionAttribute(tok)) { + tok = ts.NextToken(); + } + + // keyword = key (sep type)? ; + while (ts.IsUnicodeExtensionKey(tok)) { + tok = ts.NextToken(); + + while (ts.IsUnicodeExtensionType(tok)) { + tok = ts.NextToken(); + } + + if (tok.IsError()) { + return Err(ParserError::NotParseable); + } + } + + if (!tok.IsNone()) { + return Err(ParserError::OutOfMemory); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> +LocaleParser::CanParseUnicodeExtensionType(Span<const char> aUnicodeType) { + MOZ_ASSERT(!aUnicodeType.empty(), "caller must exclude empty strings"); + + LocaleParser ts(aUnicodeType); + Token tok = ts.NextToken(); + + while (ts.IsUnicodeExtensionType(tok)) { + tok = ts.NextToken(); + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +} // namespace mozilla::intl |