From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- js/src/builtin/intl/make_intl_data.py | 4139 +++++++++++++++++++++++++++++++++ 1 file changed, 4139 insertions(+) create mode 100755 js/src/builtin/intl/make_intl_data.py (limited to 'js/src/builtin/intl/make_intl_data.py') diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py new file mode 100755 index 0000000000..ff631ce219 --- /dev/null +++ b/js/src/builtin/intl/make_intl_data.py @@ -0,0 +1,4139 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" Usage: + make_intl_data.py langtags [cldr_common.zip] + make_intl_data.py tzdata + make_intl_data.py currency + make_intl_data.py units + make_intl_data.py numbering + + + Target "langtags": + This script extracts information about 1) mappings between deprecated and + current Unicode BCP 47 locale identifiers, and 2) deprecated and current + BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping + code in intl/components/LocaleGenerated.cpp. The code is used in + intl/components/Locale.cpp. + + + Target "tzdata": + This script computes which time zone informations are not up-to-date in ICU + and provides the necessary mappings to workaround this problem. + https://ssl.icu-project.org/trac/ticket/12044 + + + Target "currency": + Generates the mapping from currency codes to decimal digits used for them. + + + Target "units": + Generate source and test files using the list of so-called "sanctioned unit + identifiers" and verifies that the ICU data filter includes these units. + + + Target "numbering": + Generate source and test files using the list of numbering systems with + simple digit mappings and verifies that it's in sync with ICU/CLDR. +""" + +import io +import json +import os +import re +import sys +import tarfile +import tempfile +from contextlib import closing +from functools import partial, total_ordering +from itertools import chain, groupby, tee +from operator import attrgetter, itemgetter +from zipfile import ZipFile + +import yaml + +if sys.version_info.major == 2: + from itertools import ifilter as filter + from itertools import ifilterfalse as filterfalse + from itertools import imap as map + from itertools import izip_longest as zip_longest + + from urllib2 import Request as UrlRequest + from urllib2 import urlopen + from urlparse import urlsplit +else: + from itertools import filterfalse, zip_longest + from urllib.parse import urlsplit + from urllib.request import Request as UrlRequest + from urllib.request import urlopen + + +# From https://docs.python.org/3/library/itertools.html +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" + args = [iter(iterable)] * n + return zip_longest(*args, fillvalue=fillvalue) + + +def writeMappingHeader(println, description, source, url): + if type(description) is not list: + description = [description] + for desc in description: + println("// {0}".format(desc)) + println("// Derived from {0}.".format(source)) + println("// {0}".format(url)) + + +def writeMappingsVar(println, mapping, name, description, source, url): + """Writes a variable definition with a mapping table. + + Writes the contents of dictionary |mapping| through the |println| + function with the given variable name and a comment with description, + fileDate, and URL. + """ + println("") + writeMappingHeader(println, description, source, url) + println("var {0} = {{".format(name)) + for (key, value) in sorted(mapping.items(), key=itemgetter(0)): + println(' "{0}": "{1}",'.format(key, value)) + println("};") + + +def writeMappingsBinarySearch( + println, + fn_name, + type_name, + name, + validate_fn, + validate_case_fn, + mappings, + tag_maxlength, + description, + source, + url, +): + """Emit code to perform a binary search on language tag subtags. + + Uses the contents of |mapping|, which can either be a dictionary or set, + to emit a mapping function to find subtag replacements. + """ + println("") + writeMappingHeader(println, description, source, url) + println( + """ +bool mozilla::intl::Locale::{0}({1} {2}) {{ + MOZ_ASSERT({3}({2}.Span())); + MOZ_ASSERT({4}({2}.Span())); +""".format( + fn_name, type_name, name, validate_fn, validate_case_fn + ).strip() + ) + writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength) + + println( + """ +}""".lstrip( + "\n" + ) + ) + + +def writeMappingsBinarySearchBody( + println, source_name, target_name, mappings, tag_maxlength +): + def write_array(subtags, name, length, fixed): + if fixed: + println( + " static const char {}[{}][{}] = {{".format( + name, len(subtags), length + 1 + ) + ) + else: + println(" static const char* {}[{}] = {{".format(name, len(subtags))) + + # Group in pairs of ten to not exceed the 80 line column limit. + for entries in grouper(subtags, 10): + entries = ( + '"{}"'.format(tag).rjust(length + 2) + for tag in entries + if tag is not None + ) + println(" {},".format(", ".join(entries))) + + println(" };") + + trailing_return = True + + # Sort the subtags by length. That enables using an optimized comparator + # for the binary search, which only performs a single |memcmp| for multiple + # of two subtag lengths. + mappings_keys = mappings.keys() if type(mappings) == dict else mappings + for (length, subtags) in groupby(sorted(mappings_keys, key=len), len): + # Omit the length check if the current length is the maximum length. + if length != tag_maxlength: + println( + """ + if ({}.Length() == {}) {{ +""".format( + source_name, length + ).rstrip( + "\n" + ) + ) + else: + trailing_return = False + println( + """ + { +""".rstrip( + "\n" + ) + ) + + # The subtags need to be sorted for binary search to work. + subtags = sorted(subtags) + + def equals(subtag): + return """{}.EqualTo("{}")""".format(source_name, subtag) + + # Don't emit a binary search for short lists. + if len(subtags) == 1: + if type(mappings) == dict: + println( + """ + if ({}) {{ + {}.Set(mozilla::MakeStringSpan("{}")); + return true; + }} + return false; +""".format( + equals(subtags[0]), target_name, mappings[subtags[0]] + ).strip( + "\n" + ) + ) + else: + println( + """ + return {}; +""".format( + equals(subtags[0]) + ).strip( + "\n" + ) + ) + elif len(subtags) <= 4: + if type(mappings) == dict: + for subtag in subtags: + println( + """ + if ({}) {{ + {}.Set("{}"); + return true; + }} +""".format( + equals(subtag), target_name, mappings[subtag] + ).strip( + "\n" + ) + ) + + println( + """ + return false; +""".strip( + "\n" + ) + ) + else: + cond = (equals(subtag) for subtag in subtags) + cond = (" ||\n" + " " * (4 + len("return "))).join(cond) + println( + """ + return {}; +""".format( + cond + ).strip( + "\n" + ) + ) + else: + write_array(subtags, source_name + "s", length, True) + + if type(mappings) == dict: + write_array([mappings[k] for k in subtags], "aliases", length, False) + + println( + """ + if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ + {1}.Set(mozilla::MakeStringSpan(replacement)); + return true; + }} + return false; +""".format( + source_name, target_name + ).rstrip() + ) + else: + println( + """ + return HasReplacement({0}s, {0}); +""".format( + source_name + ).rstrip() + ) + + println( + """ + } +""".strip( + "\n" + ) + ) + + if trailing_return: + println( + """ + return false;""" + ) + + +def writeComplexLanguageTagMappings( + println, complex_language_mappings, description, source, url +): + println("") + writeMappingHeader(println, description, source, url) + println( + """ +void mozilla::intl::Locale::PerformComplexLanguageMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); +""".lstrip() + ) + + # Merge duplicate language entries. + language_aliases = {} + for (deprecated_language, (language, script, region)) in sorted( + complex_language_mappings.items(), key=itemgetter(0) + ): + key = (language, script, region) + if key not in language_aliases: + language_aliases[key] = [] + else: + language_aliases[key].append(deprecated_language) + + first_language = True + for (deprecated_language, (language, script, region)) in sorted( + complex_language_mappings.items(), key=itemgetter(0) + ): + key = (language, script, region) + if deprecated_language in language_aliases[key]: + continue + + if_kind = "if" if first_language else "else if" + first_language = False + + cond = ( + 'Language().EqualTo("{}")'.format(lang) + for lang in [deprecated_language] + language_aliases[key] + ) + cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) + + println( + """ + {} ({}) {{""".format( + if_kind, cond + ).strip( + "\n" + ) + ) + + println( + """ + SetLanguage("{}");""".format( + language + ).strip( + "\n" + ) + ) + + if script is not None: + println( + """ + if (Script().Missing()) {{ + SetScript("{}"); + }}""".format( + script + ).strip( + "\n" + ) + ) + if region is not None: + println( + """ + if (Region().Missing()) {{ + SetRegion("{}"); + }}""".format( + region + ).strip( + "\n" + ) + ) + println( + """ + }""".strip( + "\n" + ) + ) + + println( + """ +} +""".strip( + "\n" + ) + ) + + +def writeComplexRegionTagMappings( + println, complex_region_mappings, description, source, url +): + println("") + writeMappingHeader(println, description, source, url) + println( + """ +void mozilla::intl::Locale::PerformComplexRegionMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); + MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span())); +""".lstrip() + ) + + # |non_default_replacements| is a list and hence not hashable. Convert it + # to a string to get a proper hashable value. + def hash_key(default, non_default_replacements): + return (default, str(sorted(str(v) for v in non_default_replacements))) + + # Merge duplicate region entries. + region_aliases = {} + for (deprecated_region, (default, non_default_replacements)) in sorted( + complex_region_mappings.items(), key=itemgetter(0) + ): + key = hash_key(default, non_default_replacements) + if key not in region_aliases: + region_aliases[key] = [] + else: + region_aliases[key].append(deprecated_region) + + first_region = True + for (deprecated_region, (default, non_default_replacements)) in sorted( + complex_region_mappings.items(), key=itemgetter(0) + ): + key = hash_key(default, non_default_replacements) + if deprecated_region in region_aliases[key]: + continue + + if_kind = "if" if first_region else "else if" + first_region = False + + cond = ( + 'Region().EqualTo("{}")'.format(region) + for region in [deprecated_region] + region_aliases[key] + ) + cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) + + println( + """ + {} ({}) {{""".format( + if_kind, cond + ).strip( + "\n" + ) + ) + + replacement_regions = sorted( + {region for (_, _, region) in non_default_replacements} + ) + + first_case = True + for replacement_region in replacement_regions: + replacement_language_script = sorted( + (language, script) + for (language, script, region) in (non_default_replacements) + if region == replacement_region + ) + + if_kind = "if" if first_case else "else if" + first_case = False + + def compare_tags(language, script): + if script is None: + return 'Language().EqualTo("{}")'.format(language) + return '(Language().EqualTo("{}") && Script().EqualTo("{}"))'.format( + language, script + ) + + cond = ( + compare_tags(language, script) + for (language, script) in replacement_language_script + ) + cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond) + + println( + """ + {} ({}) {{ + SetRegion("{}"); + }}""".format( + if_kind, cond, replacement_region + ) + .rstrip() + .strip("\n") + ) + + println( + """ + else {{ + SetRegion("{}"); + }} + }}""".format( + default + ) + .rstrip() + .strip("\n") + ) + + println( + """ +} +""".strip( + "\n" + ) + ) + + +def writeVariantTagMappings(println, variant_mappings, description, source, url): + """Writes a function definition that maps variant subtags.""" + println( + """ +static const char* ToCharPointer(const char* str) { + return str; +} + +static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) { + return str.get(); +} + +template +static bool IsLessThan(const T& a, const U& b) { + return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; +} +""" + ) + writeMappingHeader(println, description, source, url) + println( + """ +bool mozilla::intl::Locale::PerformVariantMappings() { + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), + IsLessThan)); + + auto removeVariantAt = [&](size_t index) { + mVariants.erase(mVariants.begin() + index); + }; + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound( + mVariants.begin(), mVariants.end(), variant, + IsLessThan); + + // Don't insert the replacement when already present. + if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. + auto preferred = DuplicateStringToUniqueChars(variant); + return !!mVariants.insert(p, std::move(preferred)); + }; + + for (size_t i = 0; i < mVariants.length();) { + const char* variant = mVariants[i].get(); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); +""".lstrip() + ) + + (no_alias, with_alias) = partition( + variant_mappings.items(), lambda item: item[1] is None + ) + + no_replacements = " ||\n ".join( + f"""strcmp(variant, "{deprecated_variant}") == 0""" + for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0)) + ) + + println( + f""" + if ({no_replacements}) {{ + removeVariantAt(i); + }} +""".strip( + "\n" + ) + ) + + for (deprecated_variant, (type, replacement)) in sorted( + with_alias, key=itemgetter(0) + ): + println( + f""" + else if (strcmp(variant, "{deprecated_variant}") == 0) {{ + removeVariantAt(i); +""".strip( + "\n" + ) + ) + + if type == "language": + println( + f""" + SetLanguage("{replacement}"); +""".strip( + "\n" + ) + ) + elif type == "region": + println( + f""" + SetRegion("{replacement}"); +""".strip( + "\n" + ) + ) + else: + assert type == "variant" + println( + f""" + if (!insertVariantSortedIfNotPresent("{replacement}")) {{ + return false; + }} +""".strip( + "\n" + ) + ) + + println( + """ + } +""".strip( + "\n" + ) + ) + + println( + """ + else { + i++; + } + } + return true; +} +""".strip( + "\n" + ) + ) + + +def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url): + """Writes a function definition that maps legacy language tags.""" + println("") + writeMappingHeader(println, description, source, url) + println( + """\ +bool mozilla::intl::Locale::UpdateLegacyMappings() { + // We're mapping legacy tags to non-legacy form here. + // Other tags remain unchanged. + // + // Legacy tags are either sign language tags ("sgn") or have one or multiple + // variant subtags. Therefore we can quickly exclude most tags by checking + // these two subtags. + + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); + + if (!Language().EqualTo("sgn") && mVariants.length() == 0) { + return true; + } + +#ifdef DEBUG + for (const auto& variant : Variants()) { + MOZ_ASSERT(IsStructurallyValidVariantTag(variant)); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant)); + } +#endif + + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), + IsLessThan)); + + auto findVariant = [this](const char* variant) { + auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, + IsLessThan); + + if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { + return p; + } + return static_cast(nullptr); + }; + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, + IsLessThan); + + // Don't insert the replacement when already present. + if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. + auto preferred = DuplicateStringToUniqueChars(variant); + return !!mVariants.insert(p, std::move(preferred)); + }; + + auto removeVariant = [&](auto* p) { + size_t index = std::distance(mVariants.begin(), p); + mVariants.erase(mVariants.begin() + index); + }; + + auto removeVariants = [&](auto* p, auto* q) { + size_t pIndex = std::distance(mVariants.begin(), p); + size_t qIndex = std::distance(mVariants.begin(), q); + MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted"); + + mVariants.erase(mVariants.begin() + qIndex); + mVariants.erase(mVariants.begin() + pIndex); + };""" + ) + + # Helper class for pattern matching. + class AnyClass: + def __eq__(self, obj): + return obj is not None + + Any = AnyClass() + + # Group the mappings by language. + legacy_mappings_by_language = {} + for (type, replacement) in legacy_mappings.items(): + (language, _, _, _) = type + legacy_mappings_by_language.setdefault(language, {})[type] = replacement + + # Handle the empty language case first. + if None in legacy_mappings_by_language: + # Get the mappings and remove them from the dict. + mappings = legacy_mappings_by_language.pop(None) + + # This case only applies for the "hepburn-heploc" -> "alalc97" + # mapping, so just inline it here. + from_tag = (None, None, None, "hepburn-heploc") + to_tag = (None, None, None, "alalc97") + + assert len(mappings) == 1 + assert mappings[from_tag] == to_tag + + println( + """ + if (mVariants.length() >= 2) { + if (auto* hepburn = findVariant("hepburn")) { + if (auto* heploc = findVariant("heploc")) { + removeVariants(hepburn, heploc); + + if (!insertVariantSortedIfNotPresent("alalc97")) { + return false; + } + } + } + } +""" + ) + + # Handle sign languages next. + if "sgn" in legacy_mappings_by_language: + mappings = legacy_mappings_by_language.pop("sgn") + + # Legacy sign language mappings have the form "sgn-XX" where "XX" is + # some region code. + assert all(type == ("sgn", None, Any, None) for type in mappings.keys()) + + # Legacy sign languages are mapped to a single language subtag. + assert all( + replacement == (Any, None, None, None) for replacement in mappings.values() + ) + + println( + """ + if (Language().EqualTo("sgn")) { + if (Region().Present() && SignLanguageMapping(mLanguage, Region())) { + mRegion.Set(mozilla::MakeStringSpan("")); + } + } +""".rstrip().lstrip( + "\n" + ) + ) + + # Finally handle all remaining cases. + + # The remaining mappings have neither script nor region subtags in the source locale. + assert all( + type == (Any, None, None, Any) + for mappings in legacy_mappings_by_language.values() + for type in mappings.keys() + ) + + # And they have neither script nor region nor variant subtags in the target locale. + assert all( + replacement == (Any, None, None, None) + for mappings in legacy_mappings_by_language.values() + for replacement in mappings.values() + ) + + # Compact the mappings table by removing empty fields. + legacy_mappings_by_language = { + lang: { + variants: r_language + for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items() + } + for (lang, mappings) in legacy_mappings_by_language.items() + } + + # Try to combine the remaining cases. + legacy_mappings_compact = {} + + # Python can't hash dicts or lists, so use the string representation as the hash key. + def hash_key(mappings): + return str(sorted(mappings.items(), key=itemgetter(0))) + + for (lang, mappings) in sorted( + legacy_mappings_by_language.items(), key=itemgetter(0) + ): + key = hash_key(mappings) + legacy_mappings_compact.setdefault(key, []).append(lang) + + for langs in legacy_mappings_compact.values(): + language_equal_to = ( + f"""Language().EqualTo("{lang}")""" for lang in sorted(langs) + ) + cond = f""" ||\n{" " * len(" else if (")}""".join(language_equal_to) + + println( + f""" + else if ({cond}) {{ +""".rstrip().lstrip( + "\n" + ) + ) + + mappings = legacy_mappings_by_language[langs[0]] + + # Count the variant subtags to determine the sort order. + def variant_size(m): + (k, _) = m + return len(k.split("-")) + + # Alias rules are applied by largest union size first. + for (size, mappings_by_size) in groupby( + sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size + ): + + # Convert grouper object to dict. + mappings_by_size = dict(mappings_by_size) + + is_first = True + chain_if = size == 1 + + # Alias rules are applied in alphabetical order + for (variants, r_language) in sorted( + mappings_by_size.items(), key=itemgetter(0) + ): + sorted_variants = sorted(variants.split("-")) + len_variants = len(sorted_variants) + + maybe_else = "else " if chain_if and not is_first else "" + is_first = False + + for (i, variant) in enumerate(sorted_variants): + println( + f""" + {" " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{ +""".rstrip().lstrip( + "\n" + ) + ) + + indent = " " * len_variants + + println( + f""" + {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)}); + {indent}SetLanguage("{r_language}"); + {indent}{"return true;" if not chain_if else ""} +""".rstrip().lstrip( + "\n" + ) + ) + + for i in range(len_variants, 0, -1): + println( + f""" + {" " * (i - 1)}}} +""".rstrip().lstrip( + "\n" + ) + ) + + println( + """ + } +""".rstrip().lstrip( + "\n" + ) + ) + + println( + """ + return true; +}""" + ) + + +def writeSignLanguageMappingsFunction( + println, legacy_mappings, description, source, url +): + """Writes a function definition that maps legacy sign language tags.""" + println("") + writeMappingHeader(println, description, source, url) + println( + """\ +bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language, + const RegionSubtag& region) { + MOZ_ASSERT(language.EqualTo("sgn")); + MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span())); +""".rstrip() + ) + + region_mappings = { + rg: lg + for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items() + if lang == "sgn" + } + + source_name = "region" + target_name = "language" + tag_maxlength = 3 + writeMappingsBinarySearchBody( + println, source_name, target_name, region_mappings, tag_maxlength + ) + + println( + """ +}""".lstrip() + ) + + +def readSupplementalData(core_file): + """Reads CLDR Supplemental Data and extracts information for Intl.js. + + Information extracted: + - legacyMappings: mappings from legacy tags to preferred complete language tags + - languageMappings: mappings from language subtags to preferred subtags + - complexLanguageMappings: mappings from language subtags with complex rules + - regionMappings: mappings from region subtags to preferred subtags + - complexRegionMappings: mappings from region subtags with complex rules + - variantMappings: mappings from variant subtags to preferred subtags + - likelySubtags: likely subtags used for generating test data only + Returns these mappings as dictionaries. + """ + import xml.etree.ElementTree as ET + + # From Unicode BCP 47 locale identifier . + re_unicode_language_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P