path: root/js/src/builtin/intl/make_intl_data.py
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit    2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree      b80bf8bf13c3766139fbacc530efd0dd9d54394c /js/src/builtin/intl/make_intl_data.py
parent    Initial commit. (diff)
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'js/src/builtin/intl/make_intl_data.py')
-rwxr-xr-x  js/src/builtin/intl/make_intl_data.py  3731
1 file changed, 3731 insertions(+), 0 deletions(-)
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py
new file mode 100755
index 0000000000..802902336e
--- /dev/null
+++ b/js/src/builtin/intl/make_intl_data.py
@@ -0,0 +1,3731 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+""" Usage:
+ make_intl_data.py langtags [cldr_core.zip]
+ make_intl_data.py tzdata
+ make_intl_data.py currency
+ make_intl_data.py units
+ make_intl_data.py numbering
+
+
+ Target "langtags":
+ This script extracts information about 1) mappings between deprecated and
+ current Unicode BCP 47 locale identifiers, and 2) deprecated and current
+ BCP 47 Unicode extension values from CLDR, and converts it to C++ mapping
+ code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp.
+
+
+ Target "tzdata":
+ This script computes which time zone information is not up-to-date in ICU
+ and provides the necessary mappings to work around this problem.
+ https://ssl.icu-project.org/trac/ticket/12044
+
+
+ Target "currency":
+ Generates the mapping from currency codes to decimal digits used for them.
+
+
+ Target "units":
+ Generates source and test files using the list of so-called "sanctioned unit
+ identifiers" and verifies that the ICU data filter includes these units.
+
+
+ Target "numbering":
+ Generates source and test files using the list of numbering systems with
+ simple digit mappings and verifies that it's in sync with ICU/CLDR.
+"""
+
+from __future__ import print_function
+import os
+import re
+import io
+import json
+import sys
+import tarfile
+import tempfile
+import yaml
+from contextlib import closing
+from functools import partial, total_ordering
+from itertools import chain, groupby, tee
+from operator import attrgetter, itemgetter
+from zipfile import ZipFile
+
+if sys.version_info.major == 2:
+ from itertools import (
+ ifilter as filter,
+ ifilterfalse as filterfalse,
+ imap as map,
+ izip_longest as zip_longest,
+ )
+ from urllib2 import urlopen, Request as UrlRequest
+ from urlparse import urlsplit
+else:
+ from itertools import filterfalse, zip_longest
+ from urllib.request import urlopen, Request as UrlRequest
+ from urllib.parse import urlsplit
+
+
+# From https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+ "Collect data into fixed-length chunks or blocks"
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+ args = [iter(iterable)] * n
+ return zip_longest(*args, fillvalue=fillvalue)
+
+
+def writeMappingHeader(println, description, source, url):
+ if type(description) is not list:
+ description = [description]
+ for desc in description:
+ println("// {0}".format(desc))
+ println("// Derived from {0}.".format(source))
+ println("// {0}".format(url))
+
+
+def writeMappingsVar(println, mapping, name, description, source, url):
+ """Writes a variable definition with a mapping table.
+
+ Writes the contents of dictionary |mapping| through the |println|
+ function with the given variable name and a comment with description,
+    source, and URL.
+ """
+ println("")
+ writeMappingHeader(println, description, source, url)
+ println("var {0} = {{".format(name))
+ for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
+ println(' "{0}": "{1}",'.format(key, value))
+ println("};")
+
+
+def writeMappingsBinarySearch(
+ println,
+ fn_name,
+ type_name,
+ name,
+ validate_fn,
+ validate_case_fn,
+ mappings,
+ tag_maxlength,
+ description,
+ source,
+ url,
+):
+ """Emit code to perform a binary search on language tag subtags.
+
+    Uses the contents of |mappings|, which can either be a dictionary or set,
+ to emit a mapping function to find subtag replacements.
+ """
+ println("")
+ writeMappingHeader(println, description, source, url)
+ println(
+ """
+bool js::intl::LanguageTag::{0}({1} {2}) {{
+ MOZ_ASSERT({3}({2}.span()));
+ MOZ_ASSERT({4}({2}.span()));
+""".format(
+ fn_name, type_name, name, validate_fn, validate_case_fn
+ ).strip()
+ )
+
+ def write_array(subtags, name, length, fixed):
+ if fixed:
+ println(
+ " static const char {}[{}][{}] = {{".format(
+ name, len(subtags), length + 1
+ )
+ )
+ else:
+ println(" static const char* {}[{}] = {{".format(name, len(subtags)))
+
+        # Group in chunks of ten to stay within the 80-column line limit.
+ for entries in grouper(subtags, 10):
+ entries = (
+ '"{}"'.format(tag).rjust(length + 2)
+ for tag in entries
+ if tag is not None
+ )
+ println(" {},".format(", ".join(entries)))
+
+ println(" };")
+
+ trailing_return = True
+
+    # Sort the subtags by length. That enables using an optimized comparator
+    # for the binary search, which only performs a single |memcmp| for subtag
+    # lengths that are a multiple of two.
+ mappings_keys = mappings.keys() if type(mappings) == dict else mappings
+ for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
+ # Omit the length check if the current length is the maximum length.
+ if length != tag_maxlength:
+ println(
+ """
+ if ({}.length() == {}) {{
+""".format(
+ name, length
+ ).rstrip(
+ "\n"
+ )
+ )
+ else:
+ trailing_return = False
+ println(
+ """
+ {
+""".rstrip(
+ "\n"
+ )
+ )
+
+ # The subtags need to be sorted for binary search to work.
+ subtags = sorted(subtags)
+
+ def equals(subtag):
+ return """{}.equalTo("{}")""".format(name, subtag)
+
+ # Don't emit a binary search for short lists.
+ if len(subtags) == 1:
+ if type(mappings) == dict:
+ println(
+ """
+ if ({}) {{
+ {}.set("{}");
+ return true;
+ }}
+ return false;
+""".format(
+ equals(subtags[0]), name, mappings[subtags[0]]
+ ).strip(
+ "\n"
+ )
+ )
+ else:
+ println(
+ """
+ return {};
+""".format(
+ equals(subtags[0])
+ ).strip(
+ "\n"
+ )
+ )
+ elif len(subtags) <= 4:
+ if type(mappings) == dict:
+ for subtag in subtags:
+ println(
+ """
+ if ({}) {{
+ {}.set("{}");
+ return true;
+ }}
+""".format(
+ equals(subtag), name, mappings[subtag]
+ ).strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ return false;
+""".strip(
+ "\n"
+ )
+ )
+ else:
+ cond = (equals(subtag) for subtag in subtags)
+ cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
+ println(
+ """
+ return {};
+""".format(
+ cond
+ ).strip(
+ "\n"
+ )
+ )
+ else:
+ write_array(subtags, name + "s", length, True)
+
+ if type(mappings) == dict:
+ write_array([mappings[k] for k in subtags], "aliases", length, False)
+
+ println(
+ """
+ if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
+ {0}.set(mozilla::MakeStringSpan(replacement));
+ return true;
+ }}
+ return false;
+""".format(
+ name
+ ).rstrip()
+ )
+ else:
+ println(
+ """
+ return HasReplacement({0}s, {0});
+""".format(
+ name
+ ).rstrip()
+ )
+
+ println(
+ """
+ }
+""".strip(
+ "\n"
+ )
+ )
+
+ if trailing_return:
+ println(
+ """
+ return false;"""
+ )
+
+ println(
+ """
+}""".lstrip(
+ "\n"
+ )
+ )
+
+
+def writeComplexLanguageTagMappings(
+ println, complex_language_mappings, description, source, url
+):
+ println("")
+ writeMappingHeader(println, description, source, url)
+ println(
+ """
+void js::intl::LanguageTag::performComplexLanguageMappings() {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
+ MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
+""".lstrip()
+ )
+
+ # Merge duplicate language entries.
+ language_aliases = {}
+ for (deprecated_language, (language, script, region)) in sorted(
+ complex_language_mappings.items(), key=itemgetter(0)
+ ):
+ key = (language, script, region)
+ if key not in language_aliases:
+ language_aliases[key] = []
+ else:
+ language_aliases[key].append(deprecated_language)
+
+ first_language = True
+ for (deprecated_language, (language, script, region)) in sorted(
+ complex_language_mappings.items(), key=itemgetter(0)
+ ):
+ key = (language, script, region)
+ if deprecated_language in language_aliases[key]:
+ continue
+
+ if_kind = "if" if first_language else "else if"
+ first_language = False
+
+ cond = (
+ 'language().equalTo("{}")'.format(lang)
+ for lang in [deprecated_language] + language_aliases[key]
+ )
+ cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
+
+ println(
+ """
+ {} ({}) {{""".format(
+ if_kind, cond
+ ).strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ setLanguage("{}");""".format(
+ language
+ ).strip(
+ "\n"
+ )
+ )
+
+ if script is not None:
+ println(
+ """
+ if (script().missing()) {{
+ setScript("{}");
+ }}""".format(
+ script
+ ).strip(
+ "\n"
+ )
+ )
+ if region is not None:
+ println(
+ """
+ if (region().missing()) {{
+ setRegion("{}");
+ }}""".format(
+ region
+ ).strip(
+ "\n"
+ )
+ )
+ println(
+ """
+ }""".strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+}
+""".strip(
+ "\n"
+ )
+ )
+
+
+def writeComplexRegionTagMappings(
+ println, complex_region_mappings, description, source, url
+):
+ println("")
+ writeMappingHeader(println, description, source, url)
+ println(
+ """
+void js::intl::LanguageTag::performComplexRegionMappings() {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
+ MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
+ MOZ_ASSERT(IsStructurallyValidRegionTag(region().span()));
+ MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span()));
+""".lstrip()
+ )
+
+ # |non_default_replacements| is a list and hence not hashable. Convert it
+ # to a string to get a proper hashable value.
+ def hash_key(default, non_default_replacements):
+ return (default, str(sorted(str(v) for v in non_default_replacements)))
+
+ # Merge duplicate region entries.
+ region_aliases = {}
+ for (deprecated_region, (default, non_default_replacements)) in sorted(
+ complex_region_mappings.items(), key=itemgetter(0)
+ ):
+ key = hash_key(default, non_default_replacements)
+ if key not in region_aliases:
+ region_aliases[key] = []
+ else:
+ region_aliases[key].append(deprecated_region)
+
+ first_region = True
+ for (deprecated_region, (default, non_default_replacements)) in sorted(
+ complex_region_mappings.items(), key=itemgetter(0)
+ ):
+ key = hash_key(default, non_default_replacements)
+ if deprecated_region in region_aliases[key]:
+ continue
+
+ if_kind = "if" if first_region else "else if"
+ first_region = False
+
+ cond = (
+ 'region().equalTo("{}")'.format(region)
+ for region in [deprecated_region] + region_aliases[key]
+ )
+ cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
+
+ println(
+ """
+ {} ({}) {{""".format(
+ if_kind, cond
+ ).strip(
+ "\n"
+ )
+ )
+
+ replacement_regions = sorted(
+ {region for (_, _, region) in non_default_replacements}
+ )
+
+ first_case = True
+ for replacement_region in replacement_regions:
+ replacement_language_script = sorted(
+ (language, script)
+ for (language, script, region) in (non_default_replacements)
+ if region == replacement_region
+ )
+
+ if_kind = "if" if first_case else "else if"
+ first_case = False
+
+ def compare_tags(language, script):
+ if script is None:
+ return 'language().equalTo("{}")'.format(language)
+ return '(language().equalTo("{}") && script().equalTo("{}"))'.format(
+ language, script
+ )
+
+ cond = (
+ compare_tags(language, script)
+ for (language, script) in replacement_language_script
+ )
+ cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)
+
+ println(
+ """
+ {} ({}) {{
+ setRegion("{}");
+ }}""".format(
+ if_kind, cond, replacement_region
+ )
+ .rstrip()
+ .strip("\n")
+ )
+
+ println(
+ """
+ else {{
+ setRegion("{}");
+ }}
+ }}""".format(
+ default
+ )
+ .rstrip()
+ .strip("\n")
+ )
+
+ println(
+ """
+}
+""".strip(
+ "\n"
+ )
+ )
+
+
+def writeVariantTagMappings(println, variant_mappings, description, source, url):
+ """ Writes a function definition that maps variant subtags. """
+ println(
+ """
+static const char* ToCharPointer(const char* str) {
+ return str;
+}
+
+static const char* ToCharPointer(const js::UniqueChars& str) {
+ return str.get();
+}
+
+template <typename T, typename U = T>
+static bool IsLessThan(const T& a, const U& b) {
+ return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
+}
+"""
+ )
+ writeMappingHeader(println, description, source, url)
+ println(
+ """
+bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) {
+ // The variant subtags need to be sorted for binary search.
+ MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
+ IsLessThan<decltype(variants_)::ElementType>));
+
+ auto insertVariantSortedIfNotPresent = [&](const char* variant) {
+ auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
+ IsLessThan<decltype(variants_)::ElementType,
+ decltype(variant)>);
+
+ // Don't insert the replacement when already present.
+ if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
+ return true;
+ }
+
+ // Insert the preferred variant in sort order.
+ auto preferred = DuplicateString(cx, variant);
+ if (!preferred) {
+ return false;
+ }
+ return !!variants_.insert(p, std::move(preferred));
+ };
+
+ for (size_t i = 0; i < variants_.length(); ) {
+ auto& variant = variants_[i];
+ MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get())));
+""".lstrip()
+ )
+
+ first_variant = True
+
+ for (deprecated_variant, (type, replacement)) in sorted(
+ variant_mappings.items(), key=itemgetter(0)
+ ):
+ if_kind = "if" if first_variant else "else if"
+ first_variant = False
+
+ println(
+ """
+ {} (strcmp(variant.get(), "{}") == 0) {{
+ variants_.erase(variants_.begin() + i);
+""".format(
+ if_kind, deprecated_variant
+ ).strip(
+ "\n"
+ )
+ )
+
+ if type == "language":
+ println(
+ """
+ setLanguage("{}");
+""".format(
+ replacement
+ ).strip(
+ "\n"
+ )
+ )
+ elif type == "region":
+ println(
+ """
+ setRegion("{}");
+""".format(
+ replacement
+ ).strip(
+ "\n"
+ )
+ )
+ else:
+ assert type == "variant"
+ println(
+ """
+ if (!insertVariantSortedIfNotPresent("{}")) {{
+ return false;
+ }}
+""".format(
+ replacement
+ ).strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ }
+""".strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ else {
+ i++;
+ }
+ }
+ return true;
+}
+""".strip(
+ "\n"
+ )
+ )
+
+
+def writeGrandfatheredMappingsFunction(
+ println, grandfathered_mappings, description, source, url
+):
+ """ Writes a function definition that maps grandfathered language tags. """
+ println("")
+ writeMappingHeader(println, description, source, url)
+ println(
+ """\
+bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
+ // We're mapping regular grandfathered tags to non-grandfathered form here.
+ // Other tags remain unchanged.
+ //
+ // regular = "art-lojban"
+ // / "cel-gaulish"
+ // / "no-bok"
+ // / "no-nyn"
+ // / "zh-guoyu"
+ // / "zh-hakka"
+ // / "zh-min"
+ // / "zh-min-nan"
+ // / "zh-xiang"
+ //
+ // Therefore we can quickly exclude most tags by checking every
+ // |unicode_locale_id| subcomponent for characteristics not shared by any of
+ // the regular grandfathered (RG) tags:
+ //
+ // * Real-world |unicode_language_subtag|s are all two or three letters,
+ // so don't waste time running a useless |language.length > 3| fast-path.
+ // * No RG tag has a "script"-looking component.
+ // * No RG tag has a "region"-looking component.
+ // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
+ // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
+ // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
+ // that |unicode_locale_id| doesn't support.)
+ // * No RG tag contains |extensions| or |pu_extensions|.
+ if (script().present() ||
+ region().present() ||
+ variants().length() != 1 ||
+ extensions().length() != 0 ||
+ privateuse()) {
+ return true;
+ }
+
+ MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
+ MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variants()[0].get())));
+
+ auto variantEqualTo = [this](const char* variant) {
+ return strcmp(variants()[0].get(), variant) == 0;
+ };"""
+ )
+
+ # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
+ #
+ # Doesn't allow any 'extensions' subtags.
+ re_unicode_locale_id = re.compile(
+ r"""
+ ^
+ # unicode_language_id = unicode_language_subtag
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8}
+ (?P<language>[a-z]{2,3}|[a-z]{5,8})
+
+ # (sep unicode_script_subtag)?
+ # unicode_script_subtag = alpha{4}
+ (?:-(?P<script>[a-z]{4}))?
+
+ # (sep unicode_region_subtag)?
+ # unicode_region_subtag = (alpha{2} | digit{3})
+ (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
+
+ # (sep unicode_variant_subtag)*
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
+
+ # pu_extensions?
+ # pu_extensions = sep [xX] (sep alphanum{1,8})+
+ (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))?
+ $
+ """,
+ re.IGNORECASE | re.VERBOSE,
+ )
+
+ is_first = True
+
+ for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)):
+ tag_match = re_unicode_locale_id.match(tag)
+ assert tag_match is not None
+
+ tag_language = tag_match.group("language")
+ assert (
+ tag_match.group("script") is None
+ ), "{} does not contain a script subtag".format(tag)
+ assert (
+ tag_match.group("region") is None
+ ), "{} does not contain a region subtag".format(tag)
+ tag_variants = tag_match.group("variants")
+ assert tag_variants is not None, "{} contains a variant subtag".format(tag)
+ assert (
+ tag_match.group("privateuse") is None
+ ), "{} does not contain a privateuse subtag".format(tag)
+
+ tag_variant = tag_variants[1:]
+ assert "-" not in tag_variant, "{} contains only a single variant".format(tag)
+
+ modern_match = re_unicode_locale_id.match(modern)
+ assert modern_match is not None
+
+ modern_language = modern_match.group("language")
+ modern_script = modern_match.group("script")
+ modern_region = modern_match.group("region")
+ modern_variants = modern_match.group("variants")
+ modern_privateuse = modern_match.group("privateuse")
+
+ println(
+ """
+ // {} -> {}
+""".format(
+ tag, modern
+ ).rstrip()
+ )
+
+ println(
+ """
+ {}if (language().equalTo("{}") && variantEqualTo("{}")) {{
+ """.format(
+ "" if is_first else "else ", tag_language, tag_variant
+ )
+ .rstrip()
+ .strip("\n")
+ )
+
+ is_first = False
+
+ println(
+ """
+ setLanguage("{}");
+ """.format(
+ modern_language
+ )
+ .rstrip()
+ .strip("\n")
+ )
+
+ if modern_script is not None:
+ println(
+ """
+ setScript("{}");
+ """.format(
+ modern_script
+ )
+ .rstrip()
+ .strip("\n")
+ )
+
+ if modern_region is not None:
+ println(
+ """
+ setRegion("{}");
+ """.format(
+ modern_region
+ )
+ .rstrip()
+ .strip("\n")
+ )
+
+ assert (
+ modern_variants is None
+ ), "all regular grandfathered tags' modern forms do not contain variant subtags"
+
+ println(
+ """
+ clearVariants();
+ """.rstrip().strip(
+ "\n"
+ )
+ )
+
+ if modern_privateuse is not None:
+ println(
+ """
+ auto privateuse = DuplicateString(cx, "{}");
+ if (!privateuse) {{
+ return false;
+ }}
+ setPrivateuse(std::move(privateuse));
+ """.format(
+ modern_privateuse
+ )
+ .rstrip()
+ .rstrip("\n")
+ )
+
+ println(
+ """
+ return true;
+ }""".rstrip().strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ return true;
+}"""
+ )
+
+
+def readSupplementalData(core_file):
+ """Reads CLDR Supplemental Data and extracts information for Intl.js.
+
+ Information extracted:
+ - grandfatheredMappings: mappings from grandfathered tags to preferred
+ complete language tags
+ - languageMappings: mappings from language subtags to preferred subtags
+ - complexLanguageMappings: mappings from language subtags with complex rules
+ - regionMappings: mappings from region subtags to preferred subtags
+ - complexRegionMappings: mappings from region subtags with complex rules
+ - variantMappings: mappings from variant subtags to preferred subtags
+ - likelySubtags: likely subtags used for generating test data only
+ Returns these mappings as dictionaries.
+ """
+ import xml.etree.ElementTree as ET
+
+ # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
+ re_unicode_language_id = re.compile(
+ r"""
+ ^
+ # unicode_language_id = unicode_language_subtag
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8}
+ (?P<language>[a-z]{2,3}|[a-z]{5,8})
+
+ # (sep unicode_script_subtag)?
+ # unicode_script_subtag = alpha{4}
+ (?:-(?P<script>[a-z]{4}))?
+
+ # (sep unicode_region_subtag)?
+ # unicode_region_subtag = (alpha{2} | digit{3})
+ (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
+
+ # (sep unicode_variant_subtag)*
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
+ $
+ """,
+ re.IGNORECASE | re.VERBOSE,
+ )
+
+ re_unicode_language_subtag = re.compile(
+ r"""
+ ^
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8}
+ ([a-z]{2,3}|[a-z]{5,8})
+ $
+ """,
+ re.IGNORECASE | re.VERBOSE,
+ )
+
+ re_unicode_region_subtag = re.compile(
+ r"""
+ ^
+ # unicode_region_subtag = (alpha{2} | digit{3})
+ ([a-z]{2}|[0-9]{3})
+ $
+ """,
+ re.IGNORECASE | re.VERBOSE,
+ )
+
+ re_unicode_variant_subtag = re.compile(
+ r"""
+ ^
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3}))
+ $
+ """,
+ re.IGNORECASE | re.VERBOSE,
+ )
+
+ # The fixed list of BCP 47 grandfathered language tags.
+ grandfathered_tags = (
+ "art-lojban",
+ "cel-gaulish",
+ "en-GB-oed",
+ "i-ami",
+ "i-bnn",
+ "i-default",
+ "i-enochian",
+ "i-hak",
+ "i-klingon",
+ "i-lux",
+ "i-mingo",
+ "i-navajo",
+ "i-pwn",
+ "i-tao",
+ "i-tay",
+ "i-tsu",
+ "no-bok",
+ "no-nyn",
+ "sgn-BE-FR",
+ "sgn-BE-NL",
+ "sgn-CH-DE",
+ "zh-guoyu",
+ "zh-hakka",
+ "zh-min",
+ "zh-min-nan",
+ "zh-xiang",
+ )
+
+ # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers.
+ unicode_bcp47_grandfathered_tags = {
+ tag for tag in grandfathered_tags if re_unicode_language_id.match(tag)
+ }
+
+ # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
+ language_mappings = {}
+
+ # Dictionary of complex language subtag mappings, modifying more than one
+ # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
+ complex_language_mappings = {}
+
+ # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
+ region_mappings = {}
+
+ # Dictionary of complex region subtag mappings, containing more than one
+ # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
+ complex_region_mappings = {}
+
+ # Dictionary of aliased variant subtags to a tuple of preferred replacement
+ # type and replacement, e.g. "arevela" -> ("language", "hy") or
+ # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
+ variant_mappings = {}
+
+ # Dictionary of grandfathered mappings to preferred values.
+ grandfathered_mappings = {}
+
+ # CLDR uses "_" as the separator for some elements. Replace it with "-".
+ def bcp47_id(cldr_id):
+ return cldr_id.replace("_", "-")
+
+ # CLDR uses the canonical case for most entries, but there are some
+ # exceptions, like:
+ # <languageAlias type="drw" replacement="fa_af" reason="deprecated"/>
+ # Therefore canonicalize all tags to be on the safe side.
+ def bcp47_canonical(language, script, region):
+ # Canonical case for language subtags is lower case.
+ # Canonical case for script subtags is title case.
+ # Canonical case for region subtags is upper case.
+ return (
+ language.lower() if language else None,
+ script.title() if script else None,
+ region.upper() if region else None,
+ )
+
+ tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
+
+ for language_alias in tree.iterfind(".//languageAlias"):
+ type = bcp47_id(language_alias.get("type"))
+ replacement = bcp47_id(language_alias.get("replacement"))
+
+ # Handle grandfathered mappings first.
+ if type in unicode_bcp47_grandfathered_tags:
+ grandfathered_mappings[type] = replacement
+ continue
+
+ # We're only interested in language subtag matches, so ignore any
+ # entries which have additional subtags.
+ if re_unicode_language_subtag.match(type) is None:
+ continue
+
+ assert type.islower()
+
+ if re_unicode_language_subtag.match(replacement) is not None:
+ # Canonical case for language subtags is lower-case.
+ language_mappings[type] = replacement.lower()
+ else:
+ replacement_match = re_unicode_language_id.match(replacement)
+ assert (
+ replacement_match is not None
+ ), "{} invalid Unicode BCP 47 locale identifier".format(replacement)
+ assert (
+ replacement_match.group("variants") is None
+ ), "{}: unexpected variant subtags in {}".format(type, replacement)
+
+ complex_language_mappings[type] = bcp47_canonical(
+ replacement_match.group("language"),
+ replacement_match.group("script"),
+ replacement_match.group("region"),
+ )
+
+ for territory_alias in tree.iterfind(".//territoryAlias"):
+ type = territory_alias.get("type")
+ replacement = territory_alias.get("replacement")
+
+ # We're only interested in region subtag matches, so ignore any entries
+ # which contain legacy formats, e.g. three letter region codes.
+ if re_unicode_region_subtag.match(type) is None:
+ continue
+
+ assert type.isupper() or type.isdigit()
+
+ if re_unicode_region_subtag.match(replacement) is not None:
+ # Canonical case for region subtags is upper-case.
+ region_mappings[type] = replacement.upper()
+ else:
+ # Canonical case for region subtags is upper-case.
+ replacements = [r.upper() for r in replacement.split(" ")]
+ assert all(
+ re_unicode_region_subtag.match(loc) is not None for loc in replacements
+ ), "{} invalid region subtags".format(replacement)
+ complex_region_mappings[type] = replacements
+
+ for variant_alias in tree.iterfind(".//variantAlias"):
+ type = variant_alias.get("type")
+ replacement = variant_alias.get("replacement")
+
+ assert (
+ re_unicode_variant_subtag.match(type) is not None
+ ), "{} invalid variant subtag".format(type)
+
+ # Normalize the case, because some variants are in upper case.
+ type = type.lower()
+
+ # The replacement can be a language, a region, or a variant subtag.
+ # Language and region subtags are case normalized, variant subtags can
+ # be in any case.
+
+ if (
+ re_unicode_language_subtag.match(replacement) is not None
+ and replacement.islower()
+ ):
+ variant_mappings[type] = ("language", replacement)
+
+ elif re_unicode_region_subtag.match(replacement) is not None:
+ assert (
+ replacement.isupper() or replacement.isdigit()
+ ), "{} invalid variant subtag replacement".format(replacement)
+ variant_mappings[type] = ("region", replacement)
+
+ else:
+ assert (
+ re_unicode_variant_subtag.match(replacement) is not None
+ ), "{} invalid variant subtag replacement".format(replacement)
+ variant_mappings[type] = ("variant", replacement.lower())
+
+ tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
+
+ likely_subtags = {}
+
+ for likely_subtag in tree.iterfind(".//likelySubtag"):
+ from_tag = bcp47_id(likely_subtag.get("from"))
+ from_match = re_unicode_language_id.match(from_tag)
+ assert (
+ from_match is not None
+ ), "{} invalid Unicode BCP 47 locale identifier".format(from_tag)
+ assert (
+ from_match.group("variants") is None
+ ), "unexpected variant subtags in {}".format(from_tag)
+
+ to_tag = bcp47_id(likely_subtag.get("to"))
+ to_match = re_unicode_language_id.match(to_tag)
+ assert (
+ to_match is not None
+ ), "{} invalid Unicode BCP 47 locale identifier".format(to_tag)
+ assert (
+ to_match.group("variants") is None
+ ), "unexpected variant subtags in {}".format(to_tag)
+
+ from_canonical = bcp47_canonical(
+ from_match.group("language"),
+ from_match.group("script"),
+ from_match.group("region"),
+ )
+
+ to_canonical = bcp47_canonical(
+ to_match.group("language"),
+ to_match.group("script"),
+ to_match.group("region"),
+ )
+
+ likely_subtags[from_canonical] = to_canonical
+
+ complex_region_mappings_final = {}
+
+ for (deprecated_region, replacements) in complex_region_mappings.items():
+ # Find all likely subtag entries which don't already contain a region
+ # subtag and whose target region is in the list of replacement regions.
+ region_likely_subtags = [
+ (from_language, from_script, to_region)
+ for (
+ (from_language, from_script, from_region),
+ (_, _, to_region),
+ ) in likely_subtags.items()
+ if from_region is None and to_region in replacements
+ ]
+
+ # The first replacement entry is the default region.
+ default = replacements[0]
+
+ # Find all likely subtag entries whose region matches the default region.
+ default_replacements = {
+ (language, script)
+ for (language, script, region) in region_likely_subtags
+ if region == default
+ }
+
+ # And finally find those entries which don't use the default region.
+ # These are the entries we're actually interested in, because those need
+ # to be handled specially when selecting the correct preferred region.
+ non_default_replacements = [
+ (language, script, region)
+ for (language, script, region) in region_likely_subtags
+ if (language, script) not in default_replacements
+ ]
+
+ # If there are no non-default replacements, we can handle the region as
+ # part of the simple region mapping.
+ if non_default_replacements:
+ complex_region_mappings_final[deprecated_region] = (
+ default,
+ non_default_replacements,
+ )
+ else:
+ region_mappings[deprecated_region] = default
+
+ return {
+ "grandfatheredMappings": grandfathered_mappings,
+ "languageMappings": language_mappings,
+ "complexLanguageMappings": complex_language_mappings,
+ "regionMappings": region_mappings,
+ "complexRegionMappings": complex_region_mappings_final,
+ "variantMappings": variant_mappings,
+ "likelySubtags": likely_subtags,
+ }
+
+
+def readUnicodeExtensions(core_file):
+ import xml.etree.ElementTree as ET
+
+ # Match all xml-files in the BCP 47 directory.
+ bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
+
+ # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
+ #
+ # type = alphanum{3,8} (sep alphanum{3,8})* ;
+ typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
+
+ # Mapping from Unicode extension types to dict of deprecated to
+ # preferred values.
+ mapping = {
+ # Unicode BCP 47 U Extension
+ "u": {},
+ # Unicode BCP 47 T Extension
+ "t": {},
+ }
+
+ def readBCP47File(file):
+ tree = ET.parse(file)
+ for keyword in tree.iterfind(".//keyword/key"):
+ extension = keyword.get("extension", "u")
+ assert (
+ extension == "u" or extension == "t"
+ ), "unknown extension type: {}".format(extension)
+
+ extension_name = keyword.get("name")
+
+ for type in keyword.iterfind("type"):
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The key or type name used by Unicode locale extension with 'u' extension
+ # syntax or the 't' extensions syntax. When alias below is absent, this name
+ # can be also used with the old style "@key=type" syntax.
+ name = type.get("name")
+
+ # Ignore the special name:
+ # - <https://unicode.org/reports/tr35/#CODEPOINTS>
+ # - <https://unicode.org/reports/tr35/#REORDER_CODE>
+ # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
+ # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
+ # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
+ if name in (
+ "CODEPOINTS",
+ "REORDER_CODE",
+ "RG_KEY_VALUE",
+ "SUBDIVISION_CODE",
+ "PRIVATE_USE",
+ ):
+ continue
+
+ # All other names should match the 'type' production.
+ assert (
+ typeRE.match(name) is not None
+ ), "{} matches the 'type' production".format(name)
+
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The preferred value of the deprecated key, type or attribute element.
+ # When a key, type or attribute element is deprecated, this attribute is
+ # used for specifying a new canonical form if available.
+ preferred = type.get("preferred")
+
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The BCP 47 form is the canonical form, and recommended. Other aliases are
+ # included only for backwards compatibility.
+ alias = type.get("alias")
+
+ # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
+ #
+ # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
+ # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
+ # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
+ # value, while the canonical is in the name attribute value.
+
+ # 'preferred' contains the new preferred name, 'alias' the compatibility
+ # name, but then there's this entry where 'preferred' and 'alias' are the
+ # same. So which one to choose? Assume 'preferred' is the actual canonical
+ # name.
+ #
+ # <type name="islamicc"
+ # description="Civil (algorithmic) Arabic calendar"
+ # deprecated="true"
+ # preferred="islamic-civil"
+ # alias="islamic-civil"/>
+
+ if preferred is not None:
+ assert typeRE.match(preferred), preferred
+ mapping[extension].setdefault(extension_name, {})[name] = preferred
+
+ if alias is not None:
+ for alias_name in alias.lower().split(" "):
+ # Ignore alias entries which don't match the 'type' production.
+ if typeRE.match(alias_name) is None:
+ continue
+
+ # See comment above when 'alias' and 'preferred' are both present.
+ if (
+ preferred is not None
+ and name in mapping[extension][extension_name]
+ ):
+ continue
+
+ # Skip over entries where 'name' and 'alias' are equal.
+ #
+ # <type name="pst8pdt"
+ # description="POSIX style time zone for US Pacific Time"
+ # alias="PST8PDT"
+ # since="1.8"/>
+ if name == alias_name:
+ continue
+
+ mapping[extension].setdefault(extension_name, {})[
+ alias_name
+ ] = name
+
+ def readSupplementalMetadata(file):
+ # Find subdivision and region replacements.
+ #
+ # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
+ #
+ # Replace aliases in special key values:
+ # - If there is an 'sd' or 'rg' key, replace any subdivision alias
+ # in its value in the same way, using subdivisionAlias data.
+ tree = ET.parse(file)
+ for alias in tree.iterfind(".//subdivisionAlias"):
+ type = alias.get("type")
+ assert (
+ typeRE.match(type) is not None
+ ), "{} matches the 'type' production".format(type)
+
+ # Take the first replacement when multiple ones are present.
+ replacement = alias.get("replacement").split(" ")[0].lower()
+
+ # Skip over invalid replacements.
+ #
+ # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
+ #
+ # It's not entirely clear to me if CLDR actually wants to use
+ # "axzzzz" as the replacement for this case.
+ if typeRE.match(replacement) is None:
+ continue
+
+ # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
+ mapping["u"].setdefault("rg", {})[type] = replacement
+ mapping["u"].setdefault("sd", {})[type] = replacement
+
+ for name in core_file.namelist():
+ if bcpFileRE.match(name):
+ readBCP47File(core_file.open(name))
+
+ readSupplementalMetadata(
+ core_file.open("common/supplemental/supplementalMetadata.xml")
+ )
+
+ return {
+ "unicodeMappings": mapping["u"],
+ "transformMappings": mapping["t"],
+ }
+
+
+def writeCLDRLanguageTagData(println, data, url):
+ """ Writes the language tag data to the Intl data file. """
+
+ println(generatedFileWarning)
+ println("// Version: CLDR-{}".format(data["version"]))
+ println("// URL: {}".format(url))
+
+ println(
+ """
+#include "mozilla/Assertions.h"
+#include "mozilla/Span.h"
+#include "mozilla/TextUtils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <type_traits>
+
+#include "builtin/intl/LanguageTag.h"
+#include "util/Text.h"
+#include "vm/JSContext.h"
+
+using namespace js::intl::LanguageTagLimits;
+
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline bool HasReplacement(
+ const char (&subtags)[Length][TagLength],
+ const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+ MOZ_ASSERT(subtag.length() == TagLength - 1,
+ "subtag must have the same length as the list of subtags");
+
+ const char* ptr = subtag.span().data();
+ return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
+ [](const char* a, const char* b) {
+ return memcmp(a, b, TagLength - 1) < 0;
+ });
+}
+
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline const char* SearchReplacement(
+ const char (&subtags)[Length][TagLength],
+ const char* (&aliases)[Length],
+ const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+ MOZ_ASSERT(subtag.length() == TagLength - 1,
+ "subtag must have the same length as the list of subtags");
+
+ const char* ptr = subtag.span().data();
+ auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
+ [](const char* a, const char* b) {
+ return memcmp(a, b, TagLength - 1) < 0;
+ });
+ if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
+ return aliases[std::distance(std::begin(subtags), p)];
+ }
+ return nullptr;
+}
+
+#ifdef DEBUG
+static bool IsAsciiLowercaseAlphanumeric(char c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
+}
+
+static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
+ return IsAsciiLowercaseAlphanumeric(c) || c == '-';
+}
+
+static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>);
+}
+
+static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) ||
+ std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
+}
+
+static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
+}
+
+static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
+}
+#endif
+""".rstrip()
+ )
+
+ source = "CLDR Supplemental Data, version {}".format(data["version"])
+ grandfathered_mappings = data["grandfatheredMappings"]
+ language_mappings = data["languageMappings"]
+ complex_language_mappings = data["complexLanguageMappings"]
+ region_mappings = data["regionMappings"]
+ complex_region_mappings = data["complexRegionMappings"]
+ variant_mappings = data["variantMappings"]
+ unicode_mappings = data["unicodeMappings"]
+ transform_mappings = data["transformMappings"]
+
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+ language_maxlength = 8
+
+ # unicode_region_subtag = (alpha{2} | digit{3}) ;
+ region_maxlength = 3
+
+ writeMappingsBinarySearch(
+ println,
+ "languageMapping",
+ "LanguageSubtag&",
+ "language",
+ "IsStructurallyValidLanguageTag",
+ "IsCanonicallyCasedLanguageTag",
+ language_mappings,
+ language_maxlength,
+ "Mappings from language subtags to preferred values.",
+ source,
+ url,
+ )
+ writeMappingsBinarySearch(
+ println,
+ "complexLanguageMapping",
+ "const LanguageSubtag&",
+ "language",
+ "IsStructurallyValidLanguageTag",
+ "IsCanonicallyCasedLanguageTag",
+ complex_language_mappings.keys(),
+ language_maxlength,
+ "Language subtags with complex mappings.",
+ source,
+ url,
+ )
+ writeMappingsBinarySearch(
+ println,
+ "regionMapping",
+ "RegionSubtag&",
+ "region",
+ "IsStructurallyValidRegionTag",
+ "IsCanonicallyCasedRegionTag",
+ region_mappings,
+ region_maxlength,
+ "Mappings from region subtags to preferred values.",
+ source,
+ url,
+ )
+ writeMappingsBinarySearch(
+ println,
+ "complexRegionMapping",
+ "const RegionSubtag&",
+ "region",
+ "IsStructurallyValidRegionTag",
+ "IsCanonicallyCasedRegionTag",
+ complex_region_mappings.keys(),
+ region_maxlength,
+ "Region subtags with complex mappings.",
+ source,
+ url,
+ )
+
+ writeComplexLanguageTagMappings(
+ println,
+ complex_language_mappings,
+ "Language subtags with complex mappings.",
+ source,
+ url,
+ )
+ writeComplexRegionTagMappings(
+ println,
+ complex_region_mappings,
+ "Region subtags with complex mappings.",
+ source,
+ url,
+ )
+
+ writeVariantTagMappings(
+ println,
+ variant_mappings,
+ "Mappings from variant subtags to preferred values.",
+ source,
+ url,
+ )
+
+ writeGrandfatheredMappingsFunction(
+ println,
+ grandfathered_mappings,
+ "Canonicalize grandfathered locale identifiers.",
+ source,
+ url,
+ )
+
+ writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
+ writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
+
+
+def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
+ """ Writes the likely-subtags test file. """
+
+ println(generatedFileWarning)
+
+ source = "CLDR Supplemental Data, version {}".format(data["version"])
+ language_mappings = data["languageMappings"]
+ complex_language_mappings = data["complexLanguageMappings"]
+ region_mappings = data["regionMappings"]
+ complex_region_mappings = data["complexRegionMappings"]
+ likely_subtags = data["likelySubtags"]
+
+ def bcp47(tag):
+ (language, script, region) = tag
+ return "{}{}{}".format(
+ language, "-" + script if script else "", "-" + region if region else ""
+ )
+
+ def canonical(tag):
+ (language, script, region) = tag
+
+ # Map deprecated language subtags.
+ if language in language_mappings:
+ language = language_mappings[language]
+ elif language in complex_language_mappings:
+ (language2, script2, region2) = complex_language_mappings[language]
+ (language, script, region) = (
+ language2,
+ script if script else script2,
+ region if region else region2,
+ )
+
+ # Map deprecated region subtags.
+ if region in region_mappings:
+ region = region_mappings[region]
+ else:
+ # Assume no complex region mappings are needed for now.
+ assert (
+ region not in complex_region_mappings
+ ), "unexpected region with complex mappings: {}".format(region)
+
+ return (language, script, region)
+
+ # https://unicode.org/reports/tr35/#Likely_Subtags
+
+ def addLikelySubtags(tag):
+ # Step 1: Canonicalize.
+ (language, script, region) = canonical(tag)
+ if script == "Zzzz":
+ script = None
+ if region == "ZZ":
+ region = None
+
+ # Step 2: Lookup.
+ searches = (
+ (language, script, region),
+ (language, None, region),
+ (language, script, None),
+ (language, None, None),
+ ("und", script, None),
+ )
+ search = next(search for search in searches if search in likely_subtags)
+
+ (language_s, script_s, region_s) = search
+ (language_m, script_m, region_m) = likely_subtags[search]
+
+ # Step 3: Return.
+ return (
+ language if language != language_s else language_m,
+ script if script != script_s else script_m,
+ region if region != region_s else region_m,
+ )
+
+ # https://unicode.org/reports/tr35/#Likely_Subtags
+ def removeLikelySubtags(tag):
+ # Step 1: Add likely subtags.
+ max = addLikelySubtags(tag)
+
+ # Step 2: Remove variants (doesn't apply here).
+
+ # Step 3: Find a match.
+ (language, script, region) = max
+ for trial in (
+ (language, None, None),
+ (language, None, region),
+ (language, script, None),
+ ):
+ if addLikelySubtags(trial) == max:
+ return trial
+
+ # Step 4: Return maximized if no match found.
+ return max
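+
+    # Worked example, assuming typical CLDR likely-subtags data: for the tag
+    # ("zh", None, "TW"), addLikelySubtags() finds the "zh-TW" entry and fills
+    # in the missing script, giving ("zh", "Hant", "TW"); removeLikelySubtags()
+    # then re-maximizes each shorter candidate and returns the first one that
+    # round-trips, i.e. ("zh", None, "TW").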
+
+ def likely_canonical(from_tag, to_tag):
+ # Canonicalize the input tag.
+ from_tag = canonical(from_tag)
+
+ # Update the expected result if necessary.
+ if from_tag in likely_subtags:
+ to_tag = likely_subtags[from_tag]
+
+ # Canonicalize the expected output.
+ to_canonical = canonical(to_tag)
+
+ # Sanity check: This should match the result of |addLikelySubtags|.
+ assert to_canonical == addLikelySubtags(from_tag)
+
+ return to_canonical
+
+ # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
+ likely_subtags_canonical = {
+ k: likely_canonical(k, v) for (k, v) in likely_subtags.items()
+ }
+
+ # Add test data for |Intl.Locale.prototype.maximize()|.
+ writeMappingsVar(
+ println,
+ {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
+ "maxLikelySubtags",
+ "Extracted from likelySubtags.xml.",
+ source,
+ url,
+ )
+
+ # Use the maximalized tags as the input for the remove likely-subtags test.
+ minimized = {
+ tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()
+ }
+
+ # Add test data for |Intl.Locale.prototype.minimize()|.
+ writeMappingsVar(
+ println,
+ {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
+ "minLikelySubtags",
+ "Extracted from likelySubtags.xml.",
+ source,
+ url,
+ )
+
+ println(
+ """
+for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
+ assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
+}"""
+ )
+
+ println(
+ """
+for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
+ assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
+}"""
+ )
+
+ println(
+ """
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);"""
+ )
+
+
+def readCLDRVersionFromICU():
+ icuDir = os.path.join(topsrcdir, "intl/icu/source")
+ if not os.path.isdir(icuDir):
+ raise RuntimeError("not a directory: {}".format(icuDir))
+
+ reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}')
+
+    version = None
+    for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")):
+ m = reVersion.match(line)
+ if m:
+ version = m.group(1)
+ break
+
+ if version is None:
+ raise RuntimeError("can't resolve CLDR version")
+
+ return version
+
+
+def updateCLDRLangTags(args):
+ """ Update the LanguageTagGenerated.cpp file. """
+ version = args.version
+ url = args.url
+ out = args.out
+ filename = args.file
+
+ # Determine current CLDR version from ICU.
+ if version is None:
+ version = readCLDRVersionFromICU()
+
+ url = url.replace("<VERSION>", version)
+
+ print("Arguments:")
+ print("\tCLDR version: %s" % version)
+ print("\tDownload url: %s" % url)
+ if filename is not None:
+ print("\tLocal CLDR core.zip file: %s" % filename)
+ print("\tOutput file: %s" % out)
+ print("")
+
+ data = {
+ "version": version,
+ }
+
+ def readFiles(cldr_file):
+ with ZipFile(cldr_file) as zip_file:
+ data.update(readSupplementalData(zip_file))
+ data.update(readUnicodeExtensions(zip_file))
+
+ print("Processing CLDR data...")
+ if filename is not None:
+ print("Always make sure you have the newest CLDR core.zip!")
+ with open(filename, "rb") as cldr_file:
+ readFiles(cldr_file)
+ else:
+ print("Downloading CLDR core.zip...")
+ with closing(urlopen(url)) as cldr_file:
+ cldr_data = io.BytesIO(cldr_file.read())
+ readFiles(cldr_data)
+
+ print("Writing Intl data...")
+ with io.open(out, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ writeCLDRLanguageTagData(println, data, url)
+
+ print("Writing Intl test data...")
+ js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
+ test_file = os.path.join(
+ js_src_builtin_intl_dir,
+ "../../tests/non262/Intl/Locale/likely-subtags-generated.js",
+ )
+ with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println("// |reftest| skip-if(!this.hasOwnProperty('Intl'))")
+ writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
+
+
+def flines(filepath, encoding="utf-8"):
+ """ Open filepath and iterate over its content. """
+ with io.open(filepath, mode="r", encoding=encoding) as f:
+ for line in f:
+ yield line
+
+
+@total_ordering
+class Zone(object):
+ """ Time zone with optional file name. """
+
+ def __init__(self, name, filename=""):
+ self.name = name
+ self.filename = filename
+
+ def __eq__(self, other):
+ return hasattr(other, "name") and self.name == other.name
+
+ def __lt__(self, other):
+ return self.name < other.name
+
+ def __hash__(self):
+ return hash(self.name)
+
+ def __str__(self):
+ return self.name
+
+ def __repr__(self):
+ return self.name
+
+
+class TzDataDir(object):
+ """ tzdata source from a directory. """
+
+ def __init__(self, obj):
+ self.name = partial(os.path.basename, obj)
+ self.resolve = partial(os.path.join, obj)
+ self.basename = os.path.basename
+ self.isfile = os.path.isfile
+ self.listdir = partial(os.listdir, obj)
+ self.readlines = flines
+
+
+class TzDataFile(object):
+ """ tzdata source from a file (tar or gzipped). """
+
+ def __init__(self, obj):
+ self.name = lambda: os.path.splitext(
+ os.path.splitext(os.path.basename(obj))[0]
+ )[0]
+ self.resolve = obj.getmember
+ self.basename = attrgetter("name")
+ self.isfile = tarfile.TarInfo.isfile
+ self.listdir = obj.getnames
+ self.readlines = partial(self._tarlines, obj)
+
+ def _tarlines(self, tar, m):
+ with closing(tar.extractfile(m)) as f:
+ for line in f:
+ yield line.decode("utf-8")
+
+
+def validateTimeZones(zones, links):
+ """ Validate the zone and link entries. """
+ linkZones = set(links.keys())
+ intersect = linkZones.intersection(zones)
+ if intersect:
+ raise RuntimeError("Links also present in zones: %s" % intersect)
+
+ zoneNames = {z.name for z in zones}
+ linkTargets = set(links.values())
+ if not linkTargets.issubset(zoneNames):
+ raise RuntimeError(
+ "Link targets not found: %s" % linkTargets.difference(zoneNames)
+ )
+
+
+def partition(iterable, *predicates):
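+    """Lazily split |iterable| with the given predicates.
+
+    Each predicate partitions what the previous predicates rejected; with a
+    single predicate this is simply, e.g.:
+
+        (evens, odds) = partition(range(10), lambda x: x % 2 == 0)
+    """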
+ def innerPartition(pred, it):
+ it1, it2 = tee(it)
+ return (filter(pred, it1), filterfalse(pred, it2))
+
+ if len(predicates) == 0:
+ return iterable
+ (left, right) = innerPartition(predicates[0], iterable)
+ if len(predicates) == 1:
+ return (left, right)
+ return tuple([left] + list(partition(right, *predicates[1:])))
+
+
+def listIANAFiles(tzdataDir):
+ def isTzFile(d, m, f):
+ return m(f) and d.isfile(d.resolve(f))
+
+ return filter(
+ partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match),
+ tzdataDir.listdir(),
+ )
+
+
+def readIANAFiles(tzdataDir, files):
+ """ Read all IANA time zone files from the given iterable. """
+    nameSyntax = r"[\w/+\-]+"
+ pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax)
+ pLink = re.compile(
+ r"Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?" % (nameSyntax, nameSyntax)
+ )
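+
+    # Illustrative tzdata lines these patterns are meant to match (not copied
+    # from a real database file):
+    #
+    #   Zone America/New_York  -5:00  US  E%sT
+    #   Link America/Los_Angeles  US/Pacific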
+
+ def createZone(line, fname):
+ match = pZone.match(line)
+ name = match.group("name")
+ return Zone(name, fname)
+
+ def createLink(line, fname):
+ match = pLink.match(line)
+ (name, target) = match.group("name", "target")
+ return (Zone(name, fname), target)
+
+ zones = set()
+ links = dict()
+ for filename in files:
+ filepath = tzdataDir.resolve(filename)
+ for line in tzdataDir.readlines(filepath):
+ if line.startswith("Zone"):
+ zones.add(createZone(line, filename))
+ if line.startswith("Link"):
+ (link, target) = createLink(line, filename)
+ links[link] = target
+
+ return (zones, links)
+
+
+def readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory):
+ """ Read the IANA time zone information from `tzdataDir`. """
+
+ backzoneFiles = {"backzone"}
+ (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
+
+ # Read zone and link infos.
+ (zones, links) = readIANAFiles(tzdataDir, tzfiles)
+ (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
+
+ # Remove the placeholder time zone "Factory".
+ if ignoreFactory:
+ zones.remove(Zone("Factory"))
+
+ # Merge with backzone data.
+ if not ignoreBackzone:
+ zones |= backzones
+ links = {
+ name: target for name, target in links.items() if name not in backzones
+ }
+ links.update(backlinks)
+
+ validateTimeZones(zones, links)
+
+ return (zones, links)
+
+
+def readICUResourceFile(filename):
+ """Read an ICU resource file.
+
+    Yields (<table-name>, <value>) for each table.
+ """
+
+ numberValue = r"-?\d+"
+ stringValue = r'".+?"'
+
+ def asVector(val):
+ return r"%s(?:\s*,\s*%s)*" % (val, val)
+
+ numberVector = asVector(numberValue)
+ stringVector = asVector(stringValue)
+
+ reNumberVector = re.compile(numberVector)
+ reStringVector = re.compile(stringVector)
+ reNumberValue = re.compile(numberValue)
+ reStringValue = re.compile(stringValue)
+
+ def parseValue(value):
+ m = reNumberVector.match(value)
+ if m:
+ return [int(v) for v in reNumberValue.findall(value)]
+ m = reStringVector.match(value)
+ if m:
+ return [v[1:-1] for v in reStringValue.findall(value)]
+ raise RuntimeError("unknown value type: %s" % value)
+
+ def extractValue(values):
+ if len(values) == 0:
+ return None
+ if len(values) == 1:
+ return values[0]
+ return values
+
+ def line(*args):
+ maybeMultiComments = r"(?:/\*[^*]*\*/)*"
+ maybeSingleComment = r"(?://.*)?"
+ lineStart = "^%s" % maybeMultiComments
+        lineEnd = r"%s\s*%s$" % (maybeMultiComments, maybeSingleComment)
+ return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd])))
+
+ tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)'
+ tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector)
+
+ reStartTable = line(tableName, r"\{")
+ reEndTable = line(r"\}")
+ reSingleValue = line(r",?", tableValue, r",?")
+ reCompactTable = line(tableName, r"\{", tableValue, r"\}")
+ reEmptyLine = line()
+
+ tables = []
+
+ def currentTable():
+ return "|".join(tables)
+
+ values = []
+ for line in flines(filename, "utf-8-sig"):
+ line = line.strip()
+ if line == "":
+ continue
+
+ m = reEmptyLine.match(line)
+ if m:
+ continue
+
+ m = reStartTable.match(line)
+ if m:
+ assert len(values) == 0
+ tables.append(m.group("name"))
+ continue
+
+ m = reEndTable.match(line)
+ if m:
+ yield (currentTable(), extractValue(values))
+ tables.pop()
+ values = []
+ continue
+
+ m = reCompactTable.match(line)
+ if m:
+ assert len(values) == 0
+ tables.append(m.group("name"))
+ yield (currentTable(), extractValue(parseValue(m.group("value"))))
+ tables.pop()
+ continue
+
+ m = reSingleValue.match(line)
+ if m and tables:
+ values.extend(parseValue(m.group("value")))
+ continue
+
+ raise RuntimeError("unknown entry: %s" % line)
+
+
+def readICUTimeZonesFromTimezoneTypes(icuTzDir):
+ """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt
+ and returns the tuple (zones, links).
+ """
+ typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|"
+ typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|"
+
+ def toTimeZone(name):
+ return Zone(name.replace(":", "/"))
+
+ zones = set()
+ links = dict()
+
+ for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")):
+ if name.startswith(typeMapTimeZoneKey):
+ zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :]))
+ if name.startswith(typeAliasTimeZoneKey):
+ links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value
+
+ validateTimeZones(zones, links)
+
+ return (zones, links)
+
+
+def readICUTimeZonesFromZoneInfo(icuTzDir):
+ """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt
+ and returns the tuple (zones, links).
+ """
+ zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
+ linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
+ namesKey = "zoneinfo64:table(nofallback)|Names"
+
+ tzId = 0
+ tzLinks = dict()
+ tzNames = []
+
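+    # Zones appear in order as entries of the "Zones" array; a link is stored
+    # as an ":int" index into the "Names" array instead of a full zone table,
+    # so track the current array position in |tzId| and resolve both ends of a
+    # link through |tzNames| afterwards.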
+ for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
+ if name == zoneKey:
+ tzId += 1
+ elif name == linkKey:
+ tzLinks[tzId] = int(value)
+ tzId += 1
+ elif name == namesKey:
+ tzNames.extend(value)
+
+ links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
+ zones = {Zone(v) for v in tzNames if Zone(v) not in links}
+
+ validateTimeZones(zones, links)
+
+ return (zones, links)
+
+
+def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
+    # zoneinfo64.txt contains the time zones supported by ICU. This data is
+    # generated from tzdata files; it doesn't include "backzone" in stock ICU.
+ (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir)
+
+ # timezoneTypes.txt contains the canonicalization information for ICU. This
+ # data is generated from CLDR files. It includes data about time zones from
+ # tzdata's "backzone" file.
+ (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)
+
+ # Remove the placeholder time zone "Factory".
+ # See also <https://github.com/eggert/tz/blob/master/factory>.
+ if ignoreFactory:
+ zoneinfoZones.remove(Zone("Factory"))
+
+ # Remove the ICU placeholder time zone "Etc/Unknown".
+ # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
+ for zones in (zoneinfoZones, typesZones):
+ zones.remove(Zone("Etc/Unknown"))
+
+ # Remove any outdated ICU links.
+ for links in (zoneinfoLinks, typesLinks):
+ for zone in otherICULegacyLinks().keys():
+ if zone not in links:
+ raise KeyError(f"Can't remove non-existent link from '{zone}'")
+ del links[zone]
+
+ # Information in zoneinfo64 should be a superset of timezoneTypes.
+ def inZoneInfo64(zone):
+ return zone in zoneinfoZones or zone in zoneinfoLinks
+
+ notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
+ if notFoundInZoneInfo64:
+ raise RuntimeError(
+ "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
+ )
+
+ notFoundInZoneInfo64 = [
+ zone for zone in typesLinks.keys() if not inZoneInfo64(zone)
+ ]
+ if notFoundInZoneInfo64:
+ raise RuntimeError(
+ "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
+ )
+
+    # zoneinfo64.txt only defines the time zones supported by ICU; the canonicalization
+ # rules are defined through timezoneTypes.txt. Merge both to get the actual zones
+ # and links used by ICU.
+ icuZones = set(
+ chain(
+ (zone for zone in zoneinfoZones if zone not in typesLinks),
+ (zone for zone in typesZones),
+ )
+ )
+ icuLinks = dict(
+ chain(
+ (
+ (zone, target)
+ for (zone, target) in zoneinfoLinks.items()
+ if zone not in typesZones
+ ),
+ ((zone, target) for (zone, target) in typesLinks.items()),
+ )
+ )
+
+ return (icuZones, icuLinks)
+
+
+def readICULegacyZones(icuDir):
+ """Read the ICU legacy time zones from `icuTzDir`/tools/tzcode/icuzones
+ and returns the tuple (zones, links).
+ """
+ tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))
+
+ # Per spec we must recognize only IANA time zones and links, but ICU
+ # recognizes various legacy, non-IANA time zones and links. Compute these
+ # non-IANA time zones and links.
+
+ # Most legacy, non-IANA time zones and links are in the icuzones file.
+ (zones, links) = readIANAFiles(tzdir, ["icuzones"])
+
+ # Remove the ICU placeholder time zone "Etc/Unknown".
+ # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
+ zones.remove(Zone("Etc/Unknown"))
+
+ # A handful of non-IANA zones/links are not in icuzones and must be added
+ # manually so that we won't invoke ICU with them.
+ for (zone, target) in otherICULegacyLinks().items():
+ if zone in links:
+ if links[zone] != target:
+ raise KeyError(
+ f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'"
+ )
+ else:
+ print(
+ f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()"
+ )
+ links[zone] = target
+
+ return (zones, links)
+
+
+def otherICULegacyLinks():
+ """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time
+ zones with the exception of time zones which are removed by IANA after an
+ ICU release.
+
+    For example, ICU 67 uses tzdata2018i, but tzdata2020b removed the link from
+ "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates
+ don't include modified icuzones files, so we must manually record any IANA
+ modifications here.
+
+    After an ICU update, entries that are no longer needed can be removed from
+    this function by checking whether they are now included in icuzones.
+ """
+
+ return {
+ # tzdata2020b removed the link US/Pacific-New -> America/Los_Angeles.
+ Zone("US/Pacific-New"): "America/Los_Angeles",
+ }
+
+
+def icuTzDataVersion(icuTzDir):
+ """ Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt. """
+
+ def searchInFile(pattern, f):
+ p = re.compile(pattern)
+ for line in flines(f, "utf-8-sig"):
+ m = p.search(line)
+ if m:
+ return m.group(1)
+ return None
+
+ zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt")
+ if not os.path.isfile(zoneinfo):
+ raise RuntimeError("file not found: %s" % zoneinfo)
+    version = searchInFile(r"^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo)
+ if version is None:
+ raise RuntimeError(
+ "%s does not contain a valid tzdata version string" % zoneinfo
+ )
+ return version
+
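+# For illustration, zoneinfo64.txt carries the tzdata version in a header
+# comment of the form
+#
+#   //  tz version:  2019c
+#
+# from which the regular expression above extracts "2019c". (The version
+# shown here is only an example.)
+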
+
+def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone):
+ """ Find incorrect ICU zone entries. """
+
+ def isIANATimeZone(zone):
+ return zone in ianaZones or zone in ianaLinks
+
+ def isICUTimeZone(zone):
+ return zone in icuZones or zone in icuLinks
+
+ def isICULink(zone):
+ return zone in icuLinks
+
+ # All IANA zones should be present in ICU.
+ missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)]
+ # Normally zones in backzone are also present as links in one of the other
+ # time zone files. The only exception to this rule is the Asia/Hanoi time
+    # zone, which is only present in the backzone file.
+ expectedMissing = [] if ignoreBackzone else [Zone("Asia/Hanoi")]
+ if missingTimeZones != expectedMissing:
+ raise RuntimeError(
+ "Not all zones are present in ICU, did you forget "
+ "to run intl/update-tzdata.sh? %s" % missingTimeZones
+ )
+
+    # Check for zones which are only present in ICU.
+ additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)]
+ if additionalTimeZones:
+ raise RuntimeError(
+ "Additional zones present in ICU, did you forget "
+ "to run intl/update-tzdata.sh? %s" % additionalTimeZones
+ )
+
+ # Zones which are marked as links in ICU.
+ result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))
+
+ # Remove unnecessary UTC mappings.
+ utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
+ result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)
+
+ return sorted(result, key=itemgetter(0))
+
+
+def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
+ """ Find incorrect ICU link entries. """
+
+ def isIANATimeZone(zone):
+ return zone in ianaZones or zone in ianaLinks
+
+ def isICUTimeZone(zone):
+ return zone in icuZones or zone in icuLinks
+
+ def isICULink(zone):
+ return zone in icuLinks
+
+ def isICUZone(zone):
+ return zone in icuZones
+
+ # All links should be present in ICU.
+ missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
+ if missingTimeZones:
+ raise RuntimeError(
+ "Not all zones are present in ICU, did you forget "
+ "to run intl/update-tzdata.sh? %s" % missingTimeZones
+ )
+
+    # Check for links which are only present in ICU.
+ additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
+ if additionalTimeZones:
+ raise RuntimeError(
+ "Additional links present in ICU, did you forget "
+ "to run intl/update-tzdata.sh? %s" % additionalTimeZones
+ )
+
+ result = chain(
+ # IANA links which have a different target in ICU.
+ (
+ (zone, target, icuLinks[zone])
+ for (zone, target) in ianaLinks.items()
+ if isICULink(zone) and target != icuLinks[zone]
+ ),
+ # IANA links which are zones in ICU.
+ (
+ (zone, target, zone.name)
+ for (zone, target) in ianaLinks.items()
+ if isICUZone(zone)
+ ),
+ )
+
+ # Remove unnecessary UTC mappings.
+ utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
+ result = (
+ (zone, target, icuTarget)
+ for (zone, target, icuTarget) in result
+ if target not in utcnames or icuTarget not in utcnames
+ )
+
+ return sorted(result, key=itemgetter(0))
+
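+# For illustration (entries are hypothetical): findIncorrectICUZones() returns
+# a sorted list of (zone, icuTarget) pairs for IANA Zones which ICU treats as
+# Links, e.g. [(Zone("Europe/Oslo"), "Europe/Berlin")], and
+# findIncorrectICULinks() returns (link, ianaTarget, icuTarget) triples for
+# IANA Links which ICU resolves to a different target, e.g.
+# [(Zone("Europe/Belfast"), "Europe/London", "Europe/Dublin")]. Both lists
+# feed the tables emitted by processTimeZones() below.
+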
+
+generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT."
+tzdataVersionComment = "// tzdata version = {0}"
+
+
+def processTimeZones(
+ tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignoreFactory, out
+):
+ """ Read the time zone info and create a new time zone cpp file. """
+ print("Processing tzdata mapping...")
+ (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreBackzone, ignoreFactory)
+ (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
+ (legacyZones, legacyLinks) = readICULegacyZones(icuDir)
+
+ # Remove all legacy ICU time zones.
+ icuZones = {zone for zone in icuZones if zone not in legacyZones}
+ icuLinks = {
+ zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
+ }
+
+ incorrectZones = findIncorrectICUZones(
+ ianaZones, ianaLinks, icuZones, icuLinks, ignoreBackzone
+ )
+ if not incorrectZones:
+ print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
+ print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")
+
+ incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
+ if not incorrectLinks:
+ print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
+ print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")
+
+ print("Writing Intl tzdata file...")
+ with io.open(out, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println(generatedFileWarning)
+ println(tzdataVersionComment.format(version))
+ println("")
+
+ println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
+ println("#define builtin_intl_TimeZoneDataGenerated_h")
+ println("")
+
+ println("namespace js {")
+ println("namespace timezone {")
+ println("")
+
+ println("// Format:")
+ println('// "ZoneName" // ICU-Name [time zone file]')
+ println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
+ for (zone, icuZone) in incorrectZones:
+ println(' "%s", // %s [%s]' % (zone, icuZone, zone.filename))
+ println("};")
+ println("")
+
+ println("// Format:")
+ println('// "LinkName", "Target" // ICU-Target [time zone file]')
+ println("struct LinkAndTarget")
+ println("{")
+ println(" const char* const link;")
+ println(" const char* const target;")
+ println("};")
+ println("")
+ println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
+ for (zone, target, icuTarget) in incorrectLinks:
+ println(
+ ' { "%s", "%s" }, // %s [%s]'
+ % (zone, target, icuTarget, zone.filename)
+ )
+ println("};")
+ println("")
+
+ println(
+ "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
+ )
+ println("// disallow the old and deprecated System V time zones.")
+ println(
+ "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
+ ) # NOQA: E501
+ println("const char* const legacyICUTimeZones[] = {")
+ for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
+ println(' "%s",' % zone)
+ println("};")
+ println("")
+
+ println("} // namespace timezone")
+ println("} // namespace js")
+ println("")
+ println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")
+
+
+def updateBackzoneLinks(tzdataDir, links):
+ def withZone(fn):
+ return lambda zone_target: fn(zone_target[0])
+
+ (backzoneZones, backzoneLinks) = readIANAFiles(tzdataDir, ["backzone"])
+ (stableZones, updatedLinks, updatedZones) = partition(
+ links.items(),
+ # Link not changed in backzone.
+ withZone(lambda zone: zone not in backzoneLinks and zone not in backzoneZones),
+ # Link has a new target.
+ withZone(lambda zone: zone in backzoneLinks),
+ )
+ # Keep stable zones and links with updated target.
+ return dict(
+ chain(
+ stableZones,
+ map(withZone(lambda zone: (zone, backzoneLinks[zone])), updatedLinks),
+ )
+ )
+
+
+def generateTzDataLinkTestContent(testDir, version, fileName, description, links):
+ with io.open(
+ os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
+ ) as f:
+ println = partial(print, file=f)
+
+ println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
+ println("")
+ println(generatedFileWarning)
+ println(tzdataVersionComment.format(version))
+ println(
+ """
+const tzMapper = [
+ x => x,
+ x => x.toUpperCase(),
+ x => x.toLowerCase(),
+];
+"""
+ )
+
+ println(description)
+ println("const links = {")
+ for (zone, target) in sorted(links, key=itemgetter(0)):
+ println(' "%s": "%s",' % (zone, target))
+ println("};")
+
+ println(
+ """
+for (let [linkName, target] of Object.entries(links)) {
+ if (target === "Etc/UTC" || target === "Etc/GMT")
+ target = "UTC";
+
+ for (let map of tzMapper) {
+ let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
+ let resolvedTimeZone = dtf.resolvedOptions().timeZone;
+ assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
+ }
+}
+"""
+ )
+ println(
+ """
+if (typeof reportCompare === "function")
+ reportCompare(0, 0, "ok");
+"""
+ )
+
+
+def generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
+ (zones, links) = readIANAFiles(tzdataDir, ["backward"])
+ assert len(zones) == 0
+
+ if not ignoreBackzone:
+ links = updateBackzoneLinks(tzdataDir, links)
+
+ generateTzDataLinkTestContent(
+ testDir,
+ version,
+ "timeZone_backward_links.js",
+ "// Link names derived from IANA Time Zone Database, backward file.",
+ links.items(),
+ )
+
+
+def generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir):
+ tzfiles = filterfalse(
+ {"backward", "backzone"}.__contains__, listIANAFiles(tzdataDir)
+ )
+ (zones, links) = readIANAFiles(tzdataDir, tzfiles)
+
+ if not ignoreBackzone:
+ links = updateBackzoneLinks(tzdataDir, links)
+
+ generateTzDataLinkTestContent(
+ testDir,
+ version,
+ "timeZone_notbackward_links.js",
+ "// Link names derived from IANA Time Zone Database, excluding backward file.",
+ links.items(),
+ )
+
+
+def generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir):
+ backzoneFiles = {"backzone"}
+ (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
+
+ # Read zone and link infos.
+ (zones, links) = readIANAFiles(tzdataDir, tzfiles)
+ (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
+
+ if not ignoreBackzone:
+ comment = """\
+// This file was generated with historical, pre-1970 backzone information
+// respected. Therefore, every zone key listed below is its own Zone, not
+// a Link to the modern-day target it would have if backzone were ignored.
+
+"""
+ else:
+ comment = """\
+// This file was generated while ignoring historical, pre-1970 backzone
+// information. Therefore, every zone key listed below is part of a Link
+// whose target is the corresponding value.
+
+"""
+
+ generateTzDataLinkTestContent(
+ testDir,
+ version,
+ "timeZone_backzone.js",
+ comment + "// Backzone zones derived from IANA Time Zone Database.",
+ (
+ (zone, zone if not ignoreBackzone else links[zone])
+ for zone in backzones
+ if zone in links
+ ),
+ )
+
+
+def generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir):
+ backzoneFiles = {"backzone"}
+ (bkfiles, tzfiles) = partition(listIANAFiles(tzdataDir), backzoneFiles.__contains__)
+
+ # Read zone and link infos.
+ (zones, links) = readIANAFiles(tzdataDir, tzfiles)
+ (backzones, backlinks) = readIANAFiles(tzdataDir, bkfiles)
+
+ if not ignoreBackzone:
+ comment = """\
+// This file was generated with historical, pre-1970 backzone information
+// respected. Therefore, every zone key listed below points to a target
+// in the backzone file and not to the modern-day target it would have if
+// backzone were ignored.
+
+"""
+ else:
+ comment = """\
+// This file was generated while ignoring historical, pre-1970 backzone
+// information. Therefore, every zone key listed below is part of a Link
+// whose target is the corresponding value ignoring any backzone entries.
+
+"""
+
+ generateTzDataLinkTestContent(
+ testDir,
+ version,
+ "timeZone_backzone_links.js",
+ comment + "// Backzone links derived from IANA Time Zone Database.",
+ (
+ (zone, target if not ignoreBackzone else links[zone])
+ for (zone, target) in backlinks.items()
+ ),
+ )
+
+
+def generateTzDataTestVersion(tzdataDir, version, testDir):
+ fileName = "timeZone_version.js"
+
+ with io.open(
+ os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
+ ) as f:
+ println = partial(print, file=f)
+
+ println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
+ println("")
+ println(generatedFileWarning)
+ println(tzdataVersionComment.format(version))
+ println("""const tzdata = "{0}";""".format(version))
+
+ println(
+ """
+if (typeof getICUOptions === "undefined") {
+ var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
+}
+
+var options = getICUOptions();
+
+assertEq(options.tzdata, tzdata);
+
+if (typeof reportCompare === "function")
+ reportCompare(0, 0, "ok");
+"""
+ )
+
+
+def generateTzDataTests(tzdataDir, version, ignoreBackzone, testDir):
+ generateTzDataTestBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
+ generateTzDataTestNotBackwardLinks(tzdataDir, version, ignoreBackzone, testDir)
+ generateTzDataTestBackzone(tzdataDir, version, ignoreBackzone, testDir)
+ generateTzDataTestBackzoneLinks(tzdataDir, version, ignoreBackzone, testDir)
+ generateTzDataTestVersion(tzdataDir, version, testDir)
+
+
+def updateTzdata(topsrcdir, args):
+ """ Update the time zone cpp file. """
+
+ icuDir = os.path.join(topsrcdir, "intl/icu/source")
+ if not os.path.isdir(icuDir):
+ raise RuntimeError("not a directory: %s" % icuDir)
+
+ icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
+ if not os.path.isdir(icuTzDir):
+ raise RuntimeError("not a directory: %s" % icuTzDir)
+
+ dateTimeFormatTestDir = os.path.join(
+ topsrcdir, "js/src/tests/non262/Intl/DateTimeFormat"
+ )
+ if not os.path.isdir(dateTimeFormatTestDir):
+ raise RuntimeError("not a directory: %s" % dateTimeFormatTestDir)
+
+ tzDir = args.tz
+ if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
+ raise RuntimeError("not a directory or file: %s" % tzDir)
+ ignoreBackzone = args.ignore_backzone
+ # TODO: Accept or ignore the placeholder time zone "Factory"?
+ ignoreFactory = False
+ out = args.out
+
+ version = icuTzDataVersion(icuTzDir)
+ url = (
+ "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
+ )
+
+ print("Arguments:")
+ print("\ttzdata version: %s" % version)
+ print("\ttzdata URL: %s" % url)
+ print("\ttzdata directory|file: %s" % tzDir)
+ print("\tICU directory: %s" % icuDir)
+ print("\tICU timezone directory: %s" % icuTzDir)
+ print("\tIgnore backzone file: %s" % ignoreBackzone)
+ print("\tOutput file: %s" % out)
+ print("")
+
+ def updateFrom(f):
+ if os.path.isfile(f) and tarfile.is_tarfile(f):
+ with tarfile.open(f, "r:*") as tar:
+ processTimeZones(
+ TzDataFile(tar),
+ icuDir,
+ icuTzDir,
+ version,
+ ignoreBackzone,
+ ignoreFactory,
+ out,
+ )
+ generateTzDataTests(
+ TzDataFile(tar), version, ignoreBackzone, dateTimeFormatTestDir
+ )
+ elif os.path.isdir(f):
+ processTimeZones(
+ TzDataDir(f),
+ icuDir,
+ icuTzDir,
+ version,
+ ignoreBackzone,
+ ignoreFactory,
+ out,
+ )
+ generateTzDataTests(
+ TzDataDir(f), version, ignoreBackzone, dateTimeFormatTestDir
+ )
+ else:
+ raise RuntimeError("unknown format")
+
+ if tzDir is None:
+ print("Downloading tzdata file...")
+ with closing(urlopen(url)) as tzfile:
+ fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
+ with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
+ print("File stored in %s" % tztmpfile.name)
+ tztmpfile.write(tzfile.read())
+ tztmpfile.flush()
+ updateFrom(tztmpfile.name)
+ else:
+ updateFrom(tzDir)
+
+
+def readCurrencyFile(tree):
+ reCurrency = re.compile(r"^[A-Z]{3}$")
+ reIntMinorUnits = re.compile(r"^\d+$")
+
+ for country in tree.iterfind(".//CcyNtry"):
+ # Skip entry if no currency information is available.
+ currency = country.findtext("Ccy")
+ if currency is None:
+ continue
+ assert reCurrency.match(currency)
+
+ minorUnits = country.findtext("CcyMnrUnts")
+ assert minorUnits is not None
+
+        # Skip entries with a non-numeric minorUnits value or the default of 2 digits.
+ if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
+ currencyName = country.findtext("CcyNm")
+ countryName = country.findtext("CtryNm")
+ yield (currency, int(minorUnits), currencyName, countryName)
+
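+# For illustration, an ISO 4217 "list one" entry of the following shape (JPY
+# uses zero decimal digits, so it isn't skipped as a default two-digit
+# currency):
+#
+#   <CcyNtry>
+#     <CtryNm>JAPAN</CtryNm>
+#     <CcyNm>Yen</CcyNm>
+#     <Ccy>JPY</Ccy>
+#     <CcyMnrUnts>0</CcyMnrUnts>
+#   </CcyNtry>
+#
+# makes readCurrencyFile() yield ("JPY", 0, "Yen", "JAPAN").
+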
+
+def writeCurrencyFile(published, currencies, out):
+ with io.open(out, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println(generatedFileWarning)
+ println("// Version: {}".format(published))
+
+ println(
+ """
+/**
+ * Mapping from currency codes to the number of decimal digits used for them.
+ * Default is 2 digits.
+ *
+ * Spec: ISO 4217 Currency and Funds Code List.
+ * http://www.currency-iso.org/en/home/tables/table-a1.html
+ */"""
+ )
+ println("var currencyDigits = {")
+ for (currency, entries) in groupby(
+ sorted(currencies, key=itemgetter(0)), itemgetter(0)
+ ):
+ for (_, minorUnits, currencyName, countryName) in entries:
+ println(" // {} ({})".format(currencyName, countryName))
+ println(" {}: {},".format(currency, minorUnits))
+ println("};")
+
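+# The generated file then contains one commented entry per non-default
+# currency, e.g. (values taken from the sketch above):
+#
+#   var currencyDigits = {
+#       // Yen (JAPAN)
+#       JPY: 0,
+#   };
+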
+
+def updateCurrency(topsrcdir, args):
+ """ Update the CurrencyDataGenerated.js file. """
+ import xml.etree.ElementTree as ET
+ from random import randint
+
+ url = args.url
+ out = args.out
+ filename = args.file
+
+ print("Arguments:")
+ print("\tDownload url: %s" % url)
+ print("\tLocal currency file: %s" % filename)
+ print("\tOutput file: %s" % out)
+ print("")
+
+ def updateFrom(currencyFile):
+ print("Processing currency code list file...")
+ tree = ET.parse(currencyFile)
+ published = tree.getroot().attrib["Pblshd"]
+ currencies = readCurrencyFile(tree)
+
+ print("Writing CurrencyData file...")
+ writeCurrencyFile(published, currencies, out)
+
+ if filename is not None:
+ print("Always make sure you have the newest currency code list file!")
+ updateFrom(filename)
+ else:
+ print("Downloading currency & funds code list...")
+ request = UrlRequest(url)
+ request.add_header(
+ "User-agent",
+ "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
+ randint(1, 999)
+ ),
+ )
+ with closing(urlopen(request)) as currencyFile:
+ fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
+ with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
+ print("File stored in %s" % currencyTmpFile.name)
+ currencyTmpFile.write(currencyFile.read())
+ currencyTmpFile.flush()
+ updateFrom(currencyTmpFile.name)
+
+
+def writeUnicodeExtensionsMappings(println, mapping, extension):
+ println(
+ """
+template <size_t Length>
+static inline bool Is{0}Key(
+ mozilla::Span<const char> key, const char (&str)[Length]) {{
+ static_assert(Length == {0}KeyLength + 1,
+ "{0} extension key is two characters long");
+ return memcmp(key.data(), str, Length - 1) == 0;
+}}
+
+template <size_t Length>
+static inline bool Is{0}Type(
+ mozilla::Span<const char> type, const char (&str)[Length]) {{
+ static_assert(Length > {0}KeyLength + 1,
+ "{0} extension type contains more than two characters");
+ return type.size() == (Length - 1) &&
+ memcmp(type.data(), str, Length - 1) == 0;
+}}
+""".format(
+ extension
+ ).rstrip(
+ "\n"
+ )
+ )
+
+ linear_search_max_length = 4
+
+ needs_binary_search = any(
+ len(replacements.items()) > linear_search_max_length
+ for replacements in mapping.values()
+ )
+
+ if needs_binary_search:
+ println(
+ """
+static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
+ MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
+ "unexpected null-character in string");
+
+ using UnsignedChar = unsigned char;
+ for (size_t i = 0; i < b.size(); i++) {{
+ // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
+ // we've reached the end of |a|, the below if-statement will always be true.
+ // That ensures we don't read past the end of |a|.
+ if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
+ return r;
+ }}
+ }}
+
+ // Return zero if both strings are equal or a negative number if |b| is a
+ // prefix of |a|.
+ return -int32_t(UnsignedChar(a[b.size()]));
+}}
+
+template <size_t Length>
+static inline const char* Search{0}Replacement(
+ const char* (&types)[Length], const char* (&aliases)[Length],
+ mozilla::Span<const char> type) {{
+
+ auto p = std::lower_bound(std::begin(types), std::end(types), type,
+ [](const auto& a, const auto& b) {{
+ return Compare{0}Type(a, b) < 0;
+ }});
+ if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
+ return aliases[std::distance(std::begin(types), p)];
+ }}
+ return nullptr;
+}}
+""".format(
+ extension
+ ).rstrip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+/**
+ * Mapping from deprecated BCP 47 {0} extension types to their preferred
+ * values.
+ *
+ * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ * Spec: https://www.unicode.org/reports/tr35/#t_Extension
+ */
+const char* js::intl::LanguageTag::replace{0}ExtensionType(
+ mozilla::Span<const char> key, mozilla::Span<const char> type) {{
+ MOZ_ASSERT(key.size() == {0}KeyLength);
+ MOZ_ASSERT(IsCanonicallyCased{0}Key(key));
+
+ MOZ_ASSERT(type.size() > {0}KeyLength);
+ MOZ_ASSERT(IsCanonicallyCased{0}Type(type));
+""".format(
+ extension
+ )
+ )
+
+ def to_hash_key(replacements):
+ return str(sorted(replacements.items()))
+
+ def write_array(subtags, name, length):
+ max_entries = (80 - len(" ")) // (length + len('"", '))
+
+ println(" static const char* {}[{}] = {{".format(name, len(subtags)))
+
+ for entries in grouper(subtags, max_entries):
+ entries = (
+ '"{}"'.format(tag).rjust(length + 2)
+ for tag in entries
+ if tag is not None
+ )
+ println(" {},".format(", ".join(entries)))
+
+ println(" };")
+
+ # Merge duplicate keys.
+ key_aliases = {}
+ for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
+ hash_key = to_hash_key(replacements)
+ if hash_key not in key_aliases:
+ key_aliases[hash_key] = []
+ else:
+ key_aliases[hash_key].append(key)
+
+ first_key = True
+ for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
+ hash_key = to_hash_key(replacements)
+ if key in key_aliases[hash_key]:
+ continue
+
+ cond = (
+ 'Is{}Key(key, "{}")'.format(extension, k)
+ for k in [key] + key_aliases[hash_key]
+ )
+
+ if_kind = "if" if first_key else "else if"
+ cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
+ println(
+ """
+ {} ({}) {{""".format(
+ if_kind, cond
+ ).strip(
+ "\n"
+ )
+ )
+ first_key = False
+
+ replacements = sorted(replacements.items(), key=itemgetter(0))
+
+ if len(replacements) > linear_search_max_length:
+ types = [t for (t, _) in replacements]
+ preferred = [r for (_, r) in replacements]
+ max_len = max(len(k) for k in types + preferred)
+
+ write_array(types, "types", max_len)
+ write_array(preferred, "aliases", max_len)
+ println(
+ """
+ return Search{}Replacement(types, aliases, type);
+""".format(
+ extension
+ ).strip(
+ "\n"
+ )
+ )
+ else:
+ for (type, replacement) in replacements:
+ println(
+ """
+ if (Is{}Type(type, "{}")) {{
+ return "{}";
+ }}""".format(
+ extension, type, replacement
+ ).strip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ }""".lstrip(
+ "\n"
+ )
+ )
+
+ println(
+ """
+ return nullptr;
+}
+""".strip(
+ "\n"
+ )
+ )
+
+
+def readICUUnitResourceFile(filepath):
+ """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
+ second entry the unit name.
+
+ Example:
+
+ root{
+ units{
+ compound{
+ }
+ coordinate{
+ }
+ length{
+ meter{
+ }
+ }
+ }
+ unitsNarrow:alias{"/LOCALE/unitsShort"}
+ unitsShort{
+ duration{
+ day{
+ }
+ day-person:alias{"/LOCALE/unitsShort/duration/day"}
+ }
+ length{
+ meter{
+ }
+ }
+ }
+ }
+
+ Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}
+ """
+
+ start_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
+ end_table_re = re.compile(r"^\}$")
+ table_entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")
+
+ # The current resource table.
+ table = {}
+
+ # List of parent tables when parsing.
+ parents = []
+
+ # Track multi-line comments state.
+ in_multiline_comment = False
+
+ for line in flines(filepath, "utf-8-sig"):
+ # Remove leading and trailing whitespace.
+ line = line.strip()
+
+ # Skip over comments.
+ if in_multiline_comment:
+ if line.endswith("*/"):
+ in_multiline_comment = False
+ continue
+
+ if line.startswith("//"):
+ continue
+
+ if line.startswith("/*"):
+ in_multiline_comment = True
+ continue
+
+ # Try to match the start of a table, e.g. `length{` or `meter{`.
+ match = start_table_re.match(line)
+ if match:
+ parents.append(table)
+ table_name = match.group(1)
+ new_table = {}
+ table[table_name] = new_table
+ table = new_table
+ continue
+
+ # Try to match the end of a table.
+ match = end_table_re.match(line)
+ if match:
+ table = parents.pop()
+ continue
+
+ # Try to match a table entry, e.g. `dnam{"meter"}`.
+ match = table_entry_re.match(line)
+ if match:
+ entry_key = match.group(1)
+ entry_value = match.group(2)
+ table[entry_key] = entry_value
+ continue
+
+ raise Exception("unexpected line: '{}' in {}".format(line, filepath))
+
+ assert len(parents) == 0, "Not all tables closed"
+ assert len(table) == 1, "More than one root table"
+
+ # Remove the top-level language identifier table.
+ (_, unit_table) = table.popitem()
+
+ # Add all units for the three display formats "units", "unitsNarrow", and "unitsShort".
+    # But exclude the pseudo-units "compound" and "coordinate".
+ return {
+ (unit_type, unit_name if not unit_name.endswith(":alias") else unit_name[:-6])
+ for unit_display in ("units", "unitsNarrow", "unitsShort")
+ if unit_display in unit_table
+ for (unit_type, unit_names) in unit_table[unit_display].items()
+ if unit_type != "compound" and unit_type != "coordinate"
+ for unit_name in unit_names.keys()
+ }
+
+
+def computeSupportedUnits(all_units, sanctioned_units):
+ """Given the set of all possible ICU unit identifiers and the set of sanctioned unit
+ identifiers, compute the set of effectively supported ICU unit identifiers.
+ """
+
+ def find_match(unit):
+ unit_match = [
+ (unit_type, unit_name)
+ for (unit_type, unit_name) in all_units
+ if unit_name == unit
+ ]
+ if unit_match:
+ assert len(unit_match) == 1
+ return unit_match[0]
+ return None
+
+ def compound_unit_identifiers():
+ for numerator in sanctioned_units:
+ for denominator in sanctioned_units:
+ yield "{}-per-{}".format(numerator, denominator)
+
+ supported_simple_units = {find_match(unit) for unit in sanctioned_units}
+ assert None not in supported_simple_units
+
+ supported_compound_units = {
+ unit_match
+ for unit_match in (find_match(unit) for unit in compound_unit_identifiers())
+ if unit_match
+ }
+
+ return supported_simple_units | supported_compound_units
+
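+# A small worked example (the sanctioned set is hypothetical): with
+# sanctioned_units = {"meter", "second"}, the candidate compound identifiers
+# are "meter-per-meter", "meter-per-second", "second-per-meter" and
+# "second-per-second". Only those which ICU actually lists in all_units
+# (e.g. ("speed", "meter-per-second")) are kept, alongside the simple units
+# ("length", "meter") and ("duration", "second").
+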
+
+def readICUDataFilterForUnits(data_filter_file):
+ with io.open(data_filter_file, mode="r", encoding="utf-8") as f:
+ data_filter = json.load(f)
+
+ # Find the rule set for the "unit_tree".
+ unit_tree_rules = [
+ entry["rules"]
+ for entry in data_filter["resourceFilters"]
+ if entry["categories"] == ["unit_tree"]
+ ]
+ assert len(unit_tree_rules) == 1
+
+ # Compute the list of included units from that rule set. The regular expression must match
+ # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound".
+ included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$")
+ filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0])
+
+ return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit}
+
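+# For illustration, the relevant part of data_filter.json is expected to look
+# roughly like this (the included paths besides length/meter are examples):
+#
+#   {
+#     "resourceFilters": [
+#       {
+#         "categories": ["unit_tree"],
+#         "rules": ["-/*", "+/*/length/meter"]
+#       }
+#     ]
+#   }
+#
+# for which readICUDataFilterForUnits() returns {("length", "meter")}.
+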
+
+def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units):
+ js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
+
+ def find_unit_type(unit):
+ result = [
+ unit_type for (unit_type, unit_name) in all_units if unit_name == unit
+ ]
+ assert result and len(result) == 1
+ return result[0]
+
+ sanctioned_js_file = os.path.join(
+ js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js"
+ )
+ with io.open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ sanctioned_units_object = json.dumps(
+ {unit: True for unit in sorted(sanctioned_units)},
+ sort_keys=True,
+ indent=4,
+ separators=(",", ": "),
+ )
+
+ println(generatedFileWarning)
+
+ println(
+ """
+/**
+ * The list of currently supported simple unit identifiers.
+ *
+ * Intl.NumberFormat Unified API Proposal
+ */"""
+ )
+
+ println(
+ "var sanctionedSimpleUnitIdentifiers = {};".format(sanctioned_units_object)
+ )
+
+ sanctioned_cpp_file = os.path.join(
+ js_src_builtin_intl_dir, "MeasureUnitGenerated.h"
+ )
+ with io.open(sanctioned_cpp_file, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println(generatedFileWarning)
+
+ println(
+ """
+struct MeasureUnit {
+ const char* const type;
+ const char* const name;
+};
+
+/**
+ * The list of currently supported simple unit identifiers.
+ *
+ * The list must be kept in alphabetical order of |name|.
+ */
+inline constexpr MeasureUnit simpleMeasureUnits[] = {
+ // clang-format off"""
+ )
+
+ for unit_name in sorted(sanctioned_units):
+ println(' {{"{}", "{}"}},'.format(find_unit_type(unit_name), unit_name))
+
+ println(
+ """
+ // clang-format on
+};""".lstrip(
+ "\n"
+ )
+ )
+
+ writeUnitTestFiles(all_units, sanctioned_units)
+
+
+def writeUnitTestFiles(all_units, sanctioned_units):
+ """ Generate test files for unit number formatters. """
+
+ js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
+ test_dir = os.path.join(
+ js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat"
+ )
+
+ def write_test(file_name, test_content, indent=4):
+ file_path = os.path.join(test_dir, file_name)
+ with io.open(file_path, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
+ println("")
+ println(generatedFileWarning)
+ println("")
+
+ sanctioned_units_array = json.dumps(
+ [unit for unit in sorted(sanctioned_units)],
+ indent=indent,
+ separators=(",", ": "),
+ )
+
+ println(
+ "const sanctionedSimpleUnitIdentifiers = {};".format(
+ sanctioned_units_array
+ )
+ )
+
+ println(test_content)
+
+ println(
+ """
+if (typeof reportCompare === "function")
+{}reportCompare(true, true);""".format(
+ " " * indent
+ )
+ )
+
+ write_test(
+ "unit-compound-combinations.js",
+ """
+// Test all simple unit identifier combinations are allowed.
+
+for (const numerator of sanctionedSimpleUnitIdentifiers) {
+ for (const denominator of sanctionedSimpleUnitIdentifiers) {
+ const unit = `${numerator}-per-${denominator}`;
+ const nf = new Intl.NumberFormat("en", {style: "unit", unit});
+
+ assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
+ }
+}""",
+ )
+
+ all_units_array = json.dumps(
+ ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ")
+ )
+
+ write_test(
+ "unit-well-formed.js",
+ """
+const allUnits = {};
+""".format(
+ all_units_array
+ )
+ + """
+// Test only sanctioned unit identifiers are allowed.
+
+for (const typeAndUnit of allUnits) {
+ const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/);
+
+ let allowed;
+ if (unit.includes("-per-")) {
+ const [numerator, denominator] = unit.split("-per-");
+ allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) &&
+ sanctionedSimpleUnitIdentifiers.includes(denominator);
+ } else {
+ allowed = sanctionedSimpleUnitIdentifiers.includes(unit);
+ }
+
+ if (allowed) {
+ const nf = new Intl.NumberFormat("en", {style: "unit", unit});
+ assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
+ } else {
+ assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}),
+ RangeError, `Missing error for "${typeAndUnit}"`);
+ }
+}""",
+ )
+
+ write_test(
+ "unit-formatToParts-has-unit-field.js",
+ """
+// Test only English and Chinese to keep the overall runtime reasonable.
+//
+// Chinese is included because it contains more than one "unit" element for
+// certain unit combinations.
+const locales = ["en", "zh"];
+
+// Plural rules for English only differentiate between "one" and "other". Plural
+// rules for Chinese only use "other". That means we only need to test two values
+// per unit.
+const values = [0, 1];
+
+// Ensure unit formatters contain at least one "unit" element.
+
+for (const locale of locales) {
+ for (const unit of sanctionedSimpleUnitIdentifiers) {
+ const nf = new Intl.NumberFormat(locale, {style: "unit", unit});
+
+ for (const value of values) {
+ assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
+ `locale=${locale}, unit=${unit}`);
+ }
+ }
+
+ for (const numerator of sanctionedSimpleUnitIdentifiers) {
+ for (const denominator of sanctionedSimpleUnitIdentifiers) {
+ const unit = `${numerator}-per-${denominator}`;
+ const nf = new Intl.NumberFormat(locale, {style: "unit", unit});
+
+ for (const value of values) {
+ assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
+ `locale=${locale}, unit=${unit}`);
+ }
+ }
+ }
+}""",
+ indent=2,
+ )
+
+
+def updateUnits(topsrcdir, args):
+ icu_path = os.path.join(topsrcdir, "intl", "icu")
+ icu_unit_path = os.path.join(icu_path, "source", "data", "unit")
+
+ with io.open(
+ "SanctionedSimpleUnitIdentifiers.yaml", mode="r", encoding="utf-8"
+ ) as f:
+ sanctioned_units = yaml.safe_load(f)
+
+ # Read all possible ICU unit identifiers from the "unit/root.txt" resource.
+ unit_root_file = os.path.join(icu_unit_path, "root.txt")
+ all_units = readICUUnitResourceFile(unit_root_file)
+
+ # Compute the set of effectively supported ICU unit identifiers.
+ supported_units = computeSupportedUnits(all_units, sanctioned_units)
+
+ # Read the list of units we're including into the ICU data file.
+ data_filter_file = os.path.join(icu_path, "data_filter.json")
+ filtered_units = readICUDataFilterForUnits(data_filter_file)
+
+ # Both sets must match to avoid resource loading errors at runtime.
+ if supported_units != filtered_units:
+
+ def units_to_string(units):
+ return ", ".join("/".join(u) for u in units)
+
+ missing = supported_units - filtered_units
+ if missing:
+ raise RuntimeError("Missing units: {}".format(units_to_string(missing)))
+
+ # Not exactly an error, but we currently don't have a use case where we need to support
+ # more units than required by ECMA-402.
+ extra = filtered_units - supported_units
+ if extra:
+ raise RuntimeError("Unnecessary units: {}".format(units_to_string(extra)))
+
+ writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units)
+
+
+def readICUNumberingSystemsResourceFile(filepath):
+ """Returns a dictionary of numbering systems where the key denotes the numbering system name
+ and the value a dictionary with additional numbering system data.
+
+ Example:
+
+ numberingSystems:table(nofallback){
+ numberingSystems{
+ latn{
+ algorithmic:int{0}
+ desc{"0123456789"}
+ radix:int{10}
+ }
+ roman{
+ algorithmic:int{1}
+ desc{"%roman-upper"}
+ radix:int{10}
+ }
+ }
+ }
+
+ Returns {"latn": {"digits": "0123456789", "algorithmic": False},
+ "roman": {"algorithmic": True}}
+ """
+
+ start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
+ end_table_re = re.compile(r"^\}$")
+ table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")
+
+ # The current resource table.
+ table = {}
+
+ # List of parent tables when parsing.
+ parents = []
+
+ # Track multi-line comments state.
+ in_multiline_comment = False
+
+ for line in flines(filepath, "utf-8-sig"):
+ # Remove leading and trailing whitespace.
+ line = line.strip()
+
+ # Skip over comments.
+ if in_multiline_comment:
+ if line.endswith("*/"):
+ in_multiline_comment = False
+ continue
+
+ if line.startswith("//"):
+ continue
+
+ if line.startswith("/*"):
+ in_multiline_comment = True
+ continue
+
+ # Try to match the start of a table, e.g. `latn{`.
+ match = start_table_re.match(line)
+ if match:
+ parents.append(table)
+ table_name = match.group(1)
+ new_table = {}
+ table[table_name] = new_table
+ table = new_table
+ continue
+
+ # Try to match the end of a table.
+ match = end_table_re.match(line)
+ if match:
+ table = parents.pop()
+ continue
+
+ # Try to match a table entry, e.g. `desc{"0123456789"}`.
+ match = table_entry_re.match(line)
+ if match:
+ entry_key = match.group(1)
+ entry_value = (
+ match.group(2) if match.group(2) is not None else int(match.group(3))
+ )
+ table[entry_key] = entry_value
+ continue
+
+ raise Exception("unexpected line: '{}' in {}".format(line, filepath))
+
+ assert len(parents) == 0, "Not all tables closed"
+ assert len(table) == 1, "More than one root table"
+
+ # Remove the two top-level "numberingSystems" tables.
+ (_, numbering_systems) = table.popitem()
+ (_, numbering_systems) = numbering_systems.popitem()
+
+ # Assert all numbering systems use base 10.
+ assert all(ns["radix"] == 10 for ns in numbering_systems.values())
+
+ # Return the numbering systems.
+ return {
+ key: {"digits": value["desc"], "algorithmic": False}
+ if not bool(value["algorithmic"])
+ else {"algorithmic": True}
+ for (key, value) in numbering_systems.items()
+ }
+
+
+def writeNumberingSystemFiles(numbering_systems):
+ js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
+
+ numbering_systems_js_file = os.path.join(
+ js_src_builtin_intl_dir, "NumberingSystemsGenerated.h"
+ )
+ with io.open(
+ numbering_systems_js_file, mode="w", encoding="utf-8", newline=""
+ ) as f:
+ println = partial(print, file=f)
+
+ println(generatedFileWarning)
+
+ println(
+ """
+/**
+ * The list of numbering systems with simple digit mappings.
+ */
+
+#ifndef builtin_intl_NumberingSystemsGenerated_h
+#define builtin_intl_NumberingSystemsGenerated_h
+"""
+ )
+
+ simple_numbering_systems = sorted(
+ name
+ for (name, value) in numbering_systems.items()
+ if not value["algorithmic"]
+ )
+
+ println("// clang-format off")
+ println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\")
+ println(
+ "{}".format(
+ ", \\\n".join(
+ ' "{}"'.format(name) for name in simple_numbering_systems
+ )
+ )
+ )
+ println("// clang-format on")
+ println("")
+
+ println("#endif // builtin_intl_NumberingSystemsGenerated_h")
+
+ js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
+ test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl")
+
+ intl_shell_js_file = os.path.join(test_dir, "shell.js")
+
+ with io.open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println(generatedFileWarning)
+
+ println(
+ """
+// source: CLDR file common/bcp47/number.xml; version CLDR {}.
+// https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml
+// https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml
+""".format(
+ readCLDRVersionFromICU()
+ ).rstrip()
+ )
+
+ numbering_systems_object = json.dumps(
+ numbering_systems,
+ indent=2,
+ separators=(",", ": "),
+ sort_keys=True,
+ ensure_ascii=False,
+ )
+ println("const numberingSystems = {};".format(numbering_systems_object))
+
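+# For illustration, the NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS macro
+# emitted above expands to a comma-separated list of string literals, one per
+# simple numbering system (the names shown are examples):
+#
+#   #define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \
+#     "arab", \
+#     "latn"
+#
+# so C++ code can splice the list directly into an array initializer.
+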
+
+def updateNumberingSystems(topsrcdir, args):
+ icu_path = os.path.join(topsrcdir, "intl", "icu")
+ icu_misc_path = os.path.join(icu_path, "source", "data", "misc")
+
+ with io.open("NumberingSystems.yaml", mode="r", encoding="utf-8") as f:
+ numbering_systems = yaml.safe_load(f)
+
+    # Read all possible ICU numbering systems from the "misc/numberingSystems.txt" resource.
+ misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt")
+ all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file)
+
+ all_numbering_systems_simple_digits = {
+ name
+ for (name, value) in all_numbering_systems.items()
+ if not value["algorithmic"]
+ }
+
+ # Assert ICU includes support for all required numbering systems. If this assertion fails,
+ # something is broken in ICU.
+ assert all_numbering_systems_simple_digits.issuperset(
+ numbering_systems
+ ), "{}".format(numbering_systems.difference(all_numbering_systems_simple_digits))
+
+ # Assert the spec requires support for all numbering systems with simple digit mappings. If
+ # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new
+ # numbering systems.
+ assert all_numbering_systems_simple_digits.issubset(numbering_systems), "{}".format(
+ all_numbering_systems_simple_digits.difference(numbering_systems)
+ )
+
+ writeNumberingSystemFiles(all_numbering_systems)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ # This script must reside in js/src/builtin/intl to work correctly.
+ (thisDir, thisFile) = os.path.split(os.path.abspath(sys.argv[0]))
+ dirPaths = os.path.normpath(thisDir).split(os.sep)
+ if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
+ raise RuntimeError("%s must reside in js/src/builtin/intl" % sys.argv[0])
+ topsrcdir = "/".join(dirPaths[:-4])
+
+ def EnsureHttps(v):
+ if not v.startswith("https:"):
+            raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
+ return v
+
+ parser = argparse.ArgumentParser(description="Update intl data.")
+ subparsers = parser.add_subparsers(help="Select update mode")
+
+ parser_cldr_tags = subparsers.add_parser(
+ "langtags", help="Update CLDR language tags data"
+ )
+ parser_cldr_tags.add_argument(
+ "--version", metavar="VERSION", help="CLDR version number"
+ )
+ parser_cldr_tags.add_argument(
+ "--url",
+ metavar="URL",
+ default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
+ type=EnsureHttps,
+ help="Download url CLDR data (default: %(default)s)",
+ )
+ parser_cldr_tags.add_argument(
+ "--out",
+ default="LanguageTagGenerated.cpp",
+ help="Output file (default: %(default)s)",
+ )
+ parser_cldr_tags.add_argument(
+ "file", nargs="?", help="Local cldr-core.zip file, if omitted uses <URL>"
+ )
+ parser_cldr_tags.set_defaults(func=updateCLDRLangTags)
+
+ parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
+ parser_tz.add_argument(
+ "--tz",
+ help="Local tzdata directory or file, if omitted downloads tzdata "
+ "distribution from https://www.iana.org/time-zones/",
+ )
+ # ICU doesn't include the backzone file by default, but we still like to
+ # use the backzone time zone names to avoid user confusion. This does lead
+ # to formatting "historic" dates (pre-1970 era) with the wrong time zone,
+ # but that's probably acceptable for now.
+ parser_tz.add_argument(
+ "--ignore-backzone",
+ action="store_true",
+ help="Ignore tzdata's 'backzone' file. Can be enabled to generate more "
+ "accurate time zone canonicalization reflecting the actual time "
+ "zones as used by ICU.",
+ )
+ parser_tz.add_argument(
+ "--out",
+ default="TimeZoneDataGenerated.h",
+ help="Output file (default: %(default)s)",
+ )
+ parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))
+
+ parser_currency = subparsers.add_parser(
+ "currency", help="Update currency digits mapping"
+ )
+ parser_currency.add_argument(
+ "--url",
+ metavar="URL",
+ default="https://www.currency-iso.org/dam/downloads/lists/list_one.xml", # NOQA: E501
+ type=EnsureHttps,
+ help="Download url for the currency & funds code list (default: "
+ "%(default)s)",
+ )
+ parser_currency.add_argument(
+ "--out",
+ default="CurrencyDataGenerated.js",
+ help="Output file (default: %(default)s)",
+ )
+ parser_currency.add_argument(
+ "file", nargs="?", help="Local currency code list file, if omitted uses <URL>"
+ )
+ parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir))
+
+ parser_units = subparsers.add_parser(
+ "units", help="Update sanctioned unit identifiers mapping"
+ )
+ parser_units.set_defaults(func=partial(updateUnits, topsrcdir))
+
+ parser_numbering_systems = subparsers.add_parser(
+ "numbering", help="Update numbering systems with simple " "digit mappings"
+ )
+ parser_numbering_systems.set_defaults(
+ func=partial(updateNumberingSystems, topsrcdir)
+ )
+
+ args = parser.parse_args()
+ args.func(args)