#!/usr/bin/env python # -*- coding: utf-8 -*- # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ Usage: make_intl_data.py langtags [cldr_common.zip] make_intl_data.py tzdata make_intl_data.py currency make_intl_data.py units make_intl_data.py numbering Target "langtags": This script extracts information about 1) mappings between deprecated and current Unicode BCP 47 locale identifiers, and 2) deprecated and current BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping code in intl/components/LocaleGenerated.cpp. The code is used in intl/components/Locale.cpp. Target "tzdata": This script computes which time zone informations are not up-to-date in ICU and provides the necessary mappings to workaround this problem. https://ssl.icu-project.org/trac/ticket/12044 Target "currency": Generates the mapping from currency codes to decimal digits used for them. Target "units": Generate source and test files using the list of so-called "sanctioned unit identifiers" and verifies that the ICU data filter includes these units. Target "numbering": Generate source and test files using the list of numbering systems with simple digit mappings and verifies that it's in sync with ICU/CLDR. """ import io import json import os import re import sys import tarfile import tempfile from contextlib import closing from functools import partial, total_ordering from itertools import chain, groupby, tee from operator import attrgetter, itemgetter from zipfile import ZipFile import yaml if sys.version_info.major == 2: from itertools import ifilter as filter from itertools import ifilterfalse as filterfalse from itertools import imap as map from itertools import izip_longest as zip_longest from urllib2 import Request as UrlRequest from urllib2 import urlopen from urlparse import urlsplit else: from itertools import filterfalse, zip_longest from urllib.parse import urlsplit from urllib.request import Request as UrlRequest from urllib.request import urlopen # From https://docs.python.org/3/library/itertools.html def grouper(iterable, n, fillvalue=None): "Collect data into fixed-length chunks or blocks" # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" args = [iter(iterable)] * n return zip_longest(*args, fillvalue=fillvalue) def writeMappingHeader(println, description, source, url): if type(description) is not list: description = [description] for desc in description: println("// {0}".format(desc)) println("// Derived from {0}.".format(source)) println("// {0}".format(url)) def writeMappingsVar(println, mapping, name, description, source, url): """Writes a variable definition with a mapping table. Writes the contents of dictionary |mapping| through the |println| function with the given variable name and a comment with description, fileDate, and URL. """ println("") writeMappingHeader(println, description, source, url) println("var {0} = {{".format(name)) for key, value in sorted(mapping.items(), key=itemgetter(0)): println(' "{0}": "{1}",'.format(key, value)) println("};") def writeMappingsBinarySearch( println, fn_name, type_name, name, validate_fn, validate_case_fn, mappings, tag_maxlength, description, source, url, ): """Emit code to perform a binary search on language tag subtags. Uses the contents of |mapping|, which can either be a dictionary or set, to emit a mapping function to find subtag replacements. """ println("") writeMappingHeader(println, description, source, url) println( """ bool mozilla::intl::Locale::{0}({1} {2}) {{ MOZ_ASSERT({3}({2}.Span())); MOZ_ASSERT({4}({2}.Span())); """.format( fn_name, type_name, name, validate_fn, validate_case_fn ).strip() ) writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength) println( """ }""".lstrip( "\n" ) ) def writeMappingsBinarySearchBody( println, source_name, target_name, mappings, tag_maxlength ): def write_array(subtags, name, length, fixed): if fixed: println( " static const char {}[{}][{}] = {{".format( name, len(subtags), length + 1 ) ) else: println(" static const char* {}[{}] = {{".format(name, len(subtags))) # Group in pairs of ten to not exceed the 80 line column limit. for entries in grouper(subtags, 10): entries = ( '"{}"'.format(tag).rjust(length + 2) for tag in entries if tag is not None ) println(" {},".format(", ".join(entries))) println(" };") trailing_return = True # Sort the subtags by length. That enables using an optimized comparator # for the binary search, which only performs a single |memcmp| for multiple # of two subtag lengths. mappings_keys = mappings.keys() if type(mappings) == dict else mappings for length, subtags in groupby(sorted(mappings_keys, key=len), len): # Omit the length check if the current length is the maximum length. if length != tag_maxlength: println( """ if ({}.Length() == {}) {{ """.format( source_name, length ).rstrip( "\n" ) ) else: trailing_return = False println( """ { """.rstrip( "\n" ) ) # The subtags need to be sorted for binary search to work. subtags = sorted(subtags) def equals(subtag): return """{}.EqualTo("{}")""".format(source_name, subtag) # Don't emit a binary search for short lists. if len(subtags) == 1: if type(mappings) == dict: println( """ if ({}) {{ {}.Set(mozilla::MakeStringSpan("{}")); return true; }} return false; """.format( equals(subtags[0]), target_name, mappings[subtags[0]] ).strip( "\n" ) ) else: println( """ return {}; """.format( equals(subtags[0]) ).strip( "\n" ) ) elif len(subtags) <= 4: if type(mappings) == dict: for subtag in subtags: println( """ if ({}) {{ {}.Set("{}"); return true; }} """.format( equals(subtag), target_name, mappings[subtag] ).strip( "\n" ) ) println( """ return false; """.strip( "\n" ) ) else: cond = (equals(subtag) for subtag in subtags) cond = (" ||\n" + " " * (4 + len("return "))).join(cond) println( """ return {}; """.format( cond ).strip( "\n" ) ) else: write_array(subtags, source_name + "s", length, True) if type(mappings) == dict: write_array([mappings[k] for k in subtags], "aliases", length, False) println( """ if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ {1}.Set(mozilla::MakeStringSpan(replacement)); return true; }} return false; """.format( source_name, target_name ).rstrip() ) else: println( """ return HasReplacement({0}s, {0}); """.format( source_name ).rstrip() ) println( """ } """.strip( "\n" ) ) if trailing_return: println( """ return false;""" ) def writeComplexLanguageTagMappings( println, complex_language_mappings, description, source, url ): println("") writeMappingHeader(println, description, source, url) println( """ void mozilla::intl::Locale::PerformComplexLanguageMappings() { MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); """.lstrip() ) # Merge duplicate language entries. language_aliases = {} for deprecated_language, (language, script, region) in sorted( complex_language_mappings.items(), key=itemgetter(0) ): key = (language, script, region) if key not in language_aliases: language_aliases[key] = [] else: language_aliases[key].append(deprecated_language) first_language = True for deprecated_language, (language, script, region) in sorted( complex_language_mappings.items(), key=itemgetter(0) ): key = (language, script, region) if deprecated_language in language_aliases[key]: continue if_kind = "if" if first_language else "else if" first_language = False cond = ( 'Language().EqualTo("{}")'.format(lang) for lang in [deprecated_language] + language_aliases[key] ) cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) println( """ {} ({}) {{""".format( if_kind, cond ).strip( "\n" ) ) println( """ SetLanguage("{}");""".format( language ).strip( "\n" ) ) if script is not None: println( """ if (Script().Missing()) {{ SetScript("{}"); }}""".format( script ).strip( "\n" ) ) if region is not None: println( """ if (Region().Missing()) {{ SetRegion("{}"); }}""".format( region ).strip( "\n" ) ) println( """ }""".strip( "\n" ) ) println( """ } """.strip( "\n" ) ) def writeComplexRegionTagMappings( println, complex_region_mappings, description, source, url ): println("") writeMappingHeader(println, description, source, url) println( """ void mozilla::intl::Locale::PerformComplexRegionMappings() { MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span())); MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span())); """.lstrip() ) # |non_default_replacements| is a list and hence not hashable. Convert it # to a string to get a proper hashable value. def hash_key(default, non_default_replacements): return (default, str(sorted(str(v) for v in non_default_replacements))) # Merge duplicate region entries. region_aliases = {} for deprecated_region, (default, non_default_replacements) in sorted( complex_region_mappings.items(), key=itemgetter(0) ): key = hash_key(default, non_default_replacements) if key not in region_aliases: region_aliases[key] = [] else: region_aliases[key].append(deprecated_region) first_region = True for deprecated_region, (default, non_default_replacements) in sorted( complex_region_mappings.items(), key=itemgetter(0) ): key = hash_key(default, non_default_replacements) if deprecated_region in region_aliases[key]: continue if_kind = "if" if first_region else "else if" first_region = False cond = ( 'Region().EqualTo("{}")'.format(region) for region in [deprecated_region] + region_aliases[key] ) cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) println( """ {} ({}) {{""".format( if_kind, cond ).strip( "\n" ) ) replacement_regions = sorted( {region for (_, _, region) in non_default_replacements} ) first_case = True for replacement_region in replacement_regions: replacement_language_script = sorted( (language, script) for (language, script, region) in (non_default_replacements) if region == replacement_region ) if_kind = "if" if first_case else "else if" first_case = False def compare_tags(language, script): if script is None: return 'Language().EqualTo("{}")'.format(language) return '(Language().EqualTo("{}") && Script().EqualTo("{}"))'.format( language, script ) cond = ( compare_tags(language, script) for (language, script) in replacement_language_script ) cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond) println( """ {} ({}) {{ SetRegion("{}"); }}""".format( if_kind, cond, replacement_region ) .rstrip() .strip("\n") ) println( """ else {{ SetRegion("{}"); }} }}""".format( default ) .rstrip() .strip("\n") ) println( """ } """.strip( "\n" ) ) def writeVariantTagMappings(println, variant_mappings, description, source, url): """Writes a function definition that maps variant subtags.""" println( """ static const char* ToCharPointer(const char* str) { return str; } static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) { return str.get(); } template static bool IsLessThan(const T& a, const U& b) { return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; } """ ) writeMappingHeader(println, description, source, url) println( """ bool mozilla::intl::Locale::PerformVariantMappings() { // The variant subtags need to be sorted for binary search. MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), IsLessThan)); auto removeVariantAt = [&](size_t index) { mVariants.erase(mVariants.begin() + index); }; auto insertVariantSortedIfNotPresent = [&](const char* variant) { auto* p = std::lower_bound( mVariants.begin(), mVariants.end(), variant, IsLessThan); // Don't insert the replacement when already present. if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { return true; } // Insert the preferred variant in sort order. auto preferred = DuplicateStringToUniqueChars(variant); return !!mVariants.insert(p, std::move(preferred)); }; for (size_t i = 0; i < mVariants.length();) { const char* variant = mVariants[i].get(); MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); """.lstrip() ) (no_alias, with_alias) = partition( variant_mappings.items(), lambda item: item[1] is None ) no_replacements = " ||\n ".join( f"""strcmp(variant, "{deprecated_variant}") == 0""" for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0)) ) println( f""" if ({no_replacements}) {{ removeVariantAt(i); }} """.strip( "\n" ) ) for deprecated_variant, (type, replacement) in sorted( with_alias, key=itemgetter(0) ): println( f""" else if (strcmp(variant, "{deprecated_variant}") == 0) {{ removeVariantAt(i); """.strip( "\n" ) ) if type == "language": println( f""" SetLanguage("{replacement}"); """.strip( "\n" ) ) elif type == "region": println( f""" SetRegion("{replacement}"); """.strip( "\n" ) ) else: assert type == "variant" println( f""" if (!insertVariantSortedIfNotPresent("{replacement}")) {{ return false; }} """.strip( "\n" ) ) println( """ } """.strip( "\n" ) ) println( """ else { i++; } } return true; } """.strip( "\n" ) ) def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url): """Writes a function definition that maps legacy language tags.""" println("") writeMappingHeader(println, description, source, url) println( """\ bool mozilla::intl::Locale::UpdateLegacyMappings() { // We're mapping legacy tags to non-legacy form here. // Other tags remain unchanged. // // Legacy tags are either sign language tags ("sgn") or have one or multiple // variant subtags. Therefore we can quickly exclude most tags by checking // these two subtags. MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); if (!Language().EqualTo("sgn") && mVariants.length() == 0) { return true; } #ifdef DEBUG for (const auto& variant : Variants()) { MOZ_ASSERT(IsStructurallyValidVariantTag(variant)); MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant)); } #endif // The variant subtags need to be sorted for binary search. MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), IsLessThan)); auto findVariant = [this](const char* variant) { auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, IsLessThan); if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { return p; } return static_cast(nullptr); }; auto insertVariantSortedIfNotPresent = [&](const char* variant) { auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, IsLessThan); // Don't insert the replacement when already present. if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { return true; } // Insert the preferred variant in sort order. auto preferred = DuplicateStringToUniqueChars(variant); return !!mVariants.insert(p, std::move(preferred)); }; auto removeVariant = [&](auto* p) { size_t index = std::distance(mVariants.begin(), p); mVariants.erase(mVariants.begin() + index); }; auto removeVariants = [&](auto* p, auto* q) { size_t pIndex = std::distance(mVariants.begin(), p); size_t qIndex = std::distance(mVariants.begin(), q); MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted"); mVariants.erase(mVariants.begin() + qIndex); mVariants.erase(mVariants.begin() + pIndex); };""" ) # Helper class for pattern matching. class AnyClass: def __eq__(self, obj): return obj is not None Any = AnyClass() # Group the mappings by language. legacy_mappings_by_language = {} for type, replacement in legacy_mappings.items(): (language, _, _, _) = type legacy_mappings_by_language.setdefault(language, {})[type] = replacement # Handle the empty language case first. if None in legacy_mappings_by_language: # Get the mappings and remove them from the dict. mappings = legacy_mappings_by_language.pop(None) # This case only applies for the "hepburn-heploc" -> "alalc97" # mapping, so just inline it here. from_tag = (None, None, None, "hepburn-heploc") to_tag = (None, None, None, "alalc97") assert len(mappings) == 1 assert mappings[from_tag] == to_tag println( """ if (mVariants.length() >= 2) { if (auto* hepburn = findVariant("hepburn")) { if (auto* heploc = findVariant("heploc")) { removeVariants(hepburn, heploc); if (!insertVariantSortedIfNotPresent("alalc97")) { return false; } } } } """ ) # Handle sign languages next. if "sgn" in legacy_mappings_by_language: mappings = legacy_mappings_by_language.pop("sgn") # Legacy sign language mappings have the form "sgn-XX" where "XX" is # some region code. assert all(type == ("sgn", None, Any, None) for type in mappings.keys()) # Legacy sign languages are mapped to a single language subtag. assert all( replacement == (Any, None, None, None) for replacement in mappings.values() ) println( """ if (Language().EqualTo("sgn")) { if (Region().Present() && SignLanguageMapping(mLanguage, Region())) { mRegion.Set(mozilla::MakeStringSpan("")); } } """.rstrip().lstrip( "\n" ) ) # Finally handle all remaining cases. # The remaining mappings have neither script nor region subtags in the source locale. assert all( type == (Any, None, None, Any) for mappings in legacy_mappings_by_language.values() for type in mappings.keys() ) # And they have neither script nor region nor variant subtags in the target locale. assert all( replacement == (Any, None, None, None) for mappings in legacy_mappings_by_language.values() for replacement in mappings.values() ) # Compact the mappings table by removing empty fields. legacy_mappings_by_language = { lang: { variants: r_language for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items() } for (lang, mappings) in legacy_mappings_by_language.items() } # Try to combine the remaining cases. legacy_mappings_compact = {} # Python can't hash dicts or lists, so use the string representation as the hash key. def hash_key(mappings): return str(sorted(mappings.items(), key=itemgetter(0))) for lang, mappings in sorted( legacy_mappings_by_language.items(), key=itemgetter(0) ): key = hash_key(mappings) legacy_mappings_compact.setdefault(key, []).append(lang) for langs in legacy_mappings_compact.values(): language_equal_to = ( f"""Language().EqualTo("{lang}")""" for lang in sorted(langs) ) cond = f""" ||\n{" " * len(" else if (")}""".join(language_equal_to) println( f""" else if ({cond}) {{ """.rstrip().lstrip( "\n" ) ) mappings = legacy_mappings_by_language[langs[0]] # Count the variant subtags to determine the sort order. def variant_size(m): (k, _) = m return len(k.split("-")) # Alias rules are applied by largest union size first. for size, mappings_by_size in groupby( sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size ): # Convert grouper object to dict. mappings_by_size = dict(mappings_by_size) is_first = True chain_if = size == 1 # Alias rules are applied in alphabetical order for variants, r_language in sorted( mappings_by_size.items(), key=itemgetter(0) ): sorted_variants = sorted(variants.split("-")) len_variants = len(sorted_variants) maybe_else = "else " if chain_if and not is_first else "" is_first = False for i, variant in enumerate(sorted_variants): println( f""" {" " * i}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{ """.rstrip().lstrip( "\n" ) ) indent = " " * len_variants println( f""" {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)}); {indent}SetLanguage("{r_language}"); {indent}{"return true;" if not chain_if else ""} """.rstrip().lstrip( "\n" ) ) for i in range(len_variants, 0, -1): println( f""" {" " * (i - 1)}}} """.rstrip().lstrip( "\n" ) ) println( """ } """.rstrip().lstrip( "\n" ) ) println( """ return true; }""" ) def writeSignLanguageMappingsFunction( println, legacy_mappings, description, source, url ): """Writes a function definition that maps legacy sign language tags.""" println("") writeMappingHeader(println, description, source, url) println( """\ bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language, const RegionSubtag& region) { MOZ_ASSERT(language.EqualTo("sgn")); MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span())); MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span())); """.rstrip() ) region_mappings = { rg: lg for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items() if lang == "sgn" } source_name = "region" target_name = "language" tag_maxlength = 3 writeMappingsBinarySearchBody( println, source_name, target_name, region_mappings, tag_maxlength ) println( """ }""".lstrip() ) def readSupplementalData(core_file): """Reads CLDR Supplemental Data and extracts information for Intl.js. Information extracted: - legacyMappings: mappings from legacy tags to preferred complete language tags - languageMappings: mappings from language subtags to preferred subtags - complexLanguageMappings: mappings from language subtags with complex rules - regionMappings: mappings from region subtags to preferred subtags - complexRegionMappings: mappings from region subtags with complex rules - variantMappings: mappings from variant subtags to preferred subtags - likelySubtags: likely subtags used for generating test data only Returns these mappings as dictionaries. """ import xml.etree.ElementTree as ET # From Unicode BCP 47 locale identifier . re_unicode_language_id = re.compile( r""" ^ # unicode_language_id = unicode_language_subtag # unicode_language_subtag = alpha{2,3} | alpha{5,8} (?P[a-z]{2,3}|[a-z]{5,8}) # (sep unicode_script_subtag)? # unicode_script_subtag = alpha{4} (?:-(?P