#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage:
    make_intl_data.py langtags [cldr_core.zip]
    make_intl_data.py tzdata
    make_intl_data.py currency
    make_intl_data.py units
    make_intl_data.py numbering


    Target "langtags":
    This script extracts information about 1) mappings between deprecated and
    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
    BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
    code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp.


    Target "tzdata":
    This script computes which time zone informations are not up-to-date in ICU
    and provides the necessary mappings to workaround this problem.
    https://ssl.icu-project.org/trac/ticket/12044


    Target "currency":
    Generates the mapping from currency codes to decimal digits used for them.


    Target "units":
    Generate source and test files using the list of so-called "sanctioned unit
    identifiers" and verifies that the ICU data filter includes these units.


    Target "numbering":
    Generate source and test files using the list of numbering systems with
    simple digit mappings and verifies that it's in sync with ICU/CLDR.
"""

from __future__ import print_function
import os
import re
import io
import json
import sys
import tarfile
import tempfile
import yaml

from contextlib import closing
from functools import partial, total_ordering
from itertools import chain, groupby, tee
from operator import attrgetter, itemgetter
from zipfile import ZipFile

# Python 2/3 compatibility: alias the iterator-returning builtins and the
# renamed urllib/itertools entry points so the rest of the script can use the
# Python 3 names unconditionally.
if sys.version_info.major == 2:
    from itertools import (
        ifilter as filter,
        ifilterfalse as filterfalse,
        imap as map,
        izip_longest as zip_longest,
    )
    from urllib2 import urlopen, Request as UrlRequest
    from urlparse import urlsplit
else:
    from itertools import filterfalse, zip_longest
    from urllib.request import urlopen, Request as UrlRequest
    from urllib.parse import urlsplit


# From https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def writeMappingHeader(println, description, source, url):
    """Write the standard comment header above a generated mapping.

    |description| is either a single string or a list of description lines;
    |source| and |url| identify the data the mapping was derived from.
    """
    if not isinstance(description, list):
        description = [description]
    for desc in description:
        println("// {0}".format(desc))
    println("// Derived from {0}.".format(source))
    println("// {0}".format(url))


def writeMappingsVar(println, mapping, name, description, source, url):
    """Writes a variable definition with a mapping table.

    Writes the contents of dictionary |mapping| through the |println|
    function with the given variable name and a comment with description,
    fileDate, and URL.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println("var {0} = {{".format(name))
    # dict keys are unique, so sorting the items sorts by key.
    for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
        println('    "{0}": "{1}",'.format(key, value))
    println("};")


def writeMappingsBinarySearch(
    println,
    fn_name,
    type_name,
    name,
    validate_fn,
    validate_case_fn,
    mappings,
    tag_maxlength,
    description,
    source,
    url,
):
    """Emit code to perform a binary search on language tag subtags.

    Uses the contents of |mapping|, which can either be a dictionary or set,
    to emit a mapping function to find subtag replacements.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
bool js::intl::LanguageTag::{0}({1} {2}) {{
  MOZ_ASSERT({3}({2}.span()));
  MOZ_ASSERT({4}({2}.span()));
""".format(
            fn_name, type_name, name, validate_fn, validate_case_fn
        ).strip()
    )

    def write_array(subtags, name, length, fixed):
        # Emit a fixed-width char array when |fixed|, otherwise an array of
        # char pointers.
        if fixed:
            println(
                "  static const char {}[{}][{}] = {{".format(
                    name, len(subtags), length + 1
                )
            )
        else:
            println("  static const char* {}[{}] = {{".format(name, len(subtags)))

        # Group in pairs of ten to not exceed the 80 line column limit.
        for entries in grouper(subtags, 10):
            entries = (
                '"{}"'.format(tag).rjust(length + 2)
                for tag in entries
                if tag is not None
            )
            println("    {},".format(", ".join(entries)))

        println("  };")

    trailing_return = True

    # Sort the subtags by length. That enables using an optimized comparator
    # for the binary search, which only performs a single |memcmp| for multiple
    # of two subtag lengths.
    mappings_keys = mappings.keys() if isinstance(mappings, dict) else mappings
    for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
        # Omit the length check if the current length is the maximum length.
        if length != tag_maxlength:
            println(
                """
  if ({}.length() == {}) {{
""".format(
                    name, length
                ).rstrip(
                    "\n"
                )
            )
        else:
            trailing_return = False
            println(
                """
  {
""".rstrip(
                    "\n"
                )
            )

        # The subtags need to be sorted for binary search to work.
        subtags = sorted(subtags)

        def equals(subtag):
            return """{}.equalTo("{}")""".format(name, subtag)

        # Don't emit a binary search for short lists.
        if len(subtags) == 1:
            if isinstance(mappings, dict):
                println(
                    """
    if ({}) {{
      {}.set("{}");
      return true;
    }}
    return false;
""".format(
                        equals(subtags[0]), name, mappings[subtags[0]]
                    ).strip(
                        "\n"
                    )
                )
            else:
                println(
                    """
    return {};
""".format(
                        equals(subtags[0])
                    ).strip(
                        "\n"
                    )
                )
        elif len(subtags) <= 4:
            if isinstance(mappings, dict):
                for subtag in subtags:
                    println(
                        """
    if ({}) {{
      {}.set("{}");
      return true;
    }}
""".format(
                            equals(subtag), name, mappings[subtag]
                        ).strip(
                            "\n"
                        )
                    )
                println(
                    """
    return false;
""".strip(
                        "\n"
                    )
                )
            else:
                cond = (equals(subtag) for subtag in subtags)
                cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
                println(
                    """
    return {};
""".format(
                        cond
                    ).strip(
                        "\n"
                    )
                )
        else:
            write_array(subtags, name + "s", length, True)

            if isinstance(mappings, dict):
                write_array([mappings[k] for k in subtags], "aliases", length, False)

                println(
                    """
    if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
      {0}.set(mozilla::MakeStringSpan(replacement));
      return true;
    }}
    return false;
""".format(
                        name
                    ).rstrip()
                )
            else:
                println(
                    """
    return HasReplacement({0}s, {0});
""".format(
                        name
                    ).rstrip()
                )

        println(
            """
  }
""".strip(
                "\n"
            )
        )

    # No trailing |return| when the maximum-length branch is an unconditional
    # block, because that block already ends in a |return|.
    if trailing_return:
        println(
            """
  return false;"""
        )

    println(
        """
}""".lstrip(
            "\n"
        )
    )
def writeComplexLanguageTagMappings(
    println, complex_language_mappings, description, source, url
):
    """Emit performComplexLanguageMappings(): language replacements that also
    carry an optional script and/or region replacement."""
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
void js::intl::LanguageTag::performComplexLanguageMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
""".lstrip()
    )

    # Merge duplicate language entries: every deprecated language after the
    # first (in sorted order) that maps to the same replacement is recorded as
    # an alias and folded into a single |if| condition below.
    language_aliases = {}
    for (deprecated_language, (language, script, region)) in sorted(
        complex_language_mappings.items(), key=itemgetter(0)
    ):
        key = (language, script, region)
        if key not in language_aliases:
            language_aliases[key] = []
        else:
            language_aliases[key].append(deprecated_language)

    first_language = True
    for (deprecated_language, (language, script, region)) in sorted(
        complex_language_mappings.items(), key=itemgetter(0)
    ):
        key = (language, script, region)
        # Skip entries already emitted as aliases of an earlier entry.
        if deprecated_language in language_aliases[key]:
            continue

        if_kind = "if" if first_language else "else if"
        first_language = False

        # Add a check for all languages which are mapped to the same
        # replacement.
        cond = (
            'language().equalTo("{}")'.format(lang)
            for lang in [deprecated_language] + language_aliases[key]
        )
        # Align continuation lines under the first condition.
        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)

        println(
            """
  {} ({}) {{""".format(
                if_kind, cond
            ).strip(
                "\n"
            )
        )

        println(
            """
    setLanguage("{}");""".format(
                language
            ).strip(
                "\n"
            )
        )

        # Script and region are only replaced when not already present.
        if script is not None:
            println(
                """
    if (script().missing()) {{
      setScript("{}");
    }}""".format(
                    script
                ).strip(
                    "\n"
                )
            )
        if region is not None:
            println(
                """
    if (region().missing()) {{
      setRegion("{}");
    }}""".format(
                    region
                ).strip(
                    "\n"
                )
            )
        println(
            """
  }""".strip(
                "\n"
            )
        )
    println(
        """
}
""".strip(
            "\n"
        )
    )


def writeComplexRegionTagMappings(
    println, complex_region_mappings, description, source, url
):
    """Emit performComplexRegionMappings(): region replacements whose result
    depends on the language (and possibly script) of the tag."""
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
void js::intl::LanguageTag::performComplexRegionMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
  MOZ_ASSERT(IsStructurallyValidRegionTag(region().span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span()));
""".lstrip()
    )

    # |non_default_replacements| is a list and hence not hashable. Convert it
    # to a string to get a proper hashable value.
    def hash_key(default, non_default_replacements):
        return (default, str(sorted(str(v) for v in non_default_replacements)))

    # Build the C++ condition for a (language, optional script) pair.
    # Loop-invariant, so defined once here rather than per region.
    def compare_tags(language, script):
        if script is None:
            return 'language().equalTo("{}")'.format(language)
        return '(language().equalTo("{}") && script().equalTo("{}"))'.format(
            language, script
        )

    # Merge duplicate region entries, mirroring the language-alias merging in
    # writeComplexLanguageTagMappings above.
    region_aliases = {}
    for (deprecated_region, (default, non_default_replacements)) in sorted(
        complex_region_mappings.items(), key=itemgetter(0)
    ):
        key = hash_key(default, non_default_replacements)
        if key not in region_aliases:
            region_aliases[key] = []
        else:
            region_aliases[key].append(deprecated_region)

    first_region = True
    for (deprecated_region, (default, non_default_replacements)) in sorted(
        complex_region_mappings.items(), key=itemgetter(0)
    ):
        key = hash_key(default, non_default_replacements)
        # Skip entries already emitted as aliases of an earlier entry.
        if deprecated_region in region_aliases[key]:
            continue

        if_kind = "if" if first_region else "else if"
        first_region = False

        # Add a check for all regions which are mapped to the same replacement.
        cond = (
            'region().equalTo("{}")'.format(region)
            for region in [deprecated_region] + region_aliases[key]
        )
        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)

        println(
            """
  {} ({}) {{""".format(
                if_kind, cond
            ).strip(
                "\n"
            )
        )

        # One inner branch per distinct replacement region, each guarded by
        # the languages/scripts that select it; the remaining languages fall
        # through to the default replacement.
        replacement_regions = sorted(
            {region for (_, _, region) in non_default_replacements}
        )

        first_case = True
        for replacement_region in replacement_regions:
            replacement_language_script = sorted(
                (language, script)
                for (language, script, region) in non_default_replacements
                if region == replacement_region
            )

            if_kind = "if" if first_case else "else if"
            first_case = False

            cond = (
                compare_tags(language, script)
                for (language, script) in replacement_language_script
            )
            cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)

            println(
                """
    {} ({}) {{
      setRegion("{}");
    }}""".format(
                    if_kind, cond, replacement_region
                )
                .rstrip()
                .strip("\n")
            )
        println(
            """
    else {{
      setRegion("{}");
    }}
  }}""".format(
                default
            )
            .rstrip()
            .strip("\n")
        )
    println(
        """
}
""".strip(
            "\n"
        )
    )


def writeVariantTagMappings(println, variant_mappings, description, source, url):
    """ Writes a function definition that maps variant subtags. """
    # NOTE(review): the source this was recovered from had all <...> spans
    # stripped; "template <typename T, typename U>" is restored from the use
    # of T and U in the signature. Verify the IsLessThan references below
    # against upstream in case they carried explicit template arguments too.
    println(
        """
static const char* ToCharPointer(const char* str) {
  return str;
}

static const char* ToCharPointer(const js::UniqueChars& str) {
  return str.get();
}

template <typename T, typename U>
static bool IsLessThan(const T& a, const U& b) {
  return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
}
"""
    )
    writeMappingHeader(println, description, source, url)
    println(
        """
bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) {
  // The variant subtags need to be sorted for binary search.
  MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
                            IsLessThan));

  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
    auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
                               IsLessThan);

    // Don't insert the replacement when already present.
    if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
      return true;
    }

    // Insert the preferred variant in sort order.
    auto preferred = DuplicateString(cx, variant);
    if (!preferred) {
      return false;
    }
    return !!variants_.insert(p, std::move(preferred));
  };

  for (size_t i = 0; i < variants_.length(); ) {
    auto& variant = variants_[i];
    MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get())));
""".lstrip()
    )

    first_variant = True

    # |replacement_type| tells whether the deprecated variant maps to a
    # language, a region, or another variant subtag.
    for (deprecated_variant, (replacement_type, replacement)) in sorted(
        variant_mappings.items(), key=itemgetter(0)
    ):
        if_kind = "if" if first_variant else "else if"
        first_variant = False

        println(
            """
    {} (strcmp(variant.get(), "{}") == 0) {{
      variants_.erase(variants_.begin() + i);
""".format(
                if_kind, deprecated_variant
            ).strip(
                "\n"
            )
        )

        if replacement_type == "language":
            println(
                """
      setLanguage("{}");
""".format(
                    replacement
                ).strip(
                    "\n"
                )
            )
        elif replacement_type == "region":
            println(
                """
      setRegion("{}");
""".format(
                    replacement
                ).strip(
                    "\n"
                )
            )
        else:
            assert replacement_type == "variant"
            println(
                """
      if (!insertVariantSortedIfNotPresent("{}")) {{
        return false;
      }}
""".format(
                    replacement
                ).strip(
                    "\n"
                )
            )
        println(
            """
    }
""".strip(
                "\n"
            )
        )

    println(
        """
    else {
      i++;
    }
  }
  return true;
}
""".strip(
            "\n"
        )
    )
writeGrandfatheredMappingsFunction( println, grandfathered_mappings, description, source, url ): """ Writes a function definition that maps grandfathered language tags. """ println("") writeMappingHeader(println, description, source, url) println( """\ bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { // We're mapping regular grandfathered tags to non-grandfathered form here. // Other tags remain unchanged. // // regular = "art-lojban" // / "cel-gaulish" // / "no-bok" // / "no-nyn" // / "zh-guoyu" // / "zh-hakka" // / "zh-min" // / "zh-min-nan" // / "zh-xiang" // // Therefore we can quickly exclude most tags by checking every // |unicode_locale_id| subcomponent for characteristics not shared by any of // the regular grandfathered (RG) tags: // // * Real-world |unicode_language_subtag|s are all two or three letters, // so don't waste time running a useless |language.length > 3| fast-path. // * No RG tag has a "script"-looking component. // * No RG tag has a "region"-looking component. // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag // that |unicode_locale_id| doesn't support.) // * No RG tag contains |extensions| or |pu_extensions|. if (script().present() || region().present() || variants().length() != 1 || extensions().length() != 0 || privateuse()) { return true; } MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variants()[0].get()))); auto variantEqualTo = [this](const char* variant) { return strcmp(variants()[0].get(), variant) == 0; };""" ) # From Unicode BCP 47 locale identifier . # # Doesn't allow any 'extensions' subtags. 
re_unicode_locale_id = re.compile( r""" ^ # unicode_language_id = unicode_language_subtag # unicode_language_subtag = alpha{2,3} | alpha{5,8} (?P[a-z]{2,3}|[a-z]{5,8}) # (sep unicode_script_subtag)? # unicode_script_subtag = alpha{4} (?:-(?P