summaryrefslogtreecommitdiffstats
path: root/gfx/harfbuzz/src/gen-tag-table.py
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree f435a8308119effd964b339f76abb83a57c29483 /gfx/harfbuzz/src/gen-tag-table.py
parent Initial commit. (diff)
download firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/harfbuzz/src/gen-tag-table.py')
-rwxr-xr-x gfx/harfbuzz/src/gen-tag-table.py 1216
1 files changed, 1216 insertions, 0 deletions
diff --git a/gfx/harfbuzz/src/gen-tag-table.py b/gfx/harfbuzz/src/gen-tag-table.py
new file mode 100755
index 0000000000..7e15c08c56
--- /dev/null
+++ b/gfx/harfbuzz/src/gen-tag-table.py
@@ -0,0 +1,1216 @@
+#!/usr/bin/env python3
+
+"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
+versa.
+
+It creates a ``const LangTag[]``, matching the tags from the OpenType
+languages system tag list to the language subtags of the BCP 47 language
+subtag registry, with some manual adjustments. The mappings are
+supplemented with macrolanguages' sublanguages and retired codes'
+replacements, according to BCP 47 and some manual additions where BCP 47
+omits a retired code entirely.
+
+Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
+intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
+back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
+multiple BCP 47 tags) are listed here, except when the alphabetically
+first BCP 47 tag happens to be the chosen disambiguated tag. In that
+case, the fallback behavior will choose the right tag anyway.
+
+usage: ./gen-tag-table.py languagetags language-subtag-registry
+
+Input files:
+* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
+* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+"""
+
+import collections
+import html
+from html.parser import HTMLParser
+import itertools
+import re
+import sys
+import unicodedata
+
+# Exactly two positional arguments are expected (see the usage line in the
+# module docstring); otherwise exit with the docstring as the message.
+if len (sys.argv) != 3:
+ sys.exit (__doc__)
+
+def expect (condition, message=None):
+    """Raise ``AssertionError`` unless ``condition`` is truthy.
+
+    Args:
+        condition: The value to check.
+        message: Optional argument for the raised ``AssertionError``.
+            If ``None``, the bare exception class is raised.
+
+    Raises:
+        AssertionError: If ``condition`` is falsy.
+    """
+    if condition:
+        return
+    if message is not None:
+        raise AssertionError (message)
+    raise AssertionError
+
+def write (s):
+    """Write ``s`` to standard output as UTF-8 bytes.
+
+    The text layer of ``sys.stdout`` is flushed before the encoded bytes
+    are written to its underlying binary buffer.
+
+    Args:
+        s (str): The text to write.
+    """
+    stdout = sys.stdout
+    stdout.flush ()
+    payload = s.encode ('utf-8')
+    stdout.buffer.write (payload)
+
+# The OpenType tag assigned to BCP 47 tags that look like an OpenType tag but
+# have no OpenType mapping; ``hb_tag`` renders it as ``HB_TAG_NONE``.
+DEFAULT_LANGUAGE_SYSTEM = ''
+
+# Map of three-letter language codes to their two-letter equivalents.
+# ``OpenTypeRegistryParser.handle_endtag`` uses it to normalize the codes
+# listed in the OpenType registry; codes without an entry are kept as-is.
+# from https://www-01.sil.org/iso639-3/iso-639-3.tab
+ISO_639_3_TO_1 = {
+ 'aar': 'aa',
+ 'abk': 'ab',
+ 'afr': 'af',
+ 'aka': 'ak',
+ 'amh': 'am',
+ 'ara': 'ar',
+ 'arg': 'an',
+ 'asm': 'as',
+ 'ava': 'av',
+ 'ave': 'ae',
+ 'aym': 'ay',
+ 'aze': 'az',
+ 'bak': 'ba',
+ 'bam': 'bm',
+ 'bel': 'be',
+ 'ben': 'bn',
+ 'bis': 'bi',
+ 'bod': 'bo',
+ 'bos': 'bs',
+ 'bre': 'br',
+ 'bul': 'bg',
+ 'cat': 'ca',
+ 'ces': 'cs',
+ 'cha': 'ch',
+ 'che': 'ce',
+ 'chu': 'cu',
+ 'chv': 'cv',
+ 'cor': 'kw',
+ 'cos': 'co',
+ 'cre': 'cr',
+ 'cym': 'cy',
+ 'dan': 'da',
+ 'deu': 'de',
+ 'div': 'dv',
+ 'dzo': 'dz',
+ 'ell': 'el',
+ 'eng': 'en',
+ 'epo': 'eo',
+ 'est': 'et',
+ 'eus': 'eu',
+ 'ewe': 'ee',
+ 'fao': 'fo',
+ 'fas': 'fa',
+ 'fij': 'fj',
+ 'fin': 'fi',
+ 'fra': 'fr',
+ 'fry': 'fy',
+ 'ful': 'ff',
+ 'gla': 'gd',
+ 'gle': 'ga',
+ 'glg': 'gl',
+ 'glv': 'gv',
+ 'grn': 'gn',
+ 'guj': 'gu',
+ 'hat': 'ht',
+ 'hau': 'ha',
+ 'hbs': 'sh',
+ 'heb': 'he',
+ 'her': 'hz',
+ 'hin': 'hi',
+ 'hmo': 'ho',
+ 'hrv': 'hr',
+ 'hun': 'hu',
+ 'hye': 'hy',
+ 'ibo': 'ig',
+ 'ido': 'io',
+ 'iii': 'ii',
+ 'iku': 'iu',
+ 'ile': 'ie',
+ 'ina': 'ia',
+ 'ind': 'id',
+ 'ipk': 'ik',
+ 'isl': 'is',
+ 'ita': 'it',
+ 'jav': 'jv',
+ 'jpn': 'ja',
+ 'kal': 'kl',
+ 'kan': 'kn',
+ 'kas': 'ks',
+ 'kat': 'ka',
+ 'kau': 'kr',
+ 'kaz': 'kk',
+ 'khm': 'km',
+ 'kik': 'ki',
+ 'kin': 'rw',
+ 'kir': 'ky',
+ 'kom': 'kv',
+ 'kon': 'kg',
+ 'kor': 'ko',
+ 'kua': 'kj',
+ 'kur': 'ku',
+ 'lao': 'lo',
+ 'lat': 'la',
+ 'lav': 'lv',
+ 'lim': 'li',
+ 'lin': 'ln',
+ 'lit': 'lt',
+ 'ltz': 'lb',
+ 'lub': 'lu',
+ 'lug': 'lg',
+ 'mah': 'mh',
+ 'mal': 'ml',
+ 'mar': 'mr',
+ 'mkd': 'mk',
+ 'mlg': 'mg',
+ 'mlt': 'mt',
+ 'mol': 'mo',
+ 'mon': 'mn',
+ 'mri': 'mi',
+ 'msa': 'ms',
+ 'mya': 'my',
+ 'nau': 'na',
+ 'nav': 'nv',
+ 'nbl': 'nr',
+ 'nde': 'nd',
+ 'ndo': 'ng',
+ 'nep': 'ne',
+ 'nld': 'nl',
+ 'nno': 'nn',
+ 'nob': 'nb',
+ 'nor': 'no',
+ 'nya': 'ny',
+ 'oci': 'oc',
+ 'oji': 'oj',
+ 'ori': 'or',
+ 'orm': 'om',
+ 'oss': 'os',
+ 'pan': 'pa',
+ 'pli': 'pi',
+ 'pol': 'pl',
+ 'por': 'pt',
+ 'pus': 'ps',
+ 'que': 'qu',
+ 'roh': 'rm',
+ 'ron': 'ro',
+ 'run': 'rn',
+ 'rus': 'ru',
+ 'sag': 'sg',
+ 'san': 'sa',
+ 'sin': 'si',
+ 'slk': 'sk',
+ 'slv': 'sl',
+ 'sme': 'se',
+ 'smo': 'sm',
+ 'sna': 'sn',
+ 'snd': 'sd',
+ 'som': 'so',
+ 'sot': 'st',
+ 'spa': 'es',
+ 'sqi': 'sq',
+ 'srd': 'sc',
+ 'srp': 'sr',
+ 'ssw': 'ss',
+ 'sun': 'su',
+ 'swa': 'sw',
+ 'swe': 'sv',
+ 'tah': 'ty',
+ 'tam': 'ta',
+ 'tat': 'tt',
+ 'tel': 'te',
+ 'tgk': 'tg',
+ 'tgl': 'tl',
+ 'tha': 'th',
+ 'tir': 'ti',
+ 'ton': 'to',
+ 'tsn': 'tn',
+ 'tso': 'ts',
+ 'tuk': 'tk',
+ 'tur': 'tr',
+ 'twi': 'tw',
+ 'uig': 'ug',
+ 'ukr': 'uk',
+ 'urd': 'ur',
+ 'uzb': 'uz',
+ 'ven': 've',
+ 'vie': 'vi',
+ 'vol': 'vo',
+ 'wln': 'wa',
+ 'wol': 'wo',
+ 'xho': 'xh',
+ 'yid': 'yi',
+ 'yor': 'yo',
+ 'zha': 'za',
+ 'zho': 'zh',
+ 'zul': 'zu',
+}
+
+class LanguageTag (object):
+ """A BCP 47 language tag, split into its subtags.
+
+ Attributes:
+     subtags (List[str]): The lowercased subtags of this tag, in order.
+     grandfathered (bool): Whether the lowercased tag is listed in
+         ``bcp_47.grandfathered``. If ``True``, the entire lowercased
+         tag is the ``language`` and the other subtag fields are empty
+         strings.
+     language (str): The language subtag.
+     script (Optional[str]): The script subtag, or ``None`` if the tag
+         has none.
+     region (Optional[str]): The region subtag, or ``None`` if the tag
+         has none.
+     variant (Optional[str]): The variant subtag, or ``None`` if the tag
+         has none.
+
+ Args:
+     tag (str): A BCP 47 language tag.
+
+ """
+ def __init__ (self, tag):
+     lowered = tag.lower ()
+     self.subtags = lowered.split ('-')
+     self.grandfathered = lowered in bcp_47.grandfathered
+     if self.grandfathered:
+         self.language = lowered
+         self.script = self.region = self.variant = ''
+         return
+
+     # Subtag kinds are told apart by length and by whether the first
+     # character sorts after '9' (i.e. is not a digit).
+     def is_script (s):
+         return len (s) == 4 and s[0] > '9'
+
+     def is_region (s):
+         return (len (s) == 2 and s[0] > '9') or (len (s) == 3 and s[0] <= '9')
+
+     def is_variant (s):
+         return len (s) > 4 or (len (s) == 4 and s[0] <= '9')
+
+     self.language = self.subtags[0]
+     self.script = self._find_first (is_script, self.subtags)
+     # The first subtag is the language, so it is never a region.
+     self.region = self._find_first (is_region, self.subtags[1:])
+     self.variant = self._find_first (is_variant, self.subtags)
+
+ def __str__(self):
+ """Return the subtags joined with ``'-'``."""
+ return '-'.join(self.subtags)
+
+ def __repr__ (self):
+     """Return ``LanguageTag(...)`` wrapping the repr of ``str (self)``."""
+     text = str (self)
+     return f'LanguageTag({text!r})'
+
+ @staticmethod
+ def _find_first (function, sequence):
+     """Return the first item of ``sequence`` accepted by ``function``.
+
+     Args:
+         function: A one-argument predicate.
+         sequence: An iterable of candidate items.
+
+     Returns:
+         The first item for which ``function`` returns a truthy value,
+         or ``None`` if there is no such item.
+     """
+     return next (filter (function, sequence), None)
+
+ def is_complex (self):
+     """Return whether this tag is too complex to represent as a
+     ``LangTag`` in the generated code.
+
+     Complex tags need to be handled in
+     ``hb_ot_tags_from_complex_language``.
+
+     Returns:
+         Whether this tag is complex.
+     """
+     # A lone subtag is always simple.
+     if len (self.subtags) == 1:
+         return False
+     # Of the multi-subtag tags, only grandfathered ones can be simple.
+     if not self.grandfathered:
+         return True
+     if len (self.subtags[1]) == 3:
+         return True
+     # Simple only if the first subtag alone maps to the same OpenType
+     # tags as the whole grandfathered tag.
+     return ot.from_bcp_47[self.subtags[0]] != ot.from_bcp_47[self.language]
+
+ def get_group (self):
+     """Return the group into which this tag should be categorized in
+     ``hb_ot_tags_from_complex_language``.
+
+     The group is the first letter of the tag, or ``'und'`` if this tag
+     should not be matched in a ``switch`` statement in the generated
+     code.
+
+     Returns:
+         This tag's group.
+     """
+     if self.language == 'und':
+         return 'und'
+     # A variant registered with exactly one prefix is also grouped
+     # under 'und'.
+     if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
+         return 'und'
+     return self.language[0]
+
+class OpenTypeRegistryParser (HTMLParser):
+ """An HTML parser for the OpenType language system tag registry.
+
+ Attributes:
+     header (str): The "last updated" line of the registry.
+     names (Mapping[str, str]): A map of language system tags to the
+         names they are given in the registry.
+     ranks (DefaultDict[str, int]): A map of language system tags to
+         numbers. If a single BCP 47 tag corresponds to multiple
+         OpenType tags, the tags are ordered in increasing order by
+         rank. The rank is based on the number of BCP 47 tags
+         associated with a tag, though it may be manually modified.
+     to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
+         OpenType language system tags to sets of BCP 47 tags.
+     from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
+         inverted. Its values start as unsorted sets;
+         ``sort_languages`` converts them to sorted lists.
+     from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
+         A copy of ``from_bcp_47``. It starts as ``None`` and is
+         populated at the beginning of the first call to
+         ``inherit_from_macrolanguages``.
+
+ """
+ def __init__ (self):
+     super ().__init__ ()
+     # Public registry data, filled in while parsing.
+     self.header = ''
+     self.names = {}
+     self.ranks = collections.defaultdict (int)
+     self.to_bcp_47 = collections.defaultdict (set)
+     self.from_bcp_47 = collections.defaultdict (set)
+     self.from_bcp_47_uninherited = None
+     # Parser state: whether we are inside a <td> element, ...
+     self._td = False
+     # ... whether a <br> element was seen within the current <tr> element, ...
+     self._br = False
+     # ... and the text of the <td> elements of the current <tr> element.
+     self._current_tr = []
+
+ def handle_starttag (self, tag, attrs):
+     """Update the parser state for an opening tag.
+
+     Args:
+         tag (str): The tag name.
+         attrs (List[Tuple[str, str]]): The tag's attributes.
+     """
+     if tag == 'br':
+         self._br = True
+     elif tag == 'td':
+         # Start collecting the text of a new cell.
+         self._td = True
+         self._current_tr.append ('')
+     elif tag == 'tr':
+         # Start a new row.
+         self._br = False
+         self._current_tr = []
+     elif tag == 'meta':
+         # The <meta name="updated_at" ...> tag serves as the header.
+         if any (attr == 'name' and value == 'updated_at' for attr, value in attrs):
+             self.header = self.get_starttag_text ()
+
+ def handle_endtag (self, tag):
+     """Update the parser state for a closing tag.
+
+     A closing ``</tr>`` with collected cells records one registry row:
+     the language name, the OpenType tag and, if present, the
+     comma-separated language codes.
+
+     Args:
+         tag (str): The tag name.
+
+     Raises:
+         AssertionError: If the row does not have two or three cells,
+             or if an over-long tag is not marked ``(deprecated)``.
+     """
+     if tag == 'td':
+         self._td = False
+         return
+     if tag != 'tr' or not self._current_tr:
+         return
+     cells = self._current_tr
+     expect (2 <= len (cells) <= 3)
+     name = cells[0].strip ()
+     ot_tag = cells[1].strip ("\t\n\v\f\r '")
+     rank = 0
+     if len (ot_tag) > 4:
+         expect (ot_tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % ot_tag)
+         name += ' (deprecated)'
+         ot_tag = ot_tag.split (' ')[0]
+         rank = 1
+     self.names[ot_tag] = re.sub (' languages$', '', name)
+     # Two-cell rows are accepted above, so the codes cell may be
+     # missing as well as empty; either way there is nothing to map.
+     if len (cells) < 3 or not cells[2]:
+         return
+     iso_codes = cells[2].strip ()
+     self.to_bcp_47[ot_tag].update (
+         ISO_639_3_TO_1.get (code, code)
+         for code in iso_codes.replace (' ', '').split (','))
+     rank += 2 * len (self.to_bcp_47[ot_tag])
+     self.ranks[ot_tag] = rank
+
+ def handle_data (self, data):
+     """Append ``data`` to the current cell's text.
+
+     Text outside a ``<td>`` element, or after a ``<br>`` element
+     within the current row, is ignored.
+
+     Args:
+         data (str): The text to append.
+     """
+     if not self._td or self._br:
+         return
+     self._current_tr[-1] += data
+
+ def handle_charref (self, name):
+ """Decode the character reference ``&#name;`` and handle it as text."""
+ self.handle_data (html.unescape ('&#%s;' % name))
+
+ def handle_entityref (self, name):
+ """Decode the entity reference ``&name;`` and handle it as text."""
+ self.handle_data (html.unescape ('&%s;' % name))
+
+ def parse (self, filename):
+     """Parse the OpenType language system tag registry.
+
+     After parsing, ``from_bcp_47`` is filled in by inverting
+     ``to_bcp_47``.
+
+     Args:
+         filename (str): The file name of the registry.
+
+     Raises:
+         AssertionError: If no header was found in the file.
+     """
+     with open (filename, encoding='utf-8') as registry:
+         contents = registry.read ()
+     self.feed (contents)
+     expect (self.header)
+     for ot_tag, codes in self.to_bcp_47.items ():
+         for code in codes:
+             self.from_bcp_47[code].add (ot_tag)
+
+ def add_language (self, bcp_47_tag, ot_tag):
+ """Add a language as if it were in the registry.
+
+ The mapping is recorded in both ``to_bcp_47`` and ``from_bcp_47``.
+
+ Args:
+ bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
+ a language subtag, and if the language subtag is a
+ macrolanguage, then new languages are added corresponding
+ to the macrolanguages' individual languages with the
+ remainder of the tag appended.
+ ot_tag (str): An OpenType language system tag.
+ """
+ # NOTE(review): indentation was flattened in this rendering of the
+ # patch, so statement nesting below cannot be read off column position.
+ global bcp_47
+ self.to_bcp_47[ot_tag].add (bcp_47_tag)
+ self.from_bcp_47[bcp_47_tag].add (ot_tag)
+ if bcp_47_tag.lower () not in bcp_47.grandfathered:
+ try:
+ # Raises ValueError (ignored below) when the tag contains no '-'.
+ [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
+ if macrolanguage in bcp_47.macrolanguages:
+ s = set ()
+ for language in bcp_47.macrolanguages[macrolanguage]:
+ if language.lower () not in bcp_47.grandfathered:
+ s.add ('%s-%s' % (language, suffix))
+ bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
+ except ValueError:
+ pass
+
+ @staticmethod
+ def _remove_language (tag_1, dict_1, dict_2):
+     """Remove ``tag_1`` from a pair of mutually inverse maps.
+
+     Args:
+         tag_1: The key to remove from ``dict_1``.
+         dict_1: A map whose values are collections of ``dict_2`` keys.
+         dict_2: The inverse map; ``tag_1`` is removed from the value of
+             every key it was mapped to, and keys left with an empty
+             value are deleted.
+
+     Raises:
+         KeyError: If ``tag_1`` is not a key of ``dict_1``.
+     """
+     partners = dict_1.pop (tag_1)
+     for partner in partners:
+         remaining = dict_2[partner]
+         remaining.remove (tag_1)
+         if not remaining:
+             del dict_2[partner]
+
+ def remove_language_ot (self, ot_tag):
+ """Remove an OpenType tag from the registry.
+
+ The tag is removed from ``to_bcp_47`` and from every value of
+ ``from_bcp_47``.
+
+ Args:
+ ot_tag (str): An OpenType tag.
+
+ Raises:
+ KeyError: If ``ot_tag`` is not a key of ``to_bcp_47``.
+ """
+ self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
+
+ def remove_language_bcp_47 (self, bcp_47_tag):
+ """Remove a BCP 47 tag from the registry.
+
+ The tag is removed from ``from_bcp_47`` and from every value of
+ ``to_bcp_47``.
+
+ Args:
+ bcp_47_tag (str): A BCP 47 tag.
+
+ Raises:
+ KeyError: If ``bcp_47_tag`` is not a key of ``from_bcp_47``.
+ """
+ self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
+
+ def inherit_from_macrolanguages (self):
+ """Copy mappings from macrolanguages to individual languages.
+
+ If a BCP 47 tag for an individual mapping has no OpenType
+ mapping but its macrolanguage does, the mapping is copied to
+ the individual language. For example, als (Tosk Albanian) has no
+ explicit mapping, so it inherits from sq (Albanian) the mapping
+ to SQI.
+
+ However, if an OpenType tag maps to a BCP 47 macrolanguage and
+ some but not all of its individual languages, the mapping is not
+ inherited from the macrolanguage to the missing individual
+ languages. For example, INUK (Nunavik Inuktitut) is mapped to
+ ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
+ ikt (Inuinnaqtun, which is an individual language of iu), so
+ this method does not add a mapping from ikt to INUK.
+
+ If a BCP 47 tag for a macrolanguage has no OpenType mapping but
+ some of its individual languages do, their mappings are copied
+ to the macrolanguage.
+ """
+ # NOTE(review): indentation was flattened in this rendering of the
+ # patch, so statement nesting below cannot be read off column position.
+ global bcp_47
+ first_time = self.from_bcp_47_uninherited is None
+ if first_time:
+ # Shallow copy: the set of keys is frozen here, but the value sets
+ # are the same objects as those in ``from_bcp_47``.
+ self.from_bcp_47_uninherited = dict (self.from_bcp_47)
+ # Iterate over a copy: ``add_language`` may insert new keys into
+ # ``bcp_47.macrolanguages``.
+ for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
+ ot_macrolanguages = {
+ ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
+ }
+ # OpenType tags that cover some, but not all, of the non-retired
+ # individual languages; see the INUK example in the docstring.
+ blocked_ot_macrolanguages = set ()
+ if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
+ for ot_macrolanguage in ot_macrolanguages:
+ round_trip_macrolanguages = {
+ l for l in self.to_bcp_47[ot_macrolanguage]
+ if 'retired code' not in bcp_47.scopes.get (l, '')
+ }
+ round_trip_languages = {
+ l for l in languages
+ if 'retired code' not in bcp_47.scopes.get (l, '')
+ }
+ intersection = round_trip_macrolanguages & round_trip_languages
+ if intersection and intersection != round_trip_languages:
+ blocked_ot_macrolanguages.add (ot_macrolanguage)
+ if ot_macrolanguages:
+ # Macrolanguage -> individual languages.
+ for ot_macrolanguage in ot_macrolanguages:
+ if ot_macrolanguage not in blocked_ot_macrolanguages:
+ for language in languages:
+ self.add_language (language, ot_macrolanguage)
+ if not blocked_ot_macrolanguages:
+ self.ranks[ot_macrolanguage] += 1
+ elif first_time:
+ # Individual languages -> macrolanguage (first call only).
+ for language in languages:
+ if language in self.from_bcp_47_uninherited:
+ ot_macrolanguages |= self.from_bcp_47_uninherited[language]
+ else:
+ ot_macrolanguages.clear ()
+ if not ot_macrolanguages:
+ break
+ for ot_macrolanguage in ot_macrolanguages:
+ self.add_language (macrolanguage, ot_macrolanguage)
+
+ def sort_languages (self):
+     """Replace each value of ``from_bcp_47`` with a sorted list.
+
+     OpenType tags are ordered by ascending rank, adjusted per BCP 47
+     tag by ``rank_delta``; ties are broken by the tag itself.
+     """
+     for language, tags in self.from_bcp_47.items ():
+         def sort_key (t):
+             return (self.ranks[t] + rank_delta (language, t), t)
+         self.from_bcp_47[language] = sorted (tags, key=sort_key)
+
+# The module-wide OpenType registry; ``LanguageTag`` and ``BCP47Parser``
+# refer to it through this global name.
+ot = OpenTypeRegistryParser ()
+
+class BCP47Parser (object):
+ """A parser for the BCP 47 subtag registry.
+
+ Attributes:
+ header (str): The "File-Date" line of the registry.
+ names (Mapping[str, str]): A map of subtags to the names they
+ are given in the registry. Each value is a
+ ``'\\n'``-separated list of names.
+ scopes (Mapping[str, str]): A map of language subtags to strings
+ suffixed to language names, including suffixes to explain
+ language scopes. Deprecated subtags get the suffix
+ ``' (retired code)'``.
+ macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
+ language subtags to the sets of language subtags which
+ inherit from them. See
+ ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
+ prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
+ subtags to their prefixes.
+ grandfathered (AbstractSet[str]): The set of grandfathered tags,
+ normalized to lowercase.
+
+ """
+ def __init__ (self):
+ # All attributes start empty; ``parse`` fills them in.
+ self.header = ''
+ self.names = {}
+ self.scopes = {}
+ self.macrolanguages = collections.defaultdict (set)
+ self.prefixes = collections.defaultdict (set)
+ self.grandfathered = set ()
+
+ def parse (self, filename):
+     """Parse the BCP 47 subtag registry.
+
+     Fills in ``header``, ``names``, ``scopes``, ``macrolanguages``,
+     ``prefixes`` and ``grandfathered`` from the records in the file.
+
+     Args:
+         filename (str): The file name of the registry.
+
+     Raises:
+         AssertionError: If no "File-Date" line was found.
+     """
+     # Raw string: ``\(`` in a plain literal is an invalid escape
+     # sequence, which newer Python versions warn about.
+     name_suffix = re.compile (r' (\(family\)|\((individual |macro)language\)|languages)$')
+     with open (filename, encoding='utf-8') as f:
+         subtag_type = None
+         subtag = None
+         deprecated = False
+         has_preferred_value = False
+         line_buffer = ''
+         # The extra empty line flushes the last buffered line.
+         for line in itertools.chain (f, ['']):
+             line = line.rstrip ()
+             if line.startswith (' '):
+                 # Continuation line: append it to the buffered line.
+                 line_buffer += line[1:]
+                 continue
+             # Process the previously buffered (now complete) line and
+             # buffer the one just read.
+             line, line_buffer = line_buffer, line
+             if line.startswith ('Type: '):
+                 subtag_type = line.split (' ')[1]
+                 deprecated = False
+                 has_preferred_value = False
+             elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
+                 subtag = line.split (' ')[1]
+                 if subtag_type == 'grandfathered':
+                     self.grandfathered.add (subtag.lower ())
+             elif line.startswith ('Description: '):
+                 description = line.split (' ', 1)[1].replace (' (individual language)', '')
+                 description = name_suffix.sub ('', description)
+                 if subtag in self.names:
+                     self.names[subtag] += '\n' + description
+                 else:
+                     self.names[subtag] = description
+             elif subtag_type == 'language' or subtag_type == 'grandfathered':
+                 if line.startswith ('Scope: '):
+                     scope = line.split (' ')[1]
+                     if scope == 'macrolanguage':
+                         scope = ' [macrolanguage]'
+                     elif scope == 'collection':
+                         scope = ' [collection]'
+                     else:
+                         continue
+                     self.scopes[subtag] = scope
+                 elif line.startswith ('Deprecated: '):
+                     self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
+                     deprecated = True
+                 elif deprecated and line.startswith ('Comments: see '):
+                     # If a subtag is split into multiple replacement subtags,
+                     # it essentially represents a macrolanguage.
+                     for language in line.replace (',', '').split (' ')[2:]:
+                         self._add_macrolanguage (subtag, language)
+                 elif line.startswith ('Preferred-Value: '):
+                     # If a subtag is deprecated in favor of a single replacement subtag,
+                     # it is either a dialect or synonym of the preferred subtag. Either
+                     # way, it is close enough to the truth to consider the replacement
+                     # the macrolanguage of the deprecated language.
+                     has_preferred_value = True
+                     macrolanguage = line.split (' ')[1]
+                     self._add_macrolanguage (macrolanguage, subtag)
+                 elif not has_preferred_value and line.startswith ('Macrolanguage: '):
+                     self._add_macrolanguage (line.split (' ')[1], subtag)
+             elif subtag_type == 'variant':
+                 if line.startswith ('Deprecated: '):
+                     self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
+                 elif line.startswith ('Prefix: '):
+                     self.prefixes[subtag].add (line.split (' ')[1])
+             elif line.startswith ('File-Date: '):
+                 self.header = line
+     expect (self.header)
+
+ def _add_macrolanguage (self, macrolanguage, language):
+ """Record ``language`` as inheriting from ``macrolanguage``.
+
+ Whether a BCP 47 tag already has an OpenType mapping
+ (``ot.from_bcp_47``) influences where the language is recorded.
+
+ Args:
+ macrolanguage (str): The subtag to inherit from.
+ language (str): The subtag that inherits.
+ """
+ # NOTE(review): indentation was flattened in this rendering of the
+ # patch, so statement nesting below cannot be read off column position.
+ global ot
+ if language not in ot.from_bcp_47:
+ for l in self.macrolanguages.get (language, set ()):
+ self._add_macrolanguage (macrolanguage, l)
+ if macrolanguage not in ot.from_bcp_47:
+ # Iterate over a snapshot of the current value sets.
+ for ls in list (self.macrolanguages.values ()):
+ if macrolanguage in ls:
+ ls.add (language)
+ return
+ self.macrolanguages[macrolanguage].add (language)
+
+ def remove_extra_macrolanguages (self):
+     """Make every language have at most one macrolanguage.
+
+     For a language listed under several macrolanguages, the one with
+     the most languages is kept as the parent, and the other
+     macrolanguages are added under it via ``_add_macrolanguage``.
+     """
+     owners = collections.defaultdict (list)
+     for macrolanguage, languages in self.macrolanguages.items ():
+         for language in languages:
+             owners[language].append (macrolanguage)
+     for candidates in owners.values ():
+         if len (candidates) < 2:
+             continue
+         # Ascending by size, so the biggest macrolanguage comes last.
+         ranked = sorted (candidates, key=lambda ml: len (self.macrolanguages[ml]))
+         biggest_macrolanguage = ranked[-1]
+         for macrolanguage in ranked[:-1]:
+             self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
+
+ def _get_name_piece (self, subtag):
+     """Return the first name of a subtag plus its scope suffix.
+
+     Args:
+         subtag (str): A BCP 47 subtag.
+
+     Returns:
+         The name form of ``subtag``.
+
+     Raises:
+         KeyError: If ``subtag`` has no entry in ``names``.
+     """
+     # ``names`` values are '\n'-separated; only the first name is used.
+     first_name = self.names[subtag].partition ('\n')[0]
+     suffix = self.scopes.get (subtag, '')
+     return first_name + suffix
+
+ def get_name (self, lt):
+     """Return the names of the subtags in a language tag.
+
+     The language name comes first, followed by the script, region and
+     variant names where present, joined with ``'; '``.
+
+     Args:
+         lt (LanguageTag): A BCP 47 language tag.
+
+     Returns:
+         The name form of ``lt``.
+     """
+     pieces = [self._get_name_piece (lt.language)]
+     # Script and region subtags are looked up in title case and upper
+     # case respectively.
+     if lt.script:
+         pieces.append (self._get_name_piece (lt.script.title ()))
+     if lt.region:
+         pieces.append (self._get_name_piece (lt.region.upper ()))
+     if lt.variant:
+         pieces.append (self._get_name_piece (lt.variant))
+     return '; '.join (pieces)
+
+# The module-wide BCP 47 registry; ``LanguageTag`` and
+# ``OpenTypeRegistryParser`` refer to it through this global name.
+bcp_47 = BCP47Parser ()
+
+# Parse both input files named on the command line.
+ot.parse (sys.argv[1])
+bcp_47.parse (sys.argv[2])
+
+# Manual adjustments to the parsed registries (see the module docstring).
+# They are applied in order; a ``remove_language_ot`` call drops the
+# registry's own mapping before a replacement is added.
+ot.add_language ('ary', 'MOR')
+
+ot.add_language ('ath', 'ATH')
+
+ot.add_language ('bai', 'BML')
+
+ot.ranks['BAL'] = ot.ranks['KAR'] + 1
+
+ot.add_language ('ber', 'BBR')
+
+ot.remove_language_ot ('PGR')
+ot.add_language ('el-polyton', 'PGR')
+
+bcp_47.macrolanguages['et'] = {'ekk'}
+
+bcp_47.names['flm'] = 'Falam Chin'
+bcp_47.scopes['flm'] = ' (retired code)'
+bcp_47.macrolanguages['flm'] = {'cfm'}
+
+ot.ranks['FNE'] = ot.ranks['TNE'] + 1
+
+ot.add_language ('und-fonipa', 'IPPH')
+
+ot.add_language ('und-fonnapa', 'APPH')
+
+ot.remove_language_ot ('IRT')
+ot.add_language ('ga-Latg', 'IRT')
+
+ot.add_language ('hy-arevmda', 'HYE')
+
+ot.remove_language_ot ('KGE')
+ot.add_language ('und-Geok', 'KGE')
+
+bcp_47.macrolanguages['id'] = {'in'}
+
+bcp_47.macrolanguages['ijo'] = {'ijc'}
+
+ot.add_language ('kht', 'KHN')
+ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
+ot.ranks['KHN'] = ot.ranks['KHT'] + 1
+
+ot.ranks['LCR'] = ot.ranks['MCR'] + 1
+
+ot.names['MAL'] = 'Malayalam Traditional'
+ot.ranks['MLR'] += 1
+
+bcp_47.names['mhv'] = 'Arakanese'
+bcp_47.scopes['mhv'] = ' (retired code)'
+
+ot.add_language ('mnw-TH', 'MONT')
+
+ot.add_language ('no', 'NOR')
+
+ot.add_language ('oc-provenc', 'PRO')
+
+ot.remove_language_ot ('QUZ')
+ot.add_language ('qu', 'QUZ')
+ot.add_language ('qub', 'QWH')
+ot.add_language ('qud', 'QVI')
+ot.add_language ('qug', 'QVI')
+ot.add_language ('qul', 'QUH')
+ot.add_language ('qup', 'QVI')
+ot.add_language ('qur', 'QWH')
+ot.add_language ('qus', 'QUH')
+ot.add_language ('quw', 'QVI')
+ot.add_language ('qux', 'QWH')
+ot.add_language ('qva', 'QWH')
+ot.add_language ('qvh', 'QWH')
+ot.add_language ('qvj', 'QVI')
+ot.add_language ('qvl', 'QWH')
+ot.add_language ('qvm', 'QWH')
+ot.add_language ('qvn', 'QWH')
+ot.add_language ('qvo', 'QVI')
+ot.add_language ('qvp', 'QWH')
+ot.add_language ('qvw', 'QWH')
+ot.add_language ('qvz', 'QVI')
+ot.add_language ('qwa', 'QWH')
+ot.add_language ('qws', 'QWH')
+ot.add_language ('qxa', 'QWH')
+ot.add_language ('qxc', 'QWH')
+ot.add_language ('qxh', 'QWH')
+ot.add_language ('qxl', 'QVI')
+ot.add_language ('qxn', 'QWH')
+ot.add_language ('qxo', 'QWH')
+ot.add_language ('qxr', 'QVI')
+ot.add_language ('qxt', 'QWH')
+ot.add_language ('qxw', 'QWH')
+
+bcp_47.macrolanguages['ro-MD'].add ('mo')
+
+ot.remove_language_ot ('SYRE')
+ot.remove_language_ot ('SYRJ')
+ot.remove_language_ot ('SYRN')
+ot.add_language ('und-Syre', 'SYRE')
+ot.add_language ('und-Syrj', 'SYRJ')
+ot.add_language ('und-Syrn', 'SYRN')
+
+bcp_47.names['xst'] = "Silt'e"
+bcp_47.scopes['xst'] = ' (retired code)'
+bcp_47.macrolanguages['xst'] = {'stv', 'wle'}
+
+ot.add_language ('xwo', 'TOD')
+
+# The Chinese tags are remapped by hand: the registry's own mappings are
+# dropped and replaced with script- and region-qualified BCP 47 tags.
+ot.remove_language_ot ('ZHH')
+ot.remove_language_ot ('ZHP')
+ot.remove_language_ot ('ZHT')
+ot.remove_language_ot ('ZHTM')
+bcp_47.macrolanguages['zh'].remove ('lzh')
+bcp_47.macrolanguages['zh'].remove ('yue')
+ot.add_language ('zh-Hant-MO', 'ZHH')
+ot.add_language ('zh-Hant-MO', 'ZHTM')
+ot.add_language ('zh-Hant-HK', 'ZHH')
+ot.add_language ('zh-Hans', 'ZHS')
+ot.add_language ('zh-Hant', 'ZHT')
+ot.add_language ('zh-HK', 'ZHH')
+ot.add_language ('zh-MO', 'ZHH')
+ot.add_language ('zh-MO', 'ZHTM')
+ot.add_language ('zh-TW', 'ZHT')
+ot.add_language ('lzh', 'ZHT')
+ot.add_language ('lzh-Hans', 'ZHS')
+ot.add_language ('yue', 'ZHH')
+ot.add_language ('yue-Hans', 'ZHS')
+
+bcp_47.macrolanguages['zom'] = {'yos'}
+
+def rank_delta (bcp_47, ot):
+    """Return a delta to apply to a BCP 47 tag's rank.
+
+    Most OpenType tags have a constant rank, but a few have ranks that
+    depend on the BCP 47 tag.
+
+    Args:
+        bcp_47 (str): A BCP 47 tag.
+        ot (str): An OpenType tag.
+
+    Returns:
+        A number to add to ``ot``'s rank when sorting ``bcp_47``'s
+        OpenType equivalents.
+    """
+    # Pairs whose OpenType tag is ranked one step earlier for that
+    # particular BCP 47 tag.
+    promoted = (('ak', 'AKA'), ('tw', 'TWI'))
+    return -1 if (bcp_47, ot) in promoted else 0
+
+# Map of ambiguous OpenType tags to the BCP 47 tag chosen for each (see
+# ``hb_ot_ambiguous_tag_to_language`` in the module docstring).
+disambiguation = {
+ 'ALT': 'alt',
+ 'ARK': 'rki',
+ 'ATH': 'ath',
+ 'BHI': 'bhb',
+ 'BLN': 'bjt',
+ 'BTI': 'beb',
+ 'CCHN': 'cco',
+ 'CMR': 'swb',
+ 'CPP': 'crp',
+ 'CRR': 'crx',
+ 'DUJ': 'dwu',
+ 'ECR': 'crj',
+ 'HAL': 'cfm',
+ 'HND': 'hnd',
+ 'HYE': 'hyw',
+ 'KIS': 'kqs',
+ 'KUI': 'uki',
+ 'LRC': 'bqi',
+ 'NDB': 'nd',
+ 'NIS': 'njz',
+ 'PLG': 'pce',
+ 'PRO': 'pro',
+ 'QIN': 'bgr',
+ 'QUH': 'quh',
+ 'QVI': 'qvi',
+ 'QWH': 'qwh',
+ 'SIG': 'stv',
+ 'SRB': 'sr',
+ 'SXT': 'xnj',
+ 'ZHH': 'zh-HK',
+ 'ZHS': 'zh-Hans',
+ 'ZHT': 'zh-Hant',
+ 'ZHTM': 'zh-MO',
+}
+
+# Inherit mappings, reduce every language to at most one macrolanguage, then
+# inherit again using the reduced macrolanguage sets.
+ot.inherit_from_macrolanguages ()
+bcp_47.remove_extra_macrolanguages ()
+ot.inherit_from_macrolanguages ()
+# Give the default language system a name and a rank above every other tag.
+ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
+ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
+# A three-letter OpenType tag whose lowercase form is a BCP 47 subtag with no
+# OpenType mapping is mapped to the default language system.
+# NOTE(review): indentation was flattened in this rendering of the patch, so
+# the nesting of the loop body cannot be read off column position.
+for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
+ possible_bcp_47_tag = tricky_ot_tag.lower ()
+ if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
+ ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
+ bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
+ot.sort_languages ()
+
+# Emit the banner comment and the include guard of the generated header.
+print ('/* == Start of generated table == */')
+print ('/*')
+print (' * The following table is generated by running:')
+print (' *')
+print (' * %s languagetags language-subtag-registry' % sys.argv[0])
+print (' *')
+print (' * on files with these headers:')
+print (' *')
+print (' * %s' % ot.header.strip ())
+print (' * %s' % bcp_47.header)
+print (' */')
+print ()
+print ('#ifndef HB_OT_TAG_TABLE_HH')
+print ('#define HB_OT_TAG_TABLE_HH')
+print ()
+
+def hb_tag (tag):
+    """Convert a tag to ``HB_TAG`` form.
+
+    Args:
+        tag (str): An OpenType tag.
+
+    Returns:
+        A snippet of C++ representing ``tag``: ``HB_TAG_NONE`` for the
+        default language system, otherwise an ``HB_TAG`` invocation on
+        the first four characters of the space-padded tag.
+    """
+    if tag == DEFAULT_LANGUAGE_SYSTEM:
+        return 'HB_TAG_NONE\t '
+    # Pad short tags with spaces on the right, then cut to four characters.
+    characters = tuple (tag.ljust (4)[:4])
+    return "HB_TAG('%s','%s','%s','%s')" % characters
+
+def get_variant_set (name):
+    """Return a set of variant language names from a name.
+
+    ``name`` is split on newlines, parentheses and commas. Each
+    non-empty piece has U+2019 replaced with an apostrophe, is
+    decomposed (NFD), reduced to its ASCII characters and stripped.
+
+    Args:
+        name (str): A list of language names from the BCP 47 registry,
+            joined on ``'\\n'``.
+
+    Returns:
+        A set of normalized language names, as ``bytes`` objects.
+    """
+    variants = set ()
+    for piece in re.split ('[\n(),]', name):
+        if not piece:
+            continue
+        decomposed = unicodedata.normalize ('NFD', piece.replace ('\u2019', "'"))
+        variants.add (decomposed.encode ('ASCII', 'ignore').strip ())
+    return variants
+
+def language_name_intersection (a, b):
+    """Return the names in common between two language names.
+
+    Args:
+        a (str): A list of language names from the BCP 47 registry,
+            joined on ``'\\n'``.
+        b (str): A list of language names from the BCP 47 registry,
+            joined on ``'\\n'``.
+
+    Returns:
+        The normalized language names shared by ``a`` and ``b``.
+    """
+    names_a = get_variant_set (a)
+    names_b = get_variant_set (b)
+    return names_a & names_b
+
+def get_matching_language_name (intersection, candidates):
+    """Return the first candidate that shares a name with ``intersection``.
+
+    Args:
+        intersection: A set of normalized names, as returned by
+            ``language_name_intersection``.
+        candidates: An iterable of language names.
+
+    Returns:
+        The first element of ``candidates`` whose variant set is not
+        disjoint from ``intersection``.
+
+    Raises:
+        StopIteration: If no candidate matches.
+    """
+    matches = (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))
+    return next (matches)
+
+def same_tag (bcp_47_tag, ot_tags):
+    """Return whether a BCP 47 tag maps only to its own uppercase form.
+
+    Args:
+        bcp_47_tag (str): A BCP 47 tag.
+        ot_tags (Sequence[str]): The OpenType tags it maps to.
+
+    Returns:
+        ``True`` if ``bcp_47_tag`` is three characters long and
+        ``ot_tags`` holds exactly one tag whose lowercase form equals it.
+    """
+    if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
+        return False
+    return bcp_47_tag == ot_tags[0].lower ()
+
+# Emit one ``LangTag`` table per language-subtag length. The three-letter
+# table is wrapped in ``#ifndef HB_NO_LANGUAGE_LONG``.
+# NOTE(review): indentation was flattened in this rendering of the patch, so
+# statement nesting below cannot be read off column position.
+for language_len in (2, 3):
+ if language_len == 3:
+ print ('#ifndef HB_NO_LANGUAGE_LONG')
+ print ('static const LangTag ot_languages%d[] = {' % language_len)
+ for language, tags in sorted (ot.from_bcp_47.items ()):
+ # Tags with more than one subtag are not emitted into these tables.
+ if language == '' or '-' in language:
+ continue
+ if len(language) != language_len: continue
+ # Entries that map a tag only to its own uppercase form are emitted
+ # commented out.
+ commented_out = same_tag (language, tags)
+ for i, tag in enumerate (tags, start=1):
+ print ('%s{%s,\t%s},' % ('/*' if commented_out else ' ', hb_tag (language), hb_tag (tag)), end='')
+ if commented_out:
+ print ('*/', end='')
+ print ('\t/* ', end='')
+ bcp_47_name = bcp_47.names.get (language, '')
+ bcp_47_name_candidates = bcp_47_name.split ('\n')
+ ot_name = ot.names[tag]
+ scope = bcp_47.scopes.get (language, '')
+ if tag == DEFAULT_LANGUAGE_SYSTEM:
+ write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
+ else:
+ intersection = language_name_intersection (bcp_47_name, ot_name)
+ if not intersection:
+ write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
+ else:
+ # The two registries share a name: print the longer of the two.
+ name = get_matching_language_name (intersection, bcp_47_name_candidates)
+ bcp_47.names[language] = name
+ write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
+ print (' */')
+ print ('};')
+ if language_len == 3:
+ print ('#endif')
+ print ()
+
+# Emit the documentation comment and the signature of
+# ``hb_ot_tags_from_complex_language``.
+print ('/**')
+print (' * hb_ot_tags_from_complex_language:')
+print (' * @lang_str: a BCP 47 language tag to convert.')
+print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
+print (' * conversion.')
+print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
+print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
+print (' * @tags: array of size at least @language_count to store the language tag')
+print (' * results')
+print (' *')
+print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
+print (' *')
+print (' * Return value: Whether any language systems were retrieved.')
+print (' **/')
+print ('static inline bool')
+print ('hb_ot_tags_from_complex_language (const char *lang_str,')
+print ('\t\t\t\t const char *limit,')
+print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
+print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
+print ('{')
+
+def print_subtag_matches (subtag, string, new_line):
+ """Print a ``subtag_matches (...)`` call for ``subtag``, if it is truthy.
+
+ Nothing is printed for a falsy ``subtag``. No trailing newline is
+ printed after the call.
+
+ Args:
+ subtag (Optional[str]): The subtag to match.
+ string (str): The C expression passed as the first argument.
+ new_line (bool): Whether to start a new output line first.
+ """
+ # NOTE(review): indentation was flattened in this rendering of the
+ # patch, so it cannot be read off column position whether the '&&'
+ # line belongs to the ``new_line`` branch.
+ if subtag:
+ if new_line:
+ print ()
+ print ('\t&& ', end='')
+ print ('subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag)), end='')
+
+# Group the complex BCP 47 tags by ``LanguageTag.get_group``. Tags are
+# visited longest first, then alphabetically, and each group keeps that
+# order.
+complex_tags = collections.defaultdict (list)
+for language, tags in sorted (ot.from_bcp_47.items (),
+                              key=lambda i: (-len (i[0]), i[0])):
+    lt = LanguageTag (language)
+    if lt.is_complex ():
+        complex_tags[lt.get_group ()].append ((lt, tags))
+
+# Calculate the min length of the subtags outside the switch
+min_subtag_len = 100
+for initial, items in sorted (complex_tags.items ()):
+    if initial != 'und':
+        continue
+    for lt, tags in items:
+        if not tags:
+            continue
+        # Each subtag that is not None counts its own length plus one for
+        # the '-' separator.
+        subtag_len = 0
+        for part in (lt.script, lt.region, lt.variant):
+            if part is not None:
+                subtag_len += 1 + len (part)
+        min_subtag_len = min (subtag_len, min_subtag_len)
+
+# Emit the matching code for the tags grouped under 'und', which are
+# matched on the part of the string after the first '-' rather than in
+# the ``switch`` on the first letter.
+print (' if (limit - lang_str >= %d)' % (min_subtag_len + 2))
+print (' {')
+print (" const char *p = strchr (lang_str, '-');")
+print (" if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len)
+for lt, tags in complex_tags.get ('und', ()):
+    if not tags:
+        continue
+    if lt.variant in bcp_47.prefixes:
+        expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
+                '%s is not a valid prefix of %s' % (lt.language, lt.variant))
+    print (' if (', end='')
+    print_subtag_matches (lt.script, 'p', False)
+    print_subtag_matches (lt.region, 'p', False)
+    print_subtag_matches (lt.variant, 'p', False)
+    print (')')
+    print (' {')
+    write (' /* %s */' % bcp_47.get_name (lt))
+    print ()
+    if len (tags) == 1:
+        write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
+        print ()
+        print (' *count = 1;')
+    else:
+        # Declare the loop index used by the emitted ``for`` loop, as the
+        # switch section of the generated function does.
+        print (' unsigned int i;')
+        print (' hb_tag_t possible_tags[] = {')
+        for tag in tags:
+            write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
+            print ()
+        print (' };')
+        print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
+        print ('\ttags[i] = possible_tags[i];')
+        print (' *count = i;')
+    print (' return true;')
+    print (' }')
+print (' }')
+print ('out:')
+
# Emit the switch on the first character of lang_str, with one `case` per
# group other than 'und' and one `if` per entry that has OpenType tags,
# then close the generated function.
print (' switch (lang_str[0])')
print (' {')
for initial, items in sorted (complex_tags.items ()):
    if initial == 'und':
        continue
    print (" case '%s':" % initial)
    for lt, tags in items:
        if not tags:
            continue
        # Subtags still to be tested separately with subtag_matches.
        pending_script = lt.script
        pending_region = lt.region
        if lt.grandfathered:
            # Grandfathered tags are compared as a whole string.
            condition = '0 == strcmp (&lang_str[1], "%s")' % lt.language[1:]
        else:
            # Fold the script, and the region when a script precedes it,
            # into the literal that is compared against the start of the
            # tag (the first character was consumed by the switch).
            literal = lt.language[1:] + '-'
            if pending_script:
                literal += pending_script
                pending_script = None
                if pending_region:
                    literal += '-' + pending_region
                    pending_region = None
            if literal.endswith ('-'):
                template = '0 == strncmp (&lang_str[1], "%s", %i)'
            else:
                template = 'lang_matches (&lang_str[1], limit, "%s", %i)'
            condition = template % (literal, len (literal))
        print (' if (' + condition, end='')
        for subtag in (pending_script, pending_region, lt.variant):
            print_subtag_matches (subtag, 'lang_str', True)
        print (')')
        print (' {')
        write (' /* %s */' % bcp_47.get_name (lt))
        print ()
        if len (tags) != 1:
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for tag in tags:
                write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
            print ('\ttags[i] = possible_tags[i];')
            print (' *count = i;')
        else:
            only_tag = tags[0]
            write (' tags[0] = %s; /* %s */' % (hb_tag (only_tag), ot.names[only_tag]))
            print ()
            print (' *count = 1;')
        print (' return true;')
        print (' }')
    print (' break;')

print (' }')
print (' return false;')
print ('}')
# Emit a blank line, then the documentation comment, signature and
# opening of the switch of hb_ot_ambiguous_tag_to_language.
print ()
print (
    '/**',
    ' * hb_ot_ambiguous_tag_to_language',
    ' * @tag: A language tag.',
    ' *',
    ' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
    ' * many language tags) and the best tag is not the alphabetically first, or if',
    ' * the best tag consists of multiple subtags, or if the best tag does not appear',
    ' * in #ot_languages.',
    ' *',
    ' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
    ' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
    ' **/',
    'static inline hb_language_t',
    'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
    '{',
    ' switch (tag)',
    ' {',
    sep='\n')
+
def verify_disambiguation_dict ():
    """Check ``disambiguation`` against the tag tables and normalize it.

    ``disambiguation`` maps ambiguous OpenType language system tags to
    the BCP 47 tag each one should convert back to. For every OpenType
    tag in ``ot.to_bcp_47`` the "primary" BCP 47 tags are collected:
    the non-grandfathered tags whose first OpenType tag is this one.

    * No primary tag: the OpenType tag must not be in ``disambiguation``.
    * One primary tag: the OpenType tag must not be in ``disambiguation``
      yet. An entry is added when the primary tag contains a hyphen or
      is not the alphabetically first candidate.
    * Several primary tags: a single best tag is searched for. If none
      is found, ``disambiguation`` must already hold a valid entry. If
      one is found and there is no entry yet, it is added. The entry is
      then removed again when it has no hyphen and equals the
      alphabetically first BCP 47 tag for which ``same_tag`` is false.

    Finally every key of ``disambiguation`` must be a known OpenType tag.

    Raises:
        AssertionError: Verification failed (reported through ``expect``).
    """
    def is_retired (tag):
        # True when the scope recorded for the tag marks a retired code.
        return 'retired code' in bcp_47.scopes.get (tag, '')

    for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
        if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
            primary_tags = []
        else:
            primary_tags = [t for t in bcp_47_tags
                            if t not in bcp_47.grandfathered
                            and ot.from_bcp_47.get (t)[0] == ot_tag]

        if not primary_tags:
            expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
            continue

        if len (primary_tags) == 1:
            only_tag = primary_tags[0]
            expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
            if '-' in only_tag:
                disambiguation[ot_tag] = only_tag
            else:
                # The candidates always include only_tag, so this is never empty.
                first_tag = min (t for t in bcp_47_tags
                                 if t not in bcp_47.grandfathered
                                 and ot_tag in ot.from_bcp_47.get (t))
                if only_tag != first_tag:
                    disambiguation[ot_tag] = only_tag
            continue

        # Several primary tags: try progressively weaker criteria until
        # exactly one candidate is left.
        candidates = [t for t in primary_tags
                      if t in ot.from_bcp_47_uninherited and not is_retired (t)]
        if len (candidates) != 1:
            candidates = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
        if len (candidates) != 1:
            candidates = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]']
        if len (candidates) != 1:
            candidates = [t for t in primary_tags if not is_retired (t)]

        if len (candidates) != 1:
            expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (candidates)))
            expect (disambiguation[ot_tag] in bcp_47_tags,
                    '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
        elif ot_tag not in disambiguation:
            disambiguation[ot_tag] = candidates[0]

        different_bcp_47_tags = sorted (t for t in bcp_47_tags
                                        if not same_tag (t, ot.from_bcp_47.get (t)))
        if different_bcp_47_tags:
            chosen = disambiguation[ot_tag]
            if chosen == different_bcp_47_tags[0] and '-' not in chosen:
                del disambiguation[ot_tag]

    for ot_tag in disambiguation:
        expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
+
# Validate and normalize the disambiguation map, then emit one `case` per
# remaining entry, in sorted order of the OpenType tag.
verify_disambiguation_dict ()
for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
    case_line = ' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])
    write (case_line)
    print ()
    language_name = bcp_47.get_name (LanguageTag (bcp_47_tag))
    write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, language_name))
    print ()

# Close the switch and the function, then end the generated header.
print (
    ' default:',
    ' return HB_LANGUAGE_INVALID;',
    ' }',
    '}',
    '',
    '#endif /* HB_OT_TAG_TABLE_HH */',
    '',
    '/* == End of generated table == */',
    sep='\n')
+