diff options
Diffstat (limited to '')
-rwxr-xr-x | gfx/harfbuzz/src/gen-tag-table.py | 1216 |
1 files changed, 1216 insertions, 0 deletions
diff --git a/gfx/harfbuzz/src/gen-tag-table.py b/gfx/harfbuzz/src/gen-tag-table.py new file mode 100755 index 0000000000..7e15c08c56 --- /dev/null +++ b/gfx/harfbuzz/src/gen-tag-table.py @@ -0,0 +1,1216 @@ +#!/usr/bin/env python3 + +"""Generator of the mapping from OpenType tags to BCP 47 tags and vice +versa. + +It creates a ``const LangTag[]``, matching the tags from the OpenType +languages system tag list to the language subtags of the BCP 47 language +subtag registry, with some manual adjustments. The mappings are +supplemented with macrolanguages' sublanguages and retired codes' +replacements, according to BCP 47 and some manual additions where BCP 47 +omits a retired code entirely. + +Also generated is a function, ``hb_ot_ambiguous_tag_to_language``, +intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags +back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to +multiple BCP 47 tags) are listed here, except when the alphabetically +first BCP 47 tag happens to be the chosen disambiguated tag. In that +case, the fallback behavior will choose the right tag anyway. + +usage: ./gen-tag-table.py languagetags language-subtag-registry + +Input files: +* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags +* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +""" + +import collections +import html +from html.parser import HTMLParser +import itertools +import re +import sys +import unicodedata + +if len (sys.argv) != 3: + sys.exit (__doc__) + +def expect (condition, message=None): + if not condition: + if message is None: + raise AssertionError + raise AssertionError (message) + +def write (s): + sys.stdout.flush () + sys.stdout.buffer.write (s.encode ('utf-8')) + +DEFAULT_LANGUAGE_SYSTEM = '' + +# from https://www-01.sil.org/iso639-3/iso-639-3.tab +ISO_639_3_TO_1 = { + 'aar': 'aa', + 'abk': 'ab', + 'afr': 'af', + 'aka': 'ak', + 'amh': 'am', + 'ara': 'ar', + 'arg': 'an', + 'asm': 'as', + 'ava': 'av', + 'ave': 'ae', + 'aym': 'ay', + 'aze': 'az', + 'bak': 'ba', + 'bam': 'bm', + 'bel': 'be', + 'ben': 'bn', + 'bis': 'bi', + 'bod': 'bo', + 'bos': 'bs', + 'bre': 'br', + 'bul': 'bg', + 'cat': 'ca', + 'ces': 'cs', + 'cha': 'ch', + 'che': 'ce', + 'chu': 'cu', + 'chv': 'cv', + 'cor': 'kw', + 'cos': 'co', + 'cre': 'cr', + 'cym': 'cy', + 'dan': 'da', + 'deu': 'de', + 'div': 'dv', + 'dzo': 'dz', + 'ell': 'el', + 'eng': 'en', + 'epo': 'eo', + 'est': 'et', + 'eus': 'eu', + 'ewe': 'ee', + 'fao': 'fo', + 'fas': 'fa', + 'fij': 'fj', + 'fin': 'fi', + 'fra': 'fr', + 'fry': 'fy', + 'ful': 'ff', + 'gla': 'gd', + 'gle': 'ga', + 'glg': 'gl', + 'glv': 'gv', + 'grn': 'gn', + 'guj': 'gu', + 'hat': 'ht', + 'hau': 'ha', + 'hbs': 'sh', + 'heb': 'he', + 'her': 'hz', + 'hin': 'hi', + 'hmo': 'ho', + 'hrv': 'hr', + 'hun': 'hu', + 'hye': 'hy', + 'ibo': 'ig', + 'ido': 'io', + 'iii': 'ii', + 'iku': 'iu', + 'ile': 'ie', + 'ina': 'ia', + 'ind': 'id', + 'ipk': 'ik', + 'isl': 'is', + 'ita': 'it', + 'jav': 'jv', + 'jpn': 'ja', + 'kal': 'kl', + 'kan': 'kn', + 'kas': 'ks', + 'kat': 'ka', + 'kau': 'kr', + 'kaz': 'kk', + 'khm': 'km', + 'kik': 'ki', + 'kin': 'rw', + 'kir': 'ky', + 'kom': 'kv', + 'kon': 'kg', + 'kor': 'ko', + 'kua': 'kj', + 'kur': 'ku', + 'lao': 'lo', + 'lat': 'la', + 'lav': 'lv', + 'lim': 'li', + 'lin': 'ln', + 'lit': 'lt', + 'ltz': 'lb', + 'lub': 'lu', + 'lug': 'lg', + 'mah': 'mh', + 'mal': 'ml', + 'mar': 'mr', + 'mkd': 'mk', + 'mlg': 'mg', + 'mlt': 'mt', + 'mol': 'mo', + 'mon': 'mn', + 'mri': 'mi', + 'msa': 'ms', + 'mya': 'my', + 'nau': 'na', + 'nav': 'nv', + 'nbl': 'nr', + 'nde': 'nd', + 'ndo': 'ng', + 'nep': 'ne', + 'nld': 'nl', + 'nno': 'nn', + 'nob': 'nb', + 'nor': 'no', + 'nya': 'ny', + 'oci': 'oc', + 'oji': 'oj', + 'ori': 'or', + 'orm': 'om', + 'oss': 'os', + 'pan': 'pa', + 'pli': 'pi', + 'pol': 'pl', + 'por': 'pt', + 'pus': 'ps', + 'que': 'qu', + 'roh': 'rm', + 'ron': 'ro', + 'run': 'rn', + 'rus': 'ru', + 'sag': 'sg', + 'san': 'sa', + 'sin': 'si', + 'slk': 'sk', + 'slv': 'sl', + 'sme': 'se', + 'smo': 'sm', + 'sna': 'sn', + 'snd': 'sd', + 'som': 'so', + 'sot': 'st', + 'spa': 'es', + 'sqi': 'sq', + 'srd': 'sc', + 'srp': 'sr', + 'ssw': 'ss', + 'sun': 'su', + 'swa': 'sw', + 'swe': 'sv', + 'tah': 'ty', + 'tam': 'ta', + 'tat': 'tt', + 'tel': 'te', + 'tgk': 'tg', + 'tgl': 'tl', + 'tha': 'th', + 'tir': 'ti', + 'ton': 'to', + 'tsn': 'tn', + 'tso': 'ts', + 'tuk': 'tk', + 'tur': 'tr', + 'twi': 'tw', + 'uig': 'ug', + 'ukr': 'uk', + 'urd': 'ur', + 'uzb': 'uz', + 'ven': 've', + 'vie': 'vi', + 'vol': 'vo', + 'wln': 'wa', + 'wol': 'wo', + 'xho': 'xh', + 'yid': 'yi', + 'yor': 'yo', + 'zha': 'za', + 'zho': 'zh', + 'zul': 'zu', +} + +class LanguageTag (object): + """A BCP 47 language tag. + + Attributes: + subtags (List[str]): The list of subtags in this tag. + grandfathered (bool): Whether this tag is grandfathered. If + ``true``, the entire lowercased tag is the ``language`` + and the other subtag fields are empty. + language (str): The language subtag. + script (str): The script subtag. + region (str): The region subtag. + variant (str): The variant subtag. + + Args: + tag (str): A BCP 47 language tag. + + """ + def __init__ (self, tag): + global bcp_47 + self.subtags = tag.lower ().split ('-') + self.grandfathered = tag.lower () in bcp_47.grandfathered + if self.grandfathered: + self.language = tag.lower () + self.script = '' + self.region = '' + self.variant = '' + else: + self.language = self.subtags[0] + self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags) + self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:]) + self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags) + + def __str__(self): + return '-'.join(self.subtags) + + def __repr__ (self): + return 'LanguageTag(%r)' % str(self) + + @staticmethod + def _find_first (function, sequence): + try: + return next (iter (filter (function, sequence))) + except StopIteration: + return None + + def is_complex (self): + """Return whether this tag is too complex to represent as a + ``LangTag`` in the generated code. + + Complex tags need to be handled in + ``hb_ot_tags_from_complex_language``. + + Returns: + Whether this tag is complex. + """ + return not (len (self.subtags) == 1 + or self.grandfathered + and len (self.subtags[1]) != 3 + and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language]) + + def get_group (self): + """Return the group into which this tag should be categorized in + ``hb_ot_tags_from_complex_language``. + + The group is the first letter of the tag, or ``'und'`` if this tag + should not be matched in a ``switch`` statement in the generated + code. + + Returns: + This tag's group. + """ + return ('und' + if (self.language == 'und' + or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1) + else self.language[0]) + +class OpenTypeRegistryParser (HTMLParser): + """A parser for the OpenType language system tag registry. + + Attributes: + header (str): The "last updated" line of the registry. + names (Mapping[str, str]): A map of language system tags to the + names they are given in the registry. + ranks (DefaultDict[str, int]): A map of language system tags to + numbers. If a single BCP 47 tag corresponds to multiple + OpenType tags, the tags are ordered in increasing order by + rank. The rank is based on the number of BCP 47 tags + associated with a tag, though it may be manually modified. + to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of + OpenType language system tags to sets of BCP 47 tags. + from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` + inverted. Its values start as unsorted sets; + ``sort_languages`` converts them to sorted lists. + from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]): + A copy of ``from_bcp_47``. It starts as ``None`` and is + populated at the beginning of the first call to + ``inherit_from_macrolanguages``. + + """ + def __init__ (self): + HTMLParser.__init__ (self) + self.header = '' + self.names = {} + self.ranks = collections.defaultdict (int) + self.to_bcp_47 = collections.defaultdict (set) + self.from_bcp_47 = collections.defaultdict (set) + self.from_bcp_47_uninherited = None + # Whether the parser is in a <td> element + self._td = False + # Whether the parser is after a <br> element within the current <tr> element + self._br = False + # The text of the <td> elements of the current <tr> element. + self._current_tr = [] + + def handle_starttag (self, tag, attrs): + if tag == 'br': + self._br = True + elif tag == 'meta': + for attr, value in attrs: + if attr == 'name' and value == 'updated_at': + self.header = self.get_starttag_text () + break + elif tag == 'td': + self._td = True + self._current_tr.append ('') + elif tag == 'tr': + self._br = False + self._current_tr = [] + + def handle_endtag (self, tag): + if tag == 'td': + self._td = False + elif tag == 'tr' and self._current_tr: + expect (2 <= len (self._current_tr) <= 3) + name = self._current_tr[0].strip () + tag = self._current_tr[1].strip ("\t\n\v\f\r '") + rank = 0 + if len (tag) > 4: + expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag) + name += ' (deprecated)' + tag = tag.split (' ')[0] + rank = 1 + self.names[tag] = re.sub (' languages$', '', name) + if not self._current_tr[2]: + return + iso_codes = self._current_tr[2].strip () + self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (',')) + rank += 2 * len (self.to_bcp_47[tag]) + self.ranks[tag] = rank + + def handle_data (self, data): + if self._td and not self._br: + self._current_tr[-1] += data + + def handle_charref (self, name): + self.handle_data (html.unescape ('&#%s;' % name)) + + def handle_entityref (self, name): + self.handle_data (html.unescape ('&%s;' % name)) + + def parse (self, filename): + """Parse the OpenType language system tag registry. + + Args: + filename (str): The file name of the registry. + """ + with open (filename, encoding='utf-8') as f: + self.feed (f.read ()) + expect (self.header) + for tag, iso_codes in self.to_bcp_47.items (): + for iso_code in iso_codes: + self.from_bcp_47[iso_code].add (tag) + + def add_language (self, bcp_47_tag, ot_tag): + """Add a language as if it were in the registry. + + Args: + bcp_47_tag (str): A BCP 47 tag. If the tag is more than just + a language subtag, and if the language subtag is a + macrolanguage, then new languages are added corresponding + to the macrolanguages' individual languages with the + remainder of the tag appended. + ot_tag (str): An OpenType language system tag. + """ + global bcp_47 + self.to_bcp_47[ot_tag].add (bcp_47_tag) + self.from_bcp_47[bcp_47_tag].add (ot_tag) + if bcp_47_tag.lower () not in bcp_47.grandfathered: + try: + [macrolanguage, suffix] = bcp_47_tag.split ('-', 1) + if macrolanguage in bcp_47.macrolanguages: + s = set () + for language in bcp_47.macrolanguages[macrolanguage]: + if language.lower () not in bcp_47.grandfathered: + s.add ('%s-%s' % (language, suffix)) + bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s + except ValueError: + pass + + @staticmethod + def _remove_language (tag_1, dict_1, dict_2): + for tag_2 in dict_1.pop (tag_1): + dict_2[tag_2].remove (tag_1) + if not dict_2[tag_2]: + del dict_2[tag_2] + + def remove_language_ot (self, ot_tag): + """Remove an OpenType tag from the registry. + + Args: + ot_tag (str): An OpenType tag. + """ + self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47) + + def remove_language_bcp_47 (self, bcp_47_tag): + """Remove a BCP 47 tag from the registry. + + Args: + bcp_47_tag (str): A BCP 47 tag. + """ + self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47) + + def inherit_from_macrolanguages (self): + """Copy mappings from macrolanguages to individual languages. + + If a BCP 47 tag for an individual mapping has no OpenType + mapping but its macrolanguage does, the mapping is copied to + the individual language. For example, als (Tosk Albanian) has no + explicit mapping, so it inherits from sq (Albanian) the mapping + to SQI. + + However, if an OpenType tag maps to a BCP 47 macrolanguage and + some but not all of its individual languages, the mapping is not + inherited from the macrolanguage to the missing individual + languages. For example, INUK (Nunavik Inuktitut) is mapped to + ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to + ikt (Inuinnaqtun, which is an individual language of iu), so + this method does not add a mapping from ikt to INUK. + + If a BCP 47 tag for a macrolanguage has no OpenType mapping but + some of its individual languages do, their mappings are copied + to the macrolanguage. + """ + global bcp_47 + first_time = self.from_bcp_47_uninherited is None + if first_time: + self.from_bcp_47_uninherited = dict (self.from_bcp_47) + for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): + ot_macrolanguages = { + ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ()) + } + blocked_ot_macrolanguages = set () + if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''): + for ot_macrolanguage in ot_macrolanguages: + round_trip_macrolanguages = { + l for l in self.to_bcp_47[ot_macrolanguage] + if 'retired code' not in bcp_47.scopes.get (l, '') + } + round_trip_languages = { + l for l in languages + if 'retired code' not in bcp_47.scopes.get (l, '') + } + intersection = round_trip_macrolanguages & round_trip_languages + if intersection and intersection != round_trip_languages: + blocked_ot_macrolanguages.add (ot_macrolanguage) + if ot_macrolanguages: + for ot_macrolanguage in ot_macrolanguages: + if ot_macrolanguage not in blocked_ot_macrolanguages: + for language in languages: + self.add_language (language, ot_macrolanguage) + if not blocked_ot_macrolanguages: + self.ranks[ot_macrolanguage] += 1 + elif first_time: + for language in languages: + if language in self.from_bcp_47_uninherited: + ot_macrolanguages |= self.from_bcp_47_uninherited[language] + else: + ot_macrolanguages.clear () + if not ot_macrolanguages: + break + for ot_macrolanguage in ot_macrolanguages: + self.add_language (macrolanguage, ot_macrolanguage) + + def sort_languages (self): + """Sort the values of ``from_bcp_47`` in ascending rank order.""" + for language, tags in self.from_bcp_47.items (): + self.from_bcp_47[language] = sorted (tags, + key=lambda t: (self.ranks[t] + rank_delta (language, t), t)) + +ot = OpenTypeRegistryParser () + +class BCP47Parser (object): + """A parser for the BCP 47 subtag registry. + + Attributes: + header (str): The "File-Date" line of the registry. + names (Mapping[str, str]): A map of subtags to the names they + are given in the registry. Each value is a + ``'\\n'``-separated list of names. + scopes (Mapping[str, str]): A map of language subtags to strings + suffixed to language names, including suffixes to explain + language scopes. + macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of + language subtags to the sets of language subtags which + inherit from them. See + ``OpenTypeRegistryParser.inherit_from_macrolanguages``. + prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant + subtags to their prefixes. + grandfathered (AbstractSet[str]): The set of grandfathered tags, + normalized to lowercase. + + """ + def __init__ (self): + self.header = '' + self.names = {} + self.scopes = {} + self.macrolanguages = collections.defaultdict (set) + self.prefixes = collections.defaultdict (set) + self.grandfathered = set () + + def parse (self, filename): + """Parse the BCP 47 subtag registry. + + Args: + filename (str): The file name of the registry. + """ + with open (filename, encoding='utf-8') as f: + subtag_type = None + subtag = None + deprecated = False + has_preferred_value = False + line_buffer = '' + for line in itertools.chain (f, ['']): + line = line.rstrip () + if line.startswith (' '): + line_buffer += line[1:] + continue + line, line_buffer = line_buffer, line + if line.startswith ('Type: '): + subtag_type = line.split (' ')[1] + deprecated = False + has_preferred_value = False + elif line.startswith ('Subtag: ') or line.startswith ('Tag: '): + subtag = line.split (' ')[1] + if subtag_type == 'grandfathered': + self.grandfathered.add (subtag.lower ()) + elif line.startswith ('Description: '): + description = line.split (' ', 1)[1].replace (' (individual language)', '') + description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '', + description) + if subtag in self.names: + self.names[subtag] += '\n' + description + else: + self.names[subtag] = description + elif subtag_type == 'language' or subtag_type == 'grandfathered': + if line.startswith ('Scope: '): + scope = line.split (' ')[1] + if scope == 'macrolanguage': + scope = ' [macrolanguage]' + elif scope == 'collection': + scope = ' [collection]' + else: + continue + self.scopes[subtag] = scope + elif line.startswith ('Deprecated: '): + self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') + deprecated = True + elif deprecated and line.startswith ('Comments: see '): + # If a subtag is split into multiple replacement subtags, + # it essentially represents a macrolanguage. + for language in line.replace (',', '').split (' ')[2:]: + self._add_macrolanguage (subtag, language) + elif line.startswith ('Preferred-Value: '): + # If a subtag is deprecated in favor of a single replacement subtag, + # it is either a dialect or synonym of the preferred subtag. Either + # way, it is close enough to the truth to consider the replacement + # the macrolanguage of the deprecated language. + has_preferred_value = True + macrolanguage = line.split (' ')[1] + self._add_macrolanguage (macrolanguage, subtag) + elif not has_preferred_value and line.startswith ('Macrolanguage: '): + self._add_macrolanguage (line.split (' ')[1], subtag) + elif subtag_type == 'variant': + if line.startswith ('Deprecated: '): + self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') + elif line.startswith ('Prefix: '): + self.prefixes[subtag].add (line.split (' ')[1]) + elif line.startswith ('File-Date: '): + self.header = line + expect (self.header) + + def _add_macrolanguage (self, macrolanguage, language): + global ot + if language not in ot.from_bcp_47: + for l in self.macrolanguages.get (language, set ()): + self._add_macrolanguage (macrolanguage, l) + if macrolanguage not in ot.from_bcp_47: + for ls in list (self.macrolanguages.values ()): + if macrolanguage in ls: + ls.add (language) + return + self.macrolanguages[macrolanguage].add (language) + + def remove_extra_macrolanguages (self): + """Make every language have at most one macrolanguage.""" + inverted = collections.defaultdict (list) + for macrolanguage, languages in self.macrolanguages.items (): + for language in languages: + inverted[language].append (macrolanguage) + for language, macrolanguages in inverted.items (): + if len (macrolanguages) > 1: + macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml])) + biggest_macrolanguage = macrolanguages.pop () + for macrolanguage in macrolanguages: + self._add_macrolanguage (biggest_macrolanguage, macrolanguage) + + def _get_name_piece (self, subtag): + """Return the first name of a subtag plus its scope suffix. + + Args: + subtag (str): A BCP 47 subtag. + + Returns: + The name form of ``subtag``. + """ + return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '') + + def get_name (self, lt): + """Return the names of the subtags in a language tag. + + Args: + lt (LanguageTag): A BCP 47 language tag. + + Returns: + The name form of ``lt``. + """ + name = self._get_name_piece (lt.language) + if lt.script: + name += '; ' + self._get_name_piece (lt.script.title ()) + if lt.region: + name += '; ' + self._get_name_piece (lt.region.upper ()) + if lt.variant: + name += '; ' + self._get_name_piece (lt.variant) + return name + +bcp_47 = BCP47Parser () + +ot.parse (sys.argv[1]) +bcp_47.parse (sys.argv[2]) + +ot.add_language ('ary', 'MOR') + +ot.add_language ('ath', 'ATH') + +ot.add_language ('bai', 'BML') + +ot.ranks['BAL'] = ot.ranks['KAR'] + 1 + +ot.add_language ('ber', 'BBR') + +ot.remove_language_ot ('PGR') +ot.add_language ('el-polyton', 'PGR') + +bcp_47.macrolanguages['et'] = {'ekk'} + +bcp_47.names['flm'] = 'Falam Chin' +bcp_47.scopes['flm'] = ' (retired code)' +bcp_47.macrolanguages['flm'] = {'cfm'} + +ot.ranks['FNE'] = ot.ranks['TNE'] + 1 + +ot.add_language ('und-fonipa', 'IPPH') + +ot.add_language ('und-fonnapa', 'APPH') + +ot.remove_language_ot ('IRT') +ot.add_language ('ga-Latg', 'IRT') + +ot.add_language ('hy-arevmda', 'HYE') + +ot.remove_language_ot ('KGE') +ot.add_language ('und-Geok', 'KGE') + +bcp_47.macrolanguages['id'] = {'in'} + +bcp_47.macrolanguages['ijo'] = {'ijc'} + +ot.add_language ('kht', 'KHN') +ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)' +ot.ranks['KHN'] = ot.ranks['KHT'] + 1 + +ot.ranks['LCR'] = ot.ranks['MCR'] + 1 + +ot.names['MAL'] = 'Malayalam Traditional' +ot.ranks['MLR'] += 1 + +bcp_47.names['mhv'] = 'Arakanese' +bcp_47.scopes['mhv'] = ' (retired code)' + +ot.add_language ('mnw-TH', 'MONT') + +ot.add_language ('no', 'NOR') + +ot.add_language ('oc-provenc', 'PRO') + +ot.remove_language_ot ('QUZ') +ot.add_language ('qu', 'QUZ') +ot.add_language ('qub', 'QWH') +ot.add_language ('qud', 'QVI') +ot.add_language ('qug', 'QVI') +ot.add_language ('qul', 'QUH') +ot.add_language ('qup', 'QVI') +ot.add_language ('qur', 'QWH') +ot.add_language ('qus', 'QUH') +ot.add_language ('quw', 'QVI') +ot.add_language ('qux', 'QWH') +ot.add_language ('qva', 'QWH') +ot.add_language ('qvh', 'QWH') +ot.add_language ('qvj', 'QVI') +ot.add_language ('qvl', 'QWH') +ot.add_language ('qvm', 'QWH') +ot.add_language ('qvn', 'QWH') +ot.add_language ('qvo', 'QVI') +ot.add_language ('qvp', 'QWH') +ot.add_language ('qvw', 'QWH') +ot.add_language ('qvz', 'QVI') +ot.add_language ('qwa', 'QWH') +ot.add_language ('qws', 'QWH') +ot.add_language ('qxa', 'QWH') +ot.add_language ('qxc', 'QWH') +ot.add_language ('qxh', 'QWH') +ot.add_language ('qxl', 'QVI') +ot.add_language ('qxn', 'QWH') +ot.add_language ('qxo', 'QWH') +ot.add_language ('qxr', 'QVI') +ot.add_language ('qxt', 'QWH') +ot.add_language ('qxw', 'QWH') + +bcp_47.macrolanguages['ro-MD'].add ('mo') + +ot.remove_language_ot ('SYRE') +ot.remove_language_ot ('SYRJ') +ot.remove_language_ot ('SYRN') +ot.add_language ('und-Syre', 'SYRE') +ot.add_language ('und-Syrj', 'SYRJ') +ot.add_language ('und-Syrn', 'SYRN') + +bcp_47.names['xst'] = "Silt'e" +bcp_47.scopes['xst'] = ' (retired code)' +bcp_47.macrolanguages['xst'] = {'stv', 'wle'} + +ot.add_language ('xwo', 'TOD') + +ot.remove_language_ot ('ZHH') +ot.remove_language_ot ('ZHP') +ot.remove_language_ot ('ZHT') +ot.remove_language_ot ('ZHTM') +bcp_47.macrolanguages['zh'].remove ('lzh') +bcp_47.macrolanguages['zh'].remove ('yue') +ot.add_language ('zh-Hant-MO', 'ZHH') +ot.add_language ('zh-Hant-MO', 'ZHTM') +ot.add_language ('zh-Hant-HK', 'ZHH') +ot.add_language ('zh-Hans', 'ZHS') +ot.add_language ('zh-Hant', 'ZHT') +ot.add_language ('zh-HK', 'ZHH') +ot.add_language ('zh-MO', 'ZHH') +ot.add_language ('zh-MO', 'ZHTM') +ot.add_language ('zh-TW', 'ZHT') +ot.add_language ('lzh', 'ZHT') +ot.add_language ('lzh-Hans', 'ZHS') +ot.add_language ('yue', 'ZHH') +ot.add_language ('yue-Hans', 'ZHS') + +bcp_47.macrolanguages['zom'] = {'yos'} + +def rank_delta (bcp_47, ot): + """Return a delta to apply to a BCP 47 tag's rank. + + Most OpenType tags have a constant rank, but a few have ranks that + depend on the BCP 47 tag. + + Args: + bcp_47 (str): A BCP 47 tag. + ot (str): An OpenType tag to. + + Returns: + A number to add to ``ot``'s rank when sorting ``bcp_47``'s + OpenType equivalents. + """ + if bcp_47 == 'ak' and ot == 'AKA': + return -1 + if bcp_47 == 'tw' and ot == 'TWI': + return -1 + return 0 + +disambiguation = { + 'ALT': 'alt', + 'ARK': 'rki', + 'ATH': 'ath', + 'BHI': 'bhb', + 'BLN': 'bjt', + 'BTI': 'beb', + 'CCHN': 'cco', + 'CMR': 'swb', + 'CPP': 'crp', + 'CRR': 'crx', + 'DUJ': 'dwu', + 'ECR': 'crj', + 'HAL': 'cfm', + 'HND': 'hnd', + 'HYE': 'hyw', + 'KIS': 'kqs', + 'KUI': 'uki', + 'LRC': 'bqi', + 'NDB': 'nd', + 'NIS': 'njz', + 'PLG': 'pce', + 'PRO': 'pro', + 'QIN': 'bgr', + 'QUH': 'quh', + 'QVI': 'qvi', + 'QWH': 'qwh', + 'SIG': 'stv', + 'SRB': 'sr', + 'SXT': 'xnj', + 'ZHH': 'zh-HK', + 'ZHS': 'zh-Hans', + 'ZHT': 'zh-Hant', + 'ZHTM': 'zh-MO', +} + +ot.inherit_from_macrolanguages () +bcp_47.remove_extra_macrolanguages () +ot.inherit_from_macrolanguages () +ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/' +ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1 +for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names): + possible_bcp_47_tag = tricky_ot_tag.lower () + if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]: + ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM) + bcp_47.macrolanguages[possible_bcp_47_tag] = set () +ot.sort_languages () + +print ('/* == Start of generated table == */') +print ('/*') +print (' * The following table is generated by running:') +print (' *') +print (' * %s languagetags language-subtag-registry' % sys.argv[0]) +print (' *') +print (' * on files with these headers:') +print (' *') +print (' * %s' % ot.header.strip ()) +print (' * %s' % bcp_47.header) +print (' */') +print () +print ('#ifndef HB_OT_TAG_TABLE_HH') +print ('#define HB_OT_TAG_TABLE_HH') +print () + +def hb_tag (tag): + """Convert a tag to ``HB_TAG`` form. + + Args: + tag (str): An OpenType tag. + + Returns: + A snippet of C++ representing ``tag``. + """ + if tag == DEFAULT_LANGUAGE_SYSTEM: + return 'HB_TAG_NONE\t ' + return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4]) + +def get_variant_set (name): + """Return a set of variant language names from a name. + + Args: + name (str): A list of language names from the BCP 47 registry, + joined on ``'\\n'``. + + Returns: + A set of normalized language names. + """ + return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'")) + .encode ('ASCII', 'ignore') + .strip () + for n in re.split ('[\n(),]', name) if n) + +def language_name_intersection (a, b): + """Return the names in common between two language names. + + Args: + a (str): A list of language names from the BCP 47 registry, + joined on ``'\\n'``. + b (str): A list of language names from the BCP 47 registry, + joined on ``'\\n'``. + + Returns: + The normalized language names shared by ``a`` and ``b``. + """ + return get_variant_set (a).intersection (get_variant_set (b)) + +def get_matching_language_name (intersection, candidates): + return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))) + +def same_tag (bcp_47_tag, ot_tags): + return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () + +for language_len in (2, 3): + if language_len == 3: + print ('#ifndef HB_NO_LANGUAGE_LONG') + print ('static const LangTag ot_languages%d[] = {' % language_len) + for language, tags in sorted (ot.from_bcp_47.items ()): + if language == '' or '-' in language: + continue + if len(language) != language_len: continue + commented_out = same_tag (language, tags) + for i, tag in enumerate (tags, start=1): + print ('%s{%s,\t%s},' % ('/*' if commented_out else ' ', hb_tag (language), hb_tag (tag)), end='') + if commented_out: + print ('*/', end='') + print ('\t/* ', end='') + bcp_47_name = bcp_47.names.get (language, '') + bcp_47_name_candidates = bcp_47_name.split ('\n') + ot_name = ot.names[tag] + scope = bcp_47.scopes.get (language, '') + if tag == DEFAULT_LANGUAGE_SYSTEM: + write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}') + else: + intersection = language_name_intersection (bcp_47_name, ot_name) + if not intersection: + write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name)) + else: + name = get_matching_language_name (intersection, bcp_47_name_candidates) + bcp_47.names[language] = name + write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) + print (' */') + print ('};') + if language_len == 3: + print ('#endif') + print () + +print ('/**') +print (' * hb_ot_tags_from_complex_language:') +print (' * @lang_str: a BCP 47 language tag to convert.') +print (' * @limit: a pointer to the end of the substring of @lang_str to consider for') +print (' * conversion.') +print (' * @count: maximum number of language tags to retrieve (IN) and actual number of') +print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.') +print (' * @tags: array of size at least @language_count to store the language tag') +print (' * results') +print (' *') +print (' * Converts a multi-subtag BCP 47 language tag to language tags.') +print (' *') +print (' * Return value: Whether any language systems were retrieved.') +print (' **/') +print ('static inline bool') +print ('hb_ot_tags_from_complex_language (const char *lang_str,') +print ('\t\t\t\t const char *limit,') +print ('\t\t\t\t unsigned int *count /* IN/OUT */,') +print ('\t\t\t\t hb_tag_t *tags /* OUT */)') +print ('{') + +def print_subtag_matches (subtag, string, new_line): + if subtag: + if new_line: + print () + print ('\t&& ', end='') + print ('subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag)), end='') + +complex_tags = collections.defaultdict (list) +for initial, group in itertools.groupby ((lt_tags for lt_tags in [ + (LanguageTag (language), tags) + for language, tags in sorted (ot.from_bcp_47.items (), + key=lambda i: (-len (i[0]), i[0])) + ] if lt_tags[0].is_complex ()), + key=lambda lt_tags: lt_tags[0].get_group ()): + complex_tags[initial] += group + +# Calculate the min length of the subtags outside the switch +min_subtag_len = 100 +for initial, items in sorted (complex_tags.items ()): + if initial != 'und': + continue + for lt, tags in items: + if not tags: + continue + subtag_len = 0 + subtag_len += 1 + len (lt.script) if lt.script is not None else 0 + subtag_len += 1 + len (lt.region) if lt.region is not None else 0 + subtag_len += 1 + len (lt.variant) if lt.variant is not None else 0 + min_subtag_len = min(subtag_len, min_subtag_len) + +print (' if (limit - lang_str >= %d)' % (min_subtag_len + 2)) +print (' {') +print (" const char *p = strchr (lang_str, '-');") +print (" if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len) +for initial, items in sorted (complex_tags.items ()): + if initial != 'und': + continue + for lt, tags in items: + if not tags: + continue + if lt.variant in bcp_47.prefixes: + expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, + '%s is not a valid prefix of %s' % (lt.language, lt.variant)) + print (' if (', end='') + print_subtag_matches (lt.script, 'p', False) + print_subtag_matches (lt.region, 'p', False) + print_subtag_matches (lt.variant, 'p', False) + print (')') + print (' {') + write (' /* %s */' % bcp_47.get_name (lt)) + print () + if len (tags) == 1: + write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) + print () + print (' *count = 1;') + else: + print (' hb_tag_t possible_tags[] = {') + for tag in tags: + write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag])) + print () + print (' };') + print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) + print ('\ttags[i] = possible_tags[i];') + print (' *count = i;') + print (' return true;') + print (' }') +print (' }') +print ('out:') + +print (' switch (lang_str[0])') +print (' {') +for initial, items in sorted (complex_tags.items ()): + if initial == 'und': + continue + print (" case '%s':" % initial) + for lt, tags in items: + if not tags: + continue + print (' if (', end='') + script = lt.script + region = lt.region + if lt.grandfathered: + print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='') + else: + string_literal = lt.language[1:] + '-' + if script: + string_literal += script + script = None + if region: + string_literal += '-' + region + region = None + if string_literal[-1] == '-': + print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='') + else: + print ('lang_matches (&lang_str[1], limit, "%s", %i)' % (string_literal, len (string_literal)), end='') + print_subtag_matches (script, 'lang_str', True) + print_subtag_matches (region, 'lang_str', True) + print_subtag_matches (lt.variant, 'lang_str', True) + print (')') + print (' {') + write (' /* %s */' % bcp_47.get_name (lt)) + print () + if len (tags) == 1: + write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) + print () + print (' *count = 1;') + else: + print (' unsigned int i;') + print (' hb_tag_t possible_tags[] = {') + for tag in tags: + write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag])) + print () + print (' };') + print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) + print ('\ttags[i] = possible_tags[i];') + print (' *count = i;') + print (' return true;') + print (' }') + print (' break;') + +print (' }') +print (' return false;') +print ('}') +print () +print ('/**') +print (' * hb_ot_ambiguous_tag_to_language') +print (' * @tag: A language tag.') +print (' *') +print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to') +print (' * many language tags) and the best tag is not the alphabetically first, or if') +print (' * the best tag consists of multiple subtags, or if the best tag does not appear') +print (' * in #ot_languages.') +print (' *') +print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,') +print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.') +print (' **/') +print ('static inline hb_language_t') +print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)') +print ('{') +print (' switch (tag)') +print (' {') + +def verify_disambiguation_dict (): + """Verify and normalize ``disambiguation``. + + ``disambiguation`` is a map of ambiguous OpenType language system + tags to the particular BCP 47 tags they correspond to. This function + checks that all its keys really are ambiguous and that each key's + value is valid for that key. It checks that no ambiguous tag is + missing, except when it can figure out which BCP 47 tag is the best + by itself. + + It modifies ``disambiguation`` to remove keys whose values are the + same as those that the fallback would return anyway, and to add + ambiguous keys whose disambiguations it determined automatically. + + Raises: + AssertionError: Verification failed. + """ + global bcp_47 + global disambiguation + global ot + for ot_tag, bcp_47_tags in ot.to_bcp_47.items (): + if ot_tag == DEFAULT_LANGUAGE_SYSTEM: + primary_tags = [] + else: + primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag) + if len (primary_tags) == 1: + expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag) + if '-' in primary_tags[0]: + disambiguation[ot_tag] = primary_tags[0] + else: + first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0] + if primary_tags[0] != first_tag: + disambiguation[ot_tag] = primary_tags[0] + elif len (primary_tags) == 0: + expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) + else: + original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] + if len (original_languages) == 1: + macrolanguages = original_languages + else: + macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] + if len (macrolanguages) != 1: + macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]') + if len (macrolanguages) != 1: + macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')) + if len (macrolanguages) != 1: + expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages))) + expect (disambiguation[ot_tag] in bcp_47_tags, + '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag)) + elif ot_tag not in disambiguation: + disambiguation[ot_tag] = macrolanguages[0] + different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t))) + if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]: + del disambiguation[ot_tag] + for ot_tag in disambiguation.keys (): + expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag) + +verify_disambiguation_dict () +for ot_tag, bcp_47_tag in sorted (disambiguation.items ()): + write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])) + print () + write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag)))) + print () + +print (' default:') +print (' return HB_LANGUAGE_INVALID;') +print (' }') +print ('}') + +print () +print ('#endif /* HB_OT_TAG_TABLE_HH */') +print () +print ('/* == End of generated table == */') + |