diff options
Diffstat (limited to '')
-rwxr-xr-x | gfx/harfbuzz/src/gen-indic-table.py | 699 |
1 files changed, 699 insertions, 0 deletions
diff --git a/gfx/harfbuzz/src/gen-indic-table.py b/gfx/harfbuzz/src/gen-indic-table.py new file mode 100755 index 0000000000..4ef970265d --- /dev/null +++ b/gfx/harfbuzz/src/gen-indic-table.py @@ -0,0 +1,699 @@ +#!/usr/bin/env python3 + +"""usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt + +Input files: +* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt +* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt +* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt +""" + +import sys + +if len (sys.argv) != 4: + sys.exit (__doc__) + +ALLOWED_SINGLES = [0x00A0, 0x25CC] +ALLOWED_BLOCKS = [ + 'Basic Latin', + 'Latin-1 Supplement', + 'Devanagari', + 'Bengali', + 'Gurmukhi', + 'Gujarati', + 'Oriya', + 'Tamil', + 'Telugu', + 'Kannada', + 'Malayalam', + 'Myanmar', + 'Khmer', + 'Vedic Extensions', + 'General Punctuation', + 'Superscripts and Subscripts', + 'Devanagari Extended', + 'Myanmar Extended-B', + 'Myanmar Extended-A', +] + +files = [open (x, encoding='utf-8') for x in sys.argv[1:]] + +headers = [[f.readline () for i in range (2)] for f in files] + +unicode_data = [{} for _ in files] +for i, f in enumerate (files): + for line in f: + + j = line.find ('#') + if j >= 0: + line = line[:j] + + fields = [x.strip () for x in line.split (';')] + if len (fields) == 1: + continue + + uu = fields[0].split ('..') + start = int (uu[0], 16) + if len (uu) == 1: + end = start + else: + end = int (uu[1], 16) + + t = fields[1] + + for u in range (start, end + 1): + unicode_data[i][u] = t + +# Merge data into one dict: +defaults = ('Other', 'Not_Applicable', 'No_Block') +combined = {} +for i,d in enumerate (unicode_data): + for u,v in d.items (): + if i == 2 and not u in combined: + continue + if not u in combined: + combined[u] = list (defaults) + combined[u][i] = v +combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS} + + +# Convert categories & positions types + +categories = { + 'indic' : [ + 'X', + 'C', + 'V', + 'N', + 'H', + 'ZWNJ', + 'ZWJ', + 'M', + 'SM', + 'A', + 'VD', + 'PLACEHOLDER', + 'DOTTEDCIRCLE', + 'RS', + 'MPst', + 'Repha', + 'Ra', + 'CM', + 'Symbol', + 'CS', + ], + 'khmer' : [ + 'VAbv', + 'VBlw', + 'VPre', + 'VPst', + + 'Robatic', + 'Xgroup', + 'Ygroup', + ], + 'myanmar' : [ + 'VAbv', + 'VBlw', + 'VPre', + 'VPst', + + 'IV', + 'As', + 'DB', + 'GB', + 'MH', + 'MR', + 'MW', + 'MY', + 'PT', + 'VS', + 'ML', + ], +} + +category_map = { + 'Other' : 'X', + 'Avagraha' : 'Symbol', + 'Bindu' : 'SM', + 'Brahmi_Joining_Number' : 'PLACEHOLDER', # Don't care. + 'Cantillation_Mark' : 'A', + 'Consonant' : 'C', + 'Consonant_Dead' : 'C', + 'Consonant_Final' : 'CM', + 'Consonant_Head_Letter' : 'C', + 'Consonant_Initial_Postfixed' : 'C', # TODO + 'Consonant_Killer' : 'M', # U+17CD only. + 'Consonant_Medial' : 'CM', + 'Consonant_Placeholder' : 'PLACEHOLDER', + 'Consonant_Preceding_Repha' : 'Repha', + 'Consonant_Prefixed' : 'X', # Don't care. + 'Consonant_Subjoined' : 'CM', + 'Consonant_Succeeding_Repha' : 'CM', + 'Consonant_With_Stacker' : 'CS', + 'Gemination_Mark' : 'SM', # https://github.com/harfbuzz/harfbuzz/issues/552 + 'Invisible_Stacker' : 'H', + 'Joiner' : 'ZWJ', + 'Modifying_Letter' : 'X', + 'Non_Joiner' : 'ZWNJ', + 'Nukta' : 'N', + 'Number' : 'PLACEHOLDER', + 'Number_Joiner' : 'PLACEHOLDER', # Don't care. + 'Pure_Killer' : 'M', # Is like a vowel matra. + 'Register_Shifter' : 'RS', + 'Syllable_Modifier' : 'SM', + 'Tone_Letter' : 'X', + 'Tone_Mark' : 'N', + 'Virama' : 'H', + 'Visarga' : 'SM', + 'Vowel' : 'V', + 'Vowel_Dependent' : 'M', + 'Vowel_Independent' : 'V', +} +position_map = { + 'Not_Applicable' : 'END', + + 'Left' : 'PRE_C', + 'Top' : 'ABOVE_C', + 'Bottom' : 'BELOW_C', + 'Right' : 'POST_C', + + # These should resolve to the position of the last part of the split sequence. + 'Bottom_And_Right' : 'POST_C', + 'Left_And_Right' : 'POST_C', + 'Top_And_Bottom' : 'BELOW_C', + 'Top_And_Bottom_And_Left' : 'BELOW_C', + 'Top_And_Bottom_And_Right' : 'POST_C', + 'Top_And_Left' : 'ABOVE_C', + 'Top_And_Left_And_Right' : 'POST_C', + 'Top_And_Right' : 'POST_C', + + 'Overstruck' : 'AFTER_MAIN', + 'Visual_order_left' : 'PRE_M', +} + +category_overrides = { + + # These are the variation-selectors. They only appear in the Myanmar grammar + # but are not Myanmar-specific + 0xFE00: 'VS', + 0xFE01: 'VS', + 0xFE02: 'VS', + 0xFE03: 'VS', + 0xFE04: 'VS', + 0xFE05: 'VS', + 0xFE06: 'VS', + 0xFE07: 'VS', + 0xFE08: 'VS', + 0xFE09: 'VS', + 0xFE0A: 'VS', + 0xFE0B: 'VS', + 0xFE0C: 'VS', + 0xFE0D: 'VS', + 0xFE0E: 'VS', + 0xFE0F: 'VS', + + # These appear in the OT Myanmar spec, but are not Myanmar-specific + 0x2015: 'PLACEHOLDER', + 0x2022: 'PLACEHOLDER', + 0x25FB: 'PLACEHOLDER', + 0x25FC: 'PLACEHOLDER', + 0x25FD: 'PLACEHOLDER', + 0x25FE: 'PLACEHOLDER', + + + # Indic + + 0x0930: 'Ra', # Devanagari + 0x09B0: 'Ra', # Bengali + 0x09F0: 'Ra', # Bengali + 0x0A30: 'Ra', # Gurmukhi No Reph + 0x0AB0: 'Ra', # Gujarati + 0x0B30: 'Ra', # Oriya + 0x0BB0: 'Ra', # Tamil No Reph + 0x0C30: 'Ra', # Telugu Reph formed only with ZWJ + 0x0CB0: 'Ra', # Kannada + 0x0D30: 'Ra', # Malayalam No Reph, Logical Repha + + # The following act more like the Bindus. + 0x0953: 'SM', + 0x0954: 'SM', + + # U+0A40 GURMUKHI VOWEL SIGN II may be preceded by U+0A02 GURMUKHI SIGN BINDI. + 0x0A40: 'MPst', + + # The following act like consonants. + 0x0A72: 'C', + 0x0A73: 'C', + 0x1CF5: 'C', + 0x1CF6: 'C', + + # TODO: The following should only be allowed after a Visarga. + # For now, just treat them like regular tone marks. + 0x1CE2: 'A', + 0x1CE3: 'A', + 0x1CE4: 'A', + 0x1CE5: 'A', + 0x1CE6: 'A', + 0x1CE7: 'A', + 0x1CE8: 'A', + + # TODO: The following should only be allowed after some of + # the nasalization marks, maybe only for U+1CE9..U+1CF1. + # For now, just treat them like tone marks. + 0x1CED: 'A', + + # The following take marks in standalone clusters, similar to Avagraha. + 0xA8F2: 'Symbol', + 0xA8F3: 'Symbol', + 0xA8F4: 'Symbol', + 0xA8F5: 'Symbol', + 0xA8F6: 'Symbol', + 0xA8F7: 'Symbol', + 0x1CE9: 'Symbol', + 0x1CEA: 'Symbol', + 0x1CEB: 'Symbol', + 0x1CEC: 'Symbol', + 0x1CEE: 'Symbol', + 0x1CEF: 'Symbol', + 0x1CF0: 'Symbol', + 0x1CF1: 'Symbol', + + 0x0A51: 'M', # https://github.com/harfbuzz/harfbuzz/issues/524 + + # According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil, + # so the Indic shaper needs to know their categories. + 0x11301: 'SM', + 0x11302: 'SM', + 0x11303: 'SM', + 0x1133B: 'N', + 0x1133C: 'N', + + 0x0AFB: 'N', # https://github.com/harfbuzz/harfbuzz/issues/552 + 0x0B55: 'N', # https://github.com/harfbuzz/harfbuzz/issues/2849 + + 0x09FC: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/1613 + 0x0C80: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/623 + 0x0D04: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/3511 + + 0x25CC: 'DOTTEDCIRCLE', + + + # Khmer + + 0x179A: 'Ra', + + 0x17CC: 'Robatic', + 0x17C9: 'Robatic', + 0x17CA: 'Robatic', + + 0x17C6: 'Xgroup', + 0x17CB: 'Xgroup', + 0x17CD: 'Xgroup', + 0x17CE: 'Xgroup', + 0x17CF: 'Xgroup', + 0x17D0: 'Xgroup', + 0x17D1: 'Xgroup', + + 0x17C7: 'Ygroup', + 0x17C8: 'Ygroup', + 0x17DD: 'Ygroup', + 0x17D3: 'Ygroup', # Just guessing. Uniscribe doesn't categorize it. + + 0x17D9: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/issues/2384 + + + # Myanmar + + # https://docs.microsoft.com/en-us/typography/script-development/myanmar#analyze + + 0x104E: 'C', # The spec says C, IndicSyllableCategory says Consonant_Placeholder + + 0x1004: 'Ra', + 0x101B: 'Ra', + 0x105A: 'Ra', + + 0x1032: 'A', + 0x1036: 'A', + + 0x103A: 'As', + + #0x1040: 'D0', # XXX The spec says D0, but Uniscribe doesn't seem to do. + + 0x103E: 'MH', + 0x1060: 'ML', + 0x103C: 'MR', + 0x103D: 'MW', + 0x1082: 'MW', + 0x103B: 'MY', + 0x105E: 'MY', + 0x105F: 'MY', + + 0x1063: 'PT', + 0x1064: 'PT', + 0x1069: 'PT', + 0x106A: 'PT', + 0x106B: 'PT', + 0x106C: 'PT', + 0x106D: 'PT', + 0xAA7B: 'PT', + + 0x1038: 'SM', + 0x1087: 'SM', + 0x1088: 'SM', + 0x1089: 'SM', + 0x108A: 'SM', + 0x108B: 'SM', + 0x108C: 'SM', + 0x108D: 'SM', + 0x108F: 'SM', + 0x109A: 'SM', + 0x109B: 'SM', + 0x109C: 'SM', + + 0x104A: 'PLACEHOLDER', +} +position_overrides = { + + 0x0A51: 'BELOW_C', # https://github.com/harfbuzz/harfbuzz/issues/524 + + 0x0B01: 'BEFORE_SUB', # Oriya Bindu is BeforeSub in the spec. +} + +def matra_pos_left(u, block): + return "PRE_M" +def matra_pos_right(u, block): + if block == 'Devanagari': return 'AFTER_SUB' + if block == 'Bengali': return 'AFTER_POST' + if block == 'Gurmukhi': return 'AFTER_POST' + if block == 'Gujarati': return 'AFTER_POST' + if block == 'Oriya': return 'AFTER_POST' + if block == 'Tamil': return 'AFTER_POST' + if block == 'Telugu': return 'BEFORE_SUB' if u <= 0x0C42 else 'AFTER_SUB' + if block == 'Kannada': return 'BEFORE_SUB' if u < 0x0CC3 or u > 0x0CD6 else 'AFTER_SUB' + if block == 'Malayalam': return 'AFTER_POST' + return 'AFTER_SUB' +def matra_pos_top(u, block): + # BENG and MLYM don't have top matras. + if block == 'Devanagari': return 'AFTER_SUB' + if block == 'Gurmukhi': return 'AFTER_POST' # Deviate from spec + if block == 'Gujarati': return 'AFTER_SUB' + if block == 'Oriya': return 'AFTER_MAIN' + if block == 'Tamil': return 'AFTER_SUB' + if block == 'Telugu': return 'BEFORE_SUB' + if block == 'Kannada': return 'BEFORE_SUB' + return 'AFTER_SUB' +def matra_pos_bottom(u, block): + if block == 'Devanagari': return 'AFTER_SUB' + if block == 'Bengali': return 'AFTER_SUB' + if block == 'Gurmukhi': return 'AFTER_POST' + if block == 'Gujarati': return 'AFTER_POST' + if block == 'Oriya': return 'AFTER_SUB' + if block == 'Tamil': return 'AFTER_POST' + if block == 'Telugu': return 'BEFORE_SUB' + if block == 'Kannada': return 'BEFORE_SUB' + if block == 'Malayalam': return 'AFTER_POST' + return "AFTER_SUB" +def indic_matra_position(u, pos, block): # Reposition matra + if pos == 'PRE_C': return matra_pos_left(u, block) + if pos == 'POST_C': return matra_pos_right(u, block) + if pos == 'ABOVE_C': return matra_pos_top(u, block) + if pos == 'BELOW_C': return matra_pos_bottom(u, block) + assert (False) + +def position_to_category(pos): + if pos == 'PRE_C': return 'VPre' + if pos == 'ABOVE_C': return 'VAbv' + if pos == 'BELOW_C': return 'VBlw' + if pos == 'POST_C': return 'VPst' + assert(False) + + +defaults = (category_map[defaults[0]], position_map[defaults[1]], defaults[2]) + +indic_data = {} +for k, (cat, pos, block) in combined.items(): + cat = category_map[cat] + pos = position_map[pos] + indic_data[k] = (cat, pos, block) + +for k,new_cat in category_overrides.items(): + (cat, pos, _) = indic_data.get(k, defaults) + indic_data[k] = (new_cat, pos, unicode_data[2][k]) + +# We only expect position for certain types +positioned_categories = ('CM', 'SM', 'RS', 'H', 'M', 'MPst') +for k, (cat, pos, block) in indic_data.items(): + if cat not in positioned_categories: + pos = 'END' + indic_data[k] = (cat, pos, block) + +# Position overrides are more complicated + +# Keep in sync with CONSONANT_FLAGS in the shaper +consonant_categories = ('C', 'CS', 'Ra','CM', 'V', 'PLACEHOLDER', 'DOTTEDCIRCLE') +matra_categories = ('M', 'MPst') +smvd_categories = ('SM', 'VD', 'A', 'Symbol') +for k, (cat, pos, block) in indic_data.items(): + if cat in consonant_categories: + pos = 'BASE_C' + elif cat in matra_categories: + if block.startswith('Khmer') or block.startswith('Myanmar'): + cat = position_to_category(pos) + else: + pos = indic_matra_position(k, pos, block) + elif cat in smvd_categories: + pos = 'SMVD'; + indic_data[k] = (cat, pos, block) + +for k,new_pos in position_overrides.items(): + (cat, pos, _) = indic_data.get(k, defaults) + indic_data[k] = (cat, new_pos, unicode_data[2][k]) + + +values = [{_: 1} for _ in defaults] +for vv in indic_data.values(): + for i,v in enumerate(vv): + values[i][v] = values[i].get (v, 0) + 1 + + + + +# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out +singles = {} +for u in ALLOWED_SINGLES: + singles[u] = indic_data[u] + del indic_data[u] + +print ("/* == Start of generated table == */") +print ("/*") +print (" * The following table is generated by running:") +print (" *") +print (" * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt") +print (" *") +print (" * on files with these headers:") +print (" *") +for h in headers: + for l in h: + print (" * %s" % (l.strip())) +print (" */") +print () +print ('#include "hb.hh"') +print () +print ('#ifndef HB_NO_OT_SHAPE') +print () +print ('#include "hb-ot-shaper-indic.hh"') +print () +print ('#pragma GCC diagnostic push') +print ('#pragma GCC diagnostic ignored "-Wunused-macros"') +print () + +# Print categories +for shaper in categories: + print ('#include "hb-ot-shaper-%s-machine.hh"' % shaper) +print () +done = {} +for shaper, shaper_cats in categories.items(): + print ('/* %s */' % shaper) + for cat in shaper_cats: + v = shaper[0].upper() + if cat not in done: + print ("#define OT_%s %s_Cat(%s)" % (cat, v, cat)) + done[cat] = v + else: + print ('static_assert (OT_%s == %s_Cat(%s), "");' % (cat, v, cat)) +print () + +# Shorten values +short = [{ + "Repha": 'Rf', + "PLACEHOLDER": 'GB', + "DOTTEDCIRCLE": 'DC', + "VPst": 'VR', + "VPre": 'VL', + "Robatic": 'Rt', + "Xgroup": 'Xg', + "Ygroup": 'Yg', + "As": 'As', +},{ + "END": 'X', + "BASE_C": 'C', + "ABOVE_C": 'T', + "BELOW_C": 'B', + "POST_C": 'R', + "PRE_C": 'L', + "PRE_M": 'LM', + "AFTER_MAIN": 'A', + "AFTER_SUB": 'AS', + "BEFORE_SUB": 'BS', + "AFTER_POST": 'AP', + "SMVD": 'SM', +}] +all_shorts = [{},{}] + +# Add some of the values, to make them more readable, and to avoid duplicates + +for i in range (2): + for v,s in short[i].items (): + all_shorts[i][s] = v + +what = ["OT", "POS"] +what_short = ["_OT", "_POS"] +cat_defs = [] +for i in range (2): + vv = sorted (values[i].keys ()) + for v in vv: + v_no_and = v.replace ('_And_', '_') + if v in short[i]: + s = short[i][v] + else: + s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')]) + if s in all_shorts[i]: + raise Exception ("Duplicate short value alias", v, all_shorts[i][s]) + all_shorts[i][s] = v + short[i][v] = s + cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + (v.upper () if i else v), str (values[i][v]), v)) + +maxlen_s = max ([len (c[0]) for c in cat_defs]) +maxlen_l = max ([len (c[1]) for c in cat_defs]) +maxlen_n = max ([len (c[2]) for c in cat_defs]) +for s in what_short: + print () + for c in [c for c in cat_defs if s in c[0]]: + print ("#define %s %s /* %s chars; %s */" % + (c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3])) +print () +print ('#pragma GCC diagnostic pop') +print () +print ("#define INDIC_COMBINE_CATEGORIES(S,M) ((S) | ((M) << 8))") +print () +print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (%s_##S, %s_##M)" % tuple(what_short)) +print () +print () + +total = 0 +used = 0 +last_block = None +def print_block (block, start, end, data): + global total, used, last_block + if block and block != last_block: + print () + print () + print (" /* %s */" % block) + num = 0 + assert start % 8 == 0 + assert (end+1) % 8 == 0 + for u in range (start, end+1): + if u % 8 == 0: + print () + print (" /* %04X */" % u, end="") + if u in data: + num += 1 + d = data.get (u, defaults) + print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="") + + total += end - start + 1 + used += num + if block: + last_block = block + +uu = sorted (indic_data) + +last = -100000 +num = 0 +offset = 0 +starts = [] +ends = [] +print ("static const uint16_t indic_table[] = {") +for u in uu: + if u <= last: + continue + block = indic_data[u][2] + + start = u//8*8 + end = start+1 + while end in uu and block == indic_data[end][2]: + end += 1 + end = (end-1)//8*8 + 7 + + if start != last + 1: + if start - last <= 1+16*2: + print_block (None, last+1, start-1, indic_data) + else: + if last >= 0: + ends.append (last + 1) + offset += ends[-1] - starts[-1] + print () + print () + print ("#define indic_offset_0x%04xu %d" % (start, offset)) + starts.append (start) + + print_block (block, start, end, indic_data) + last = end +ends.append (last + 1) +offset += ends[-1] - starts[-1] +print () +print () +occupancy = used * 100. / total +page_bits = 12 +print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)) +print () +print ("uint16_t") +print ("hb_indic_get_categories (hb_codepoint_t u)") +print ("{") +print (" switch (u >> %d)" % page_bits) +print (" {") +pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())]) +for p in sorted(pages): + print (" case 0x%0Xu:" % p) + for u,d in singles.items (): + if p != u>>page_bits: continue + print (" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]])) + for (start,end) in zip (starts, ends): + if p not in [start>>page_bits, end>>page_bits]: continue + offset = "indic_offset_0x%04xu" % start + print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)) + print (" break;") + print ("") +print (" default:") +print (" break;") +print (" }") +print (" return _(X,X);") +print ("}") +print () +print ("#undef _") +print ("#undef INDIC_COMBINE_CATEGORIES") +for i in range (2): + print () + vv = sorted (values[i].keys ()) + for v in vv: + print ("#undef %s_%s" % + (what_short[i], short[i][v])) +print () +print ('#endif') +print () +print ("/* == End of generated table == */") + +# Maintain at least 50% occupancy in the table */ +if occupancy < 50: + raise Exception ("Table too sparse, please investigate: ", occupancy) |