summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/base_chars.py
blob: 91d3ba335255048bf45dfdc8e44b045d8561eb2f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re
from collections import namedtuple
from unicodedata import category, combining, normalize

# Exclusive upper bound of the code point range scanned by main().
UNICODE_LIMIT = 0x110000

# Named canonical combining class values that is_combining_diacritic() does
# not treat as diacritics.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9

# A character and the base character it maps to. main() stores both fields as
# integer code points so that the tuples sort by character.
BaseCharMapping = namedtuple("BaseCharMapping", ("char", "base_char"))
# A run of mappings sharing the same upper bits (char >> 8): "first" and "last"
# are the low bytes of the first and last character of the run, and "offset" is
# the index in the mapping list at which the run starts.
BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", ("first", "last", "offset"))


# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
def is_combining_diacritic(char):
    """Return True if *char* is a combining mark that counts as a diacritic.

    A character qualifies when its canonical combining class is anything
    other than the classes listed below (which include class 0, i.e.
    characters that are not reordered at all).
    """
    non_diacritic_classes = (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
        # Further excluded classes, given by number only; the C++ counterpart
        # named in the comment above this function must list the same values.
        91,
        129,
        130,
        132,
    )
    combining_class = combining(char)
    return not (combining_class in non_diacritic_classes)


# Keep this function in sync with IsMathOrMusicSymbol in nsUnicodeProperties.h.
def is_math_or_music_symbol(char):
    """Return True if the general category of *char* is "Sm" or "So"."""
    general_category = category(char)
    return general_category == "Sm" or general_category == "So"


def changes_plane(char, base_char):
    """Return True if *char* and *base_char* differ above their low 16 bits.

    Such mappings are currently unsupported for two reasons: the mapping
    table stores only the low 16 bits of the base character, and a mapping
    that enters or leaves the basic multilingual plane would alter the
    length of a UTF-16 string.
    """
    source_plane = ord(char) >> 16
    target_plane = ord(base_char) >> 16
    return source_plane != target_plane


def main(header, fallback_table):
    """Generate the C tables that map characters to their base characters.

    Mappings are first gleaned from canonical (NFD) decompositions and then
    extended (or overridden) with the single-character entries of the ASCII
    fallback table. The result is organized into blocks of characters that
    share the same upper bits (``char >> 8``) and written out as three C
    arrays: the mapping list, the block descriptors and a per-page block
    index.

    Args:
        header: Writable text stream (any object with a ``write`` method)
            that receives the generated C declarations.
        fallback_table: Path of the UTF-8 encoded ASCII fallback table. Only
            lines of the form ``<char> → <replacement> ;`` are used.

    Raises:
        ValueError: If more than 255 blocks are needed. The block index is
            emitted as ``uint8_t`` and the value 255 marks pages without a
            block, so a block numbered 255 or above cannot be represented.
    """
    mappings = _decomposition_mappings()
    _add_fallback_mappings(mappings, fallback_table)

    # Organize mappings into contiguous blocks

    mappings = sorted(BaseCharMapping(ord(k), ord(v)) for k, v in mappings.items())
    blocks = _build_blocks(mappings)
    if len(blocks) > 255:
        raise ValueError(
            "too many base char mapping blocks for a uint8_t index: {}".format(
                len(blocks)
            )
        )

    # One index entry per page (char >> 8), up to the page of the last block;
    # pages that have no block get the filler value 255.
    indexes = []
    for i, block in enumerate(blocks):
        while len(indexes) < mappings[block.offset].char >> 8:
            indexes.append(255)
        indexes.append(i)

    _write_header(header, mappings, blocks, indexes)


def _decomposition_mappings():
    """Return a {char: base_char} dict gleaned from NFD decompositions."""
    mappings = {}
    for code_point in range(UNICODE_LIMIT):
        char = chr(code_point)
        if is_combining_diacritic(char) or is_math_or_music_symbol(char):
            continue
        decomposition = normalize("NFD", char)
        if len(decomposition) < 2:
            continue
        base_char = decomposition[0]
        if changes_plane(char, base_char):
            continue
        if not is_combining_diacritic(decomposition[1]):
            # Hangul syllables decompose but do not actually have diacritics.
            # This also excludes decompositions with the Japanese marks U+3099
            # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
            # MARK), which we should not ignore for searching (bug 1624244).
            continue
        mappings[char] = base_char
    return mappings


def _add_fallback_mappings(mappings, fallback_table):
    """Add the single-character ASCII fallback table entries to *mappings*."""
    line_pattern = re.compile("^(.) → (.+?) ;")
    # Use a context manager so the table is closed even if parsing fails.
    with open(fallback_table, encoding="UTF-8") as table:
        for line in table:
            m = line_pattern.match(line)
            if not m:
                continue
            char = m.group(1)
            decomposition = m.group(2)
            # Strip the quoting ('x') or escaping (\x) around the replacement.
            if len(decomposition) >= 3:
                if decomposition.startswith("'") and decomposition.endswith("'"):
                    decomposition = decomposition[1:-1]
            if len(decomposition) >= 2:
                if decomposition.startswith("\\"):
                    decomposition = decomposition[1:]
            # Only one-to-one replacements can be stored in the table.
            if len(decomposition) > 1:
                continue
            if changes_plane(char, decomposition):
                continue
            mappings[char] = decomposition


def _build_blocks(mappings):
    """Group sorted *mappings* into per-page blocks, filling gaps in place.

    *mappings* must be a list of BaseCharMapping sorted by character. Within
    each page (``char >> 8``), missing characters between two mapped ones are
    inserted into *mappings* as identity mappings so that every block is
    contiguous. Returns the list of BaseCharMappingBlock.
    """
    blocks = []
    i = 0
    # Iterate up to and including the final mapping: stopping one short would
    # leave a final mapping that sits alone on its page without any block.
    while i < len(mappings):
        offset = i
        first = mappings[i].char & 0xFF
        while (
            i < len(mappings) - 1 and mappings[i].char >> 8 == mappings[i + 1].char >> 8
        ):
            while (
                i < len(mappings) - 1
                and mappings[i].char >> 8 == mappings[i + 1].char >> 8
                and mappings[i + 1].char - mappings[i].char > 1
            ):
                char = mappings[i].char + 1
                mappings.insert(i + 1, BaseCharMapping(char, char))
                i += 1
            i += 1
        last = mappings[i].char & 0xFF
        blocks.append(BaseCharMappingBlock(first, last, offset))
        i += 1
    return blocks


def _write_header(header, mappings, blocks, indexes):
    """Write the mapping list, block table and block index as C code."""
    header.write("struct BaseCharMappingBlock {\n")
    header.write("  uint8_t mFirst;\n")
    header.write("  uint8_t mLast;\n")
    header.write("  uint16_t mMappingStartOffset;\n")
    header.write("};\n")
    header.write("\n")
    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
    for char, base_char in mappings:
        # Only the low 16 bits of the base character are recorded.
        header.write("  /* {:#06x} */ {:#06x},".format(char, base_char & 0xFFFF))
        if char != base_char:
            header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
        header.write("\n")
    header.write("};\n")
    header.write("\n")
    header.write(
        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
    )
    for block in blocks:
        header.write(
            "  {{{:#04x}, {:#04x}, {:>4}}}, // {:#04x}xx\n".format(
                block.first,
                block.last,
                block.offset,
                mappings[block.offset].char >> 8,
            )
        )
    header.write("};\n")
    header.write("\n")
    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[] = {\n")
    for i, index in enumerate(indexes):
        header.write("  {:>3}, // {:#04x}xx\n".format(index, i))
    header.write("};\n")