summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/is_combining_diacritic.py
blob: d8c3c4ba35c179a09ef97882bcbb538f561ec33c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from unicodedata import combining

# Exclusive upper bound of the Unicode code point range (U+10FFFF + 1).
UNICODE_LIMIT = 0x110000

# Canonical combining class values that are referred to by name below.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9


# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
def is_combining_diacritic(char):
    """Return whether *char* counts as a combining diacritic.

    The decision is based solely on the character's canonical combining
    class: every class is accepted except the three named classes above and
    the numeric classes 91, 129, 130 and 132.

    Args:
        char: A one-character string.

    Returns:
        True if the combining class of *char* is not one of the excluded
        classes, False otherwise.
    """
    combining_class = combining(char)

    # Classes that have a symbolic name in this module.
    if combining_class in (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
    ):
        return False

    # Remaining excluded classes are identified by number only.
    return combining_class not in (91, 129, 130, 132)


# See gfxFontUtils.h for the SharedBitSet that we're creating a const instance of here.
# BLOCK_SIZE is the number of bytes in one block of the bitset; at 8 bits per
# byte, BLOCK_SIZE_BITS is the number of bits in one block (256, one bit per
# character of a 256-character block).
BLOCK_SIZE = 32
BLOCK_SIZE_BITS = BLOCK_SIZE * 8


def _collect_bitset_blocks():
    """Compute the block index and the unique blocks of the diacritics bitset.

    The code point range [0, UNICODE_LIMIT) is split into consecutive blocks
    of BLOCK_SIZE bytes (BLOCK_SIZE * 8 characters). Within a block, the bit
    for a character is stored least-significant-bit first in successive
    bytes, and is set when is_combining_diacritic() is true for it.

    Returns:
        A ``(block_index, blocks)`` tuple. ``blocks`` is the list of distinct
        blocks (each a list of BLOCK_SIZE ints in 0..255) in order of first
        appearance. ``block_index[i]`` is the position in ``blocks`` of the
        data for the i-th block of code points, with trailing entries that
        refer to an all-zero block removed.
    """
    bits_per_block = BLOCK_SIZE * 8
    block_index = []
    blocks = []
    # Maps the contents of an already-stored block to its position in
    # `blocks`, so identical blocks are shared without rescanning the list.
    position_of = {}

    # Only complete blocks are emitted; a partial final block (which cannot
    # occur while UNICODE_LIMIT is a multiple of the block size) is dropped.
    for block_number in range(UNICODE_LIMIT // bits_per_block):
        first_char = block_number * bits_per_block
        block = [0] * BLOCK_SIZE
        for offset in range(bits_per_block):
            if is_combining_diacritic(chr(first_char + offset)):
                block[offset >> 3] |= 1 << (offset & 7)

        key = tuple(block)
        position = position_of.get(key)
        if position is None:
            position = len(blocks)
            position_of[key] = position
            blocks.append(block)
        block_index.append(position)

    # Strip trailing empty blocks from the index. Check the block contents
    # rather than assuming the empty block is number 0, and stop cleanly if
    # the index becomes empty.
    while block_index and not any(blocks[block_index[-1]]):
        block_index.pop()

    return block_index, blocks


def main(header):
    """Write the combining-diacritics SharedBitSet as a C++ header.

    Args:
        header: A writable text stream; only its ``write`` method is used.

    The output declares a struct holding the index count, the block count,
    the block index and the raw block bytes, a constant instance of it, and
    a ``SharedBitSet`` pointer that aliases that instance.
    """
    block_index, blocks = _collect_bitset_blocks()

    # Write the SharedBitSet as data in a C++ header file.
    header.write("/* !GENERATED DATA -- DO NOT EDIT! */\n")
    header.write("/* (see is_combining_diacritic.py) */\n")
    header.write("\n")
    header.write('#include "gfxFontUtils.h"\n')
    header.write("\n")

    header.write("typedef struct {\n")
    header.write("  uint16_t mBlockIndexCount;\n")
    header.write("  uint16_t mBlockCount;\n")
    header.write("  uint16_t mBlockIndex[{}];\n".format(len(block_index)))
    header.write("  uint8_t mBlockData[{}];\n".format(len(blocks) * BLOCK_SIZE))
    header.write("} CombiningDiacriticsBitset_t;\n")
    header.write("\n")

    header.write(
        "static const CombiningDiacriticsBitset_t COMBINING_DIACRITICS_BITSET_DATA = {\n"
    )
    header.write("  {},\n".format(len(block_index)))
    header.write("  {},\n".format(len(blocks)))

    # One index entry per line.
    header.write("  {\n")
    header.write("".join("    {},\n".format(entry) for entry in block_index))
    header.write("  },\n")

    # One block per line, every byte followed by a comma.
    header.write("  {\n")
    for block in blocks:
        header.write("    " + "".join(str(value) + "," for value in block) + "\n")
    header.write("  },\n")
    header.write("};\n")
    header.write("\n")

    header.write("static const SharedBitSet* sCombiningDiacriticsSet =\n")
    header.write(
        "    reinterpret_cast<const SharedBitSet*>(&COMBINING_DIACRITICS_BITSET_DATA);\n"
    )
    header.write("\n")