diff options
Diffstat (limited to 'intl/unicharutil/util/base_chars.py')
-rw-r--r-- | intl/unicharutil/util/base_chars.py | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/intl/unicharutil/util/base_chars.py b/intl/unicharutil/util/base_chars.py new file mode 100644 index 0000000000..91d3ba3352 --- /dev/null +++ b/intl/unicharutil/util/base_chars.py @@ -0,0 +1,162 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import re +from collections import namedtuple +from unicodedata import category, combining, normalize + +UNICODE_LIMIT = 0x110000 + +UNICODE_COMBINING_CLASS_NOT_REORDERED = 0 +UNICODE_COMBINING_CLASS_KANA_VOICING = 8 +UNICODE_COMBINING_CLASS_VIRAMA = 9 + +BaseCharMapping = namedtuple("BaseCharMapping", ("char", "base_char")) +BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", ("first", "last", "offset")) + + +# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h. +def is_combining_diacritic(char): + return combining(char) not in ( + UNICODE_COMBINING_CLASS_NOT_REORDERED, + UNICODE_COMBINING_CLASS_KANA_VOICING, + UNICODE_COMBINING_CLASS_VIRAMA, + 91, + 129, + 130, + 132, + ) + + +# Keep this function in sync with IsMathOrMusicSymbol in nsUnicodeProperties.h. +def is_math_or_music_symbol(char): + return category(char) in ("Sm", "So") + + +def changes_plane(char, base_char): + # Mappings that would change the first 16 bits of a character are not + # currently supported. This is because the mapping table only records the + # last 16 bits of the base character and also because moving into or out of + # the basic multilingual plane would change the length of a UTF-16 string. + return ord(char) >> 16 != ord(base_char) >> 16 + + +def main(header, fallback_table): + mappings = {} + + # Glean mappings from decompositions + + for char in range(UNICODE_LIMIT): + char = chr(char) + if is_combining_diacritic(char) or is_math_or_music_symbol(char): + continue + decomposition = normalize("NFD", char) + if len(decomposition) < 2: + continue + base_char = decomposition[0] + if changes_plane(char, base_char): + continue + next_char = decomposition[1] + if not is_combining_diacritic(next_char): + # Hangul syllables decompose but do not actually have diacritics. + # This also excludes decompositions with the Japanese marks U+3099 + # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND + # MARK), which we should not ignore for searching (bug 1624244). + continue + mappings[char] = base_char + + # Add mappings from the ASCII fallback table + + for line in open(fallback_table, encoding="UTF-8"): + m = re.match("^(.) → (.+?) ;", line) + if not m: + continue + char = m.group(1) + decomposition = m.group(2) + if len(decomposition) >= 3: + if decomposition.startswith("'") and decomposition.endswith("'"): + decomposition = decomposition[1:-1] + if len(decomposition) >= 2: + if decomposition.startswith("\\"): + decomposition = decomposition[1:] + if len(decomposition) > 1: + continue + if changes_plane(char, decomposition): + continue + mappings[char] = decomposition + + # Organize mappings into contiguous blocks + + mappings = sorted([BaseCharMapping(ord(k), ord(v)) for k, v in mappings.items()]) + blocks = [] + i = 0 + while i < len(mappings) - 1: + offset = i + first = mappings[i].char & 0xFF + while ( + i < len(mappings) - 1 and mappings[i].char >> 8 == mappings[i + 1].char >> 8 + ): + while ( + i < len(mappings) - 1 + and mappings[i].char >> 8 == mappings[i + 1].char >> 8 + and mappings[i + 1].char - mappings[i].char > 1 + ): + char = mappings[i].char + 1 + mappings.insert(i + 1, BaseCharMapping(char, char)) + i += 1 + i += 1 + last = mappings[i].char & 0xFF + blocks.append(BaseCharMappingBlock(first, last, offset)) + i += 1 + + indexes = [] + for i, block in enumerate(blocks): + while len(indexes) < mappings[block.offset].char >> 8: + indexes.append(255) + indexes.append(i) + + # Write the mappings to a C header file + + header.write("struct BaseCharMappingBlock {\n") + header.write(" uint8_t mFirst;\n") + header.write(" uint8_t mLast;\n") + header.write(" uint16_t mMappingStartOffset;\n") + header.write("};\n") + header.write("\n") + header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n") + for char, base_char in mappings: + header.write( + " /* {:#06x}".format(char) + + " */ " + + "{:#06x}".format(base_char & 0xFFFF) + + "," + ) + if char != base_char: + header.write(" /* " + chr(char) + " → " + chr(base_char) + " */") + header.write("\n") + header.write("};\n") + header.write("\n") + header.write( + "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n" + ) + for block in blocks: + header.write( + " {" + + "{:#04x}".format(block.first) + + ", " + + "{:#04x}".format(block.last) + + ", " + + str(block.offset).rjust(4) + + "}, // " + + "{:#04x}".format(mappings[block.offset].char >> 8) + + "xx\n" + ) + header.write("};\n") + header.write("\n") + header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[] = {\n") + for i, index in enumerate(indexes): + header.write( + " " + str(index).rjust(3) + ", // " + "{:#04x}".format(i) + "xx\n" + ) + header.write("};\n") |