summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/base_chars.py
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/util/base_chars.py')
-rw-r--r--intl/unicharutil/util/base_chars.py162
1 files changed, 162 insertions, 0 deletions
diff --git a/intl/unicharutil/util/base_chars.py b/intl/unicharutil/util/base_chars.py
new file mode 100644
index 0000000000..91d3ba3352
--- /dev/null
+++ b/intl/unicharutil/util/base_chars.py
@@ -0,0 +1,162 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+from collections import namedtuple
+from unicodedata import category, combining, normalize
+
+UNICODE_LIMIT = 0x110000
+
+UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
+UNICODE_COMBINING_CLASS_KANA_VOICING = 8
+UNICODE_COMBINING_CLASS_VIRAMA = 9
+
+BaseCharMapping = namedtuple("BaseCharMapping", ("char", "base_char"))
+BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", ("first", "last", "offset"))
+
+
+# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
+def is_combining_diacritic(char):
+ return combining(char) not in (
+ UNICODE_COMBINING_CLASS_NOT_REORDERED,
+ UNICODE_COMBINING_CLASS_KANA_VOICING,
+ UNICODE_COMBINING_CLASS_VIRAMA,
+ 91,
+ 129,
+ 130,
+ 132,
+ )
+
+
+# Keep this function in sync with IsMathOrMusicSymbol in nsUnicodeProperties.h.
+def is_math_or_music_symbol(char):
+ return category(char) in ("Sm", "So")
+
+
+def changes_plane(char, base_char):
+ # Mappings that would change the first 16 bits of a character are not
+ # currently supported. This is because the mapping table only records the
+ # last 16 bits of the base character and also because moving into or out of
+ # the basic multilingual plane would change the length of a UTF-16 string.
+ return ord(char) >> 16 != ord(base_char) >> 16
+
+
+def main(header, fallback_table):
+ mappings = {}
+
+ # Glean mappings from decompositions
+
+ for char in range(UNICODE_LIMIT):
+ char = chr(char)
+ if is_combining_diacritic(char) or is_math_or_music_symbol(char):
+ continue
+ decomposition = normalize("NFD", char)
+ if len(decomposition) < 2:
+ continue
+ base_char = decomposition[0]
+ if changes_plane(char, base_char):
+ continue
+ next_char = decomposition[1]
+ if not is_combining_diacritic(next_char):
+ # Hangul syllables decompose but do not actually have diacritics.
+ # This also excludes decompositions with the Japanese marks U+3099
+ # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
+ # MARK), which we should not ignore for searching (bug 1624244).
+ continue
+ mappings[char] = base_char
+
+ # Add mappings from the ASCII fallback table
+
+ for line in open(fallback_table, encoding="UTF-8"):
+ m = re.match("^(.) → (.+?) ;", line)
+ if not m:
+ continue
+ char = m.group(1)
+ decomposition = m.group(2)
+ if len(decomposition) >= 3:
+ if decomposition.startswith("'") and decomposition.endswith("'"):
+ decomposition = decomposition[1:-1]
+ if len(decomposition) >= 2:
+ if decomposition.startswith("\\"):
+ decomposition = decomposition[1:]
+ if len(decomposition) > 1:
+ continue
+ if changes_plane(char, decomposition):
+ continue
+ mappings[char] = decomposition
+
+ # Organize mappings into contiguous blocks
+
+ mappings = sorted([BaseCharMapping(ord(k), ord(v)) for k, v in mappings.items()])
+ blocks = []
+ i = 0
+ while i < len(mappings) - 1:
+ offset = i
+ first = mappings[i].char & 0xFF
+ while (
+ i < len(mappings) - 1 and mappings[i].char >> 8 == mappings[i + 1].char >> 8
+ ):
+ while (
+ i < len(mappings) - 1
+ and mappings[i].char >> 8 == mappings[i + 1].char >> 8
+ and mappings[i + 1].char - mappings[i].char > 1
+ ):
+ char = mappings[i].char + 1
+ mappings.insert(i + 1, BaseCharMapping(char, char))
+ i += 1
+ i += 1
+ last = mappings[i].char & 0xFF
+ blocks.append(BaseCharMappingBlock(first, last, offset))
+ i += 1
+
+ indexes = []
+ for i, block in enumerate(blocks):
+ while len(indexes) < mappings[block.offset].char >> 8:
+ indexes.append(255)
+ indexes.append(i)
+
+ # Write the mappings to a C header file
+
+ header.write("struct BaseCharMappingBlock {\n")
+ header.write(" uint8_t mFirst;\n")
+ header.write(" uint8_t mLast;\n")
+ header.write(" uint16_t mMappingStartOffset;\n")
+ header.write("};\n")
+ header.write("\n")
+ header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
+ for char, base_char in mappings:
+ header.write(
+ " /* {:#06x}".format(char)
+ + " */ "
+ + "{:#06x}".format(base_char & 0xFFFF)
+ + ","
+ )
+ if char != base_char:
+ header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
+ header.write("\n")
+ header.write("};\n")
+ header.write("\n")
+ header.write(
+ "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
+ )
+ for block in blocks:
+ header.write(
+ " {"
+ + "{:#04x}".format(block.first)
+ + ", "
+ + "{:#04x}".format(block.last)
+ + ", "
+ + str(block.offset).rjust(4)
+ + "}, // "
+ + "{:#04x}".format(mappings[block.offset].char >> 8)
+ + "xx\n"
+ )
+ header.write("};\n")
+ header.write("\n")
+ header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[] = {\n")
+ for i, index in enumerate(indexes):
+ header.write(
+ " " + str(index).rjust(3) + ", // " + "{:#04x}".format(i) + "xx\n"
+ )
+ header.write("};\n")