Diffstat (limited to 'third_party/rust/unicode-normalization/scripts')
-rw-r--r-- | third_party/rust/unicode-normalization/scripts/unicode.py | 621
1 file changed, 621 insertions, 0 deletions
diff --git a/third_party/rust/unicode-normalization/scripts/unicode.py b/third_party/rust/unicode-normalization/scripts/unicode.py new file mode 100644 index 0000000000..d32d9128af --- /dev/null +++ b/third_party/rust/unicode-normalization/scripts/unicode.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python +# +# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# This script uses the following Unicode tables: +# - DerivedNormalizationProps.txt +# - NormalizationTest.txt +# - UnicodeData.txt +# - StandardizedVariants.txt +# +# Since this should not require frequent updates, we just store this +# out-of-line and check the tables.rs and normalization_tests.rs files into git. +import collections +import urllib.request + +UNICODE_VERSION = "15.0.0" +UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION + +PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly + +#![allow(missing_docs)] +""" + +NormalizationTest = collections.namedtuple( + "NormalizationTest", + ["source", "nfc", "nfd", "nfkc", "nfkd"], +) + +# Mapping taken from Table 12 from: +# http://www.unicode.org/reports/tr44/#General_Category_Values +expanded_categories = { + 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], + 'Lm': ['L'], 'Lo': ['L'], + 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], + 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], + 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], + 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], + 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], + 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], + 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], +} + +# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior +# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior +S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28 +S_COUNT = L_COUNT * V_COUNT * T_COUNT + +class UnicodeData(object): + def __init__(self): + self._load_unicode_data() + self.norm_props = self._load_norm_props() + self.norm_tests = self._load_norm_tests() + + self.canon_comp = self._compute_canonical_comp() + self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed() + + self.cjk_compat_variants_fully_decomp = {} + self._load_cjk_compat_ideograph_variants() + + def stats(name, table): + count = sum(len(v) for v in table.values()) + print("%s: %d chars => %d decomposed chars" % (name, len(table), count)) + + print("Decomposition table stats:") + stats("Canonical decomp", self.canon_decomp) + stats("Compatible decomp", self.compat_decomp) + stats("Canonical fully decomp", self.canon_fully_decomp) + 
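# Illustrative sketch: the Hangul constants above exist so the generator can
# skip precomposed syllables entirely; their decomposition is arithmetic
# (Unicode Section 3.12) rather than table-driven. L_BASE/V_BASE/T_BASE are
# the standard jamo base code points, assumed here and not defined in this
# script, which only needs the syllable range. Uses S_BASE, V_COUNT, T_COUNT
# and S_COUNT as defined above.
L_BASE, V_BASE, T_BASE = 0x1100, 0x1161, 0x11A7
N_COUNT = V_COUNT * T_COUNT            # 21 * 28 = 588

def decompose_hangul_syllable(s):
    s_index = s - S_BASE
    assert 0 <= s_index < S_COUNT
    l = L_BASE + s_index // N_COUNT
    v = V_BASE + (s_index % N_COUNT) // T_COUNT
    t = T_BASE + s_index % T_COUNT
    return [l, v] if t == T_BASE else [l, v, t]

assert decompose_hangul_syllable(0xAC00) == [0x1100, 0x1161]           # 가
assert decompose_hangul_syllable(0xD7A3) == [0x1112, 0x1175, 0x11C2]   # 힣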
stats("Compatible fully decomp", self.compat_fully_decomp) + stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp) + + self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables() + + def _fetch(self, filename): + resp = urllib.request.urlopen(UCD_URL + filename) + return resp.read().decode('utf-8') + + def _load_unicode_data(self): + self.name_to_char_int = {} + self.combining_classes = {} + self.compat_decomp = {} + self.canon_decomp = {} + self.general_category_mark = [] + self.general_category_public_assigned = [] + + assigned_start = 0; + prev_char_int = -1; + prev_name = ""; + + for line in self._fetch("UnicodeData.txt").splitlines(): + # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html + pieces = line.split(';') + assert len(pieces) == 15 + char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5] + char_int = int(char, 16) + + name = pieces[1].strip() + self.name_to_char_int[name] = char_int + + if cc != '0': + self.combining_classes[char_int] = cc + + if decomp.startswith('<'): + self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]] + elif decomp != '': + self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()] + + if category == 'M' or 'M' in expanded_categories.get(category, []): + self.general_category_mark.append(char_int) + + assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt" + if category not in ['Co', 'Cs']: + if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name): + self.general_category_public_assigned.append((assigned_start, prev_char_int)) + assigned_start = char_int + prev_char_int = char_int + prev_name = name; + + self.general_category_public_assigned.append((assigned_start, prev_char_int)) + + def _load_cjk_compat_ideograph_variants(self): + for line in self._fetch("StandardizedVariants.txt").splitlines(): + strip_comments = line.split('#', 1)[0].strip() + if not strip_comments: + continue + + variation_sequence, description, differences = strip_comments.split(';') + description = description.strip() + + # Don't use variations that only apply in particular shaping environments. + if differences: + continue + + # Look for entries where the description field is a codepoint name. + if description not in self.name_to_char_int: + continue + + # Only consider the CJK Compatibility Ideographs. + if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'): + continue + + char_int = self.name_to_char_int[description] + + assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class" + assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition" + assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition" + # If we ever need to handle Hangul here, we'll need to handle it separately. 
+ assert not (S_BASE <= char_int < S_BASE + S_COUNT) + + cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()] + for c in cjk_compat_variant_parts: + assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)" + assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)" + self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts + + def _load_norm_props(self): + props = collections.defaultdict(list) + + for line in self._fetch("DerivedNormalizationProps.txt").splitlines(): + (prop_data, _, _) = line.partition("#") + prop_pieces = prop_data.split(";") + + if len(prop_pieces) < 2: + continue + + assert len(prop_pieces) <= 3 + (low, _, high) = prop_pieces[0].strip().partition("..") + + prop = prop_pieces[1].strip() + + data = None + if len(prop_pieces) == 3: + data = prop_pieces[2].strip() + + props[prop].append((low, high, data)) + + return props + + def _load_norm_tests(self): + tests = [] + for line in self._fetch("NormalizationTest.txt").splitlines(): + (test_data, _, _) = line.partition("#") + test_pieces = test_data.split(";") + + if len(test_pieces) < 5: + continue + + source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]] + tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd)) + + return tests + + def _compute_canonical_comp(self): + canon_comp = {} + comp_exclusions = [ + (int(low, 16), int(high or low, 16)) + for low, high, _ in self.norm_props["Full_Composition_Exclusion"] + ] + for char_int, decomp in self.canon_decomp.items(): + if any(lo <= char_int <= hi for lo, hi in comp_exclusions): + continue + + assert len(decomp) == 2 + assert (decomp[0], decomp[1]) not in canon_comp + canon_comp[(decomp[0], decomp[1])] = char_int + + return canon_comp + + def _compute_fully_decomposed(self): + """ + Even though the decomposition algorithm is recursive, it is possible + to precompute the recursion at table generation time with modest + increase to the table size. Then, for these precomputed tables, we + note that 1) compatible decomposition is a subset of canonical + decomposition and 2) they mostly agree on their intersection. + Therefore, we don't store entries in the compatible table for + characters that decompose the same way under canonical decomposition. + + Decomposition table stats: + Canonical decomp: 2060 chars => 3085 decomposed chars + Compatible decomp: 3662 chars => 5440 decomposed chars + Canonical fully decomp: 2060 chars => 3404 decomposed chars + Compatible fully decomp: 3678 chars => 5599 decomposed chars + + The upshot is that decomposition code is very simple and easy to inline + at mild code size cost. + """ + def _decompose(char_int, compatible): + # 7-bit ASCII never decomposes + if char_int <= 0x7f: + yield char_int + return + + # Assert that we're handling Hangul separately. 
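# Illustrative sketch of the flattening that _compute_fully_decomposed()
# performs: recursive canonical decompositions are expanded once at table
# generation time. The two mappings below are hand-copied from
# UnicodeData.txt (U+212B ANGSTROM SIGN -> U+00C5 -> U+0041 U+030A).
sample_canon_decomp = {
    0x212B: [0x00C5],
    0x00C5: [0x0041, 0x030A],
}

def fully_decompose(char_int):
    decomp = sample_canon_decomp.get(char_int)
    if decomp is None:
        return [char_int]
    out = []
    for c in decomp:
        out.extend(fully_decompose(c))
    return out

# The precomputed table therefore stores U+212B -> [0041, 030A] directly,
# which is what lets the runtime decomposition loop stay non-recursive.
assert fully_decompose(0x212B) == [0x0041, 0x030A]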
+ assert not (S_BASE <= char_int < S_BASE + S_COUNT) + + decomp = self.canon_decomp.get(char_int) + if decomp is not None: + for decomposed_ch in decomp: + for fully_decomposed_ch in _decompose(decomposed_ch, compatible): + yield fully_decomposed_ch + return + + if compatible and char_int in self.compat_decomp: + for decomposed_ch in self.compat_decomp[char_int]: + for fully_decomposed_ch in _decompose(decomposed_ch, compatible): + yield fully_decomposed_ch + return + + yield char_int + return + + end_codepoint = max( + max(self.canon_decomp.keys()), + max(self.compat_decomp.keys()), + ) + + canon_fully_decomp = {} + compat_fully_decomp = {} + + for char_int in range(0, end_codepoint + 1): + # Always skip Hangul, since it's more efficient to represent its + # decomposition programmatically. + if S_BASE <= char_int < S_BASE + S_COUNT: + continue + + canon = list(_decompose(char_int, False)) + if not (len(canon) == 1 and canon[0] == char_int): + canon_fully_decomp[char_int] = canon + + compat = list(_decompose(char_int, True)) + if not (len(compat) == 1 and compat[0] == char_int): + compat_fully_decomp[char_int] = compat + + # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't + # need to store their overlap when they agree. When they don't agree, + # store the decomposition in the compatibility table since we'll check + # that first when normalizing to NFKD. + assert set(canon_fully_decomp) <= set(compat_fully_decomp) + + for ch in set(canon_fully_decomp) & set(compat_fully_decomp): + if canon_fully_decomp[ch] == compat_fully_decomp[ch]: + del compat_fully_decomp[ch] + + return canon_fully_decomp, compat_fully_decomp + + def _compute_stream_safe_tables(self): + """ + To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4), + we need to be able to know the number of contiguous non-starters *after* + applying compatibility decomposition to each character. + + We can do this incrementally by computing the number of leading and + trailing non-starters for each character's compatibility decomposition + with the following rules: + + 1) If a character is not affected by compatibility decomposition, look + up its canonical combining class to find out if it's a non-starter. + 2) All Hangul characters are starters, even under decomposition. + 3) Otherwise, very few decomposing characters have a nonzero count + of leading or trailing non-starters, so store these characters + with their associated counts in a separate table. + """ + leading_nonstarters = {} + trailing_nonstarters = {} + + for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp): + decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c] + + num_leading = 0 + for d in decomposed: + if d not in self.combining_classes: + break + num_leading += 1 + + num_trailing = 0 + for d in reversed(decomposed): + if d not in self.combining_classes: + break + num_trailing += 1 + + if num_leading > 0: + leading_nonstarters[c] = num_leading + if num_trailing > 0: + trailing_nonstarters[c] = num_trailing + + return leading_nonstarters, trailing_nonstarters + +hexify = lambda c: '{:04X}'.format(c) + +# Test whether `first` and `last` are corresponding "<..., First>" and +# "<..., Last>" markers. 
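# Illustrative sketch of the per-character counts that
# _compute_stream_safe_tables() records and why: the Stream-Safe Text Process
# (UAX15-D4) needs to know how many non-starters a character contributes at
# each end of its decomposition so runs of non-starters can be bounded (by
# inserting U+034F CGJ). The ccc values and decompositions below are
# hand-copied from UnicodeData.txt.
sample_combining_classes = {0x0301: 230, 0x0308: 230, 0x030A: 230}

def nonstarter_counts(decomposed):
    leading = 0
    for d in decomposed:
        if d not in sample_combining_classes:
            break
        leading += 1
    trailing = 0
    for d in reversed(decomposed):
        if d not in sample_combining_classes:
            break
        trailing += 1
    return leading, trailing

assert nonstarter_counts([0x0308, 0x0301]) == (2, 2)   # U+0344: all non-starters
assert nonstarter_counts([0x0041, 0x030A]) == (0, 1)   # U+00C5: trailing only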
+def is_first_and_last(first, last): + if not first.startswith('<') or not first.endswith(', First>'): + return False + if not last.startswith('<') or not last.endswith(', Last>'): + return False + return first[1:-8] == last[1:-7] + +def gen_mph_data(name, d, kv_type, kv_callback): + (salt, keys) = minimal_perfect_hash(d) + out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper()) + for s in salt: + out.write(" 0x{:x},\n".format(s)) + out.write("];\n") + out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type)) + for k in keys: + out.write(" {},\n".format(kv_callback(k))) + out.write("];\n\n") + +def gen_combining_class(combining_classes, out): + gen_mph_data('canonical_combining_class', combining_classes, 'u32', + lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8))) + +def gen_composition_table(canon_comp, out): + table = {} + for (c1, c2), c3 in canon_comp.items(): + if c1 < 0x10000 and c2 < 0x10000: + table[(c1 << 16) | c2] = c3 + (salt, keys) = minimal_perfect_hash(table) + gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)', + lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k]))) + + out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n") + out.write(" match (c1, c2) {\n") + for (c1, c2), c3 in sorted(canon_comp.items()): + if c1 >= 0x10000 and c2 >= 0x10000: + out.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3))) + + out.write(" _ => None,\n") + out.write(" }\n") + out.write("}\n") + +def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out): + tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')] + for table, name in tables: + offsets = {} + offset = 0 + out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper()) + for k, v in table.items(): + offsets[k] = offset + offset += len(v) + for c in v: + out.write(" '\\u{%s}',\n" % hexify(c)) + # The largest offset must fit in a u16. 
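# Illustrative sketch of the packed integer encodings used by the generators
# above: gen_combining_class() stores `ccc | (code_point << 8)` in a single
# u32 entry, and gen_composition_table() keys BMP pairs as `(c1 << 16) | c2`.
# The matching unpacking is assumed to live on the Rust side of the crate.
def unpack_ccc_entry(entry):
    return entry >> 8, entry & 0xFF            # (code point, combining class)

assert unpack_ccc_entry((0x0301 << 8) | 230) == (0x0301, 230)

def pack_composition_key(c1, c2):
    assert c1 < 0x10000 and c2 < 0x10000       # astral pairs use the match arm
    return (c1 << 16) | c2

assert pack_composition_key(0x0041, 0x030A) == 0x0041030A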
+ assert offset < 65536 + out.write("];\n") + gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))", + lambda k: "(0x{:x}, ({}, {}))".format(k, offsets[k], len(table[k]))) + +def gen_qc_match(prop_table, out): + out.write(" match c {\n") + + for low, high, data in prop_table: + assert data in ('N', 'M') + result = "No" if data == 'N' else "Maybe" + if high: + out.write(r" '\u{%s}'...'\u{%s}' => %s," % (low, high, result)) + else: + out.write(r" '\u{%s}' => %s," % (low, result)) + out.write("\n") + + out.write(" _ => Yes,\n") + out.write(" }\n") + +def gen_nfc_qc(prop_tables, out): + out.write("#[inline]\n") + out.write("#[allow(ellipsis_inclusive_range_patterns)]\n") + out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n") + gen_qc_match(prop_tables['NFC_QC'], out) + out.write("}\n") + +def gen_nfkc_qc(prop_tables, out): + out.write("#[inline]\n") + out.write("#[allow(ellipsis_inclusive_range_patterns)]\n") + out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n") + gen_qc_match(prop_tables['NFKC_QC'], out) + out.write("}\n") + +def gen_nfd_qc(prop_tables, out): + out.write("#[inline]\n") + out.write("#[allow(ellipsis_inclusive_range_patterns)]\n") + out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n") + gen_qc_match(prop_tables['NFD_QC'], out) + out.write("}\n") + +def gen_nfkd_qc(prop_tables, out): + out.write("#[inline]\n") + out.write("#[allow(ellipsis_inclusive_range_patterns)]\n") + out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n") + gen_qc_match(prop_tables['NFKD_QC'], out) + out.write("}\n") + +def gen_combining_mark(general_category_mark, out): + gen_mph_data('combining_mark', general_category_mark, 'u32', + lambda k: '0x{:04x}'.format(k)) + +def gen_public_assigned(general_category_public_assigned, out): + # This could be done as a hash but the table is somewhat small. + out.write("#[inline]\n") + out.write("pub fn is_public_assigned(c: char) -> bool {\n") + out.write(" match c {\n") + + start = True + for first, last in general_category_public_assigned: + if start: + out.write(" ") + start = False + else: + out.write(" | ") + if first == last: + out.write("'\\u{%s}'\n" % hexify(first)) + else: + out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last))) + out.write(" => true,\n") + + out.write(" _ => false,\n") + out.write(" }\n") + out.write("}\n") + out.write("\n") + +def gen_stream_safe(leading, trailing, out): + # This could be done as a hash but the table is very small. 
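# Illustrative sketch of the standard quick-check loop (UAX #15, "Detecting
# Normalization Forms") that the generated qc_nfc()/qc_nfkc()/qc_nfd()/qc_nfkd()
# answers are meant to feed. `qc` and `ccc` are stand-ins for the generated
# per-character lookups, which live on the Rust side; they are assumptions of
# this sketch rather than functions defined in this file.
YES, NO, MAYBE = "Yes", "No", "Maybe"

def quick_check(code_points, qc, ccc):
    last_ccc = 0
    result = YES
    for c in code_points:
        c_ccc = ccc(c)
        if c_ccc != 0 and last_ccc > c_ccc:
            return NO          # combining marks are out of canonical order
        answer = qc(c)
        if answer == NO:
            return NO
        if answer == MAYBE:
            result = MAYBE     # only a full normalization pass can decide
        last_ccc = c_ccc
    return result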
+ out.write("#[inline]\n") + out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n") + out.write(" match c {\n") + + for char, num_leading in sorted(leading.items()): + out.write(" '\\u{%s}' => %d,\n" % (hexify(char), num_leading)) + + out.write(" _ => 0,\n") + out.write(" }\n") + out.write("}\n") + out.write("\n") + + gen_mph_data('trailing_nonstarters', trailing, 'u32', + lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8))) + +def gen_tests(tests, out): + out.write("""#[derive(Debug)] +pub struct NormalizationTest { + pub source: &'static str, + pub nfc: &'static str, + pub nfd: &'static str, + pub nfkc: &'static str, + pub nfkd: &'static str, +} + +""") + + out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n") + str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s) + + for test in tests: + out.write(" NormalizationTest {\n") + out.write(" source: %s,\n" % str_literal(test.source)) + out.write(" nfc: %s,\n" % str_literal(test.nfc)) + out.write(" nfd: %s,\n" % str_literal(test.nfd)) + out.write(" nfkc: %s,\n" % str_literal(test.nfkc)) + out.write(" nfkd: %s,\n" % str_literal(test.nfkd)) + out.write(" },\n") + + out.write("];\n") + +# Guaranteed to be less than n. +def my_hash(x, salt, n): + # This is hash based on the theory that multiplication is efficient + mask_32 = 0xffffffff + y = ((x + salt) * 2654435769) & mask_32 + y ^= (x * 0x31415926) & mask_32 + return (y * n) >> 32 + +# Compute minimal perfect hash function, d can be either a dict or list of keys. +def minimal_perfect_hash(d): + n = len(d) + buckets = dict((h, []) for h in range(n)) + for key in d: + h = my_hash(key, 0, n) + buckets[h].append(key) + bsorted = [(len(buckets[h]), h) for h in range(n)] + bsorted.sort(reverse = True) + claimed = [False] * n + salts = [0] * n + keys = [0] * n + for (bucket_size, h) in bsorted: + # Note: the traditional perfect hashing approach would also special-case + # bucket_size == 1 here and assign any empty slot, rather than iterating + # until rehash finds an empty slot. But we're not doing that so we can + # avoid the branch. + if bucket_size == 0: + break + else: + for salt in range(1, 32768): + rehashes = [my_hash(key, salt, n) for key in buckets[h]] + # Make sure there are no rehash collisions within this bucket. + if all(not claimed[hash] for hash in rehashes): + if len(set(rehashes)) < bucket_size: + continue + salts[h] = salt + for key in buckets[h]: + rehash = my_hash(key, salt, n) + claimed[rehash] = True + keys[rehash] = key + break + if salts[h] == 0: + print("minimal perfect hashing failed") + # Note: if this happens (because of unfortunate data), then there are + # a few things that could be done. First, the hash function could be + # tweaked. Second, the bucket order could be scrambled (especially the + # singletons). Right now, the buckets are sorted, which has the advantage + # of being deterministic. + # + # As a more extreme approach, the singleton bucket optimization could be + # applied (give the direct address for singleton buckets, rather than + # relying on a rehash). That is definitely the more standard approach in + # the minimal perfect hashing literature, but in testing the branch was a + # significant slowdown. 
+ exit(1) + return (salts, keys) + +if __name__ == '__main__': + data = UnicodeData() + with open("tables.rs", "w", newline = "\n") as out: + out.write(PREAMBLE) + out.write("use crate::quick_check::IsNormalized;\n") + out.write("use crate::quick_check::IsNormalized::*;\n") + out.write("\n") + + version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split(".")) + out.write("#[allow(unused)]\n") + out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version) + + gen_combining_class(data.combining_classes, out) + out.write("\n") + + gen_composition_table(data.canon_comp, out) + out.write("\n") + + gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out) + + gen_combining_mark(data.general_category_mark, out) + out.write("\n") + + gen_public_assigned(data.general_category_public_assigned, out) + out.write("\n") + + gen_nfc_qc(data.norm_props, out) + out.write("\n") + + gen_nfkc_qc(data.norm_props, out) + out.write("\n") + + gen_nfd_qc(data.norm_props, out) + out.write("\n") + + gen_nfkd_qc(data.norm_props, out) + out.write("\n") + + gen_stream_safe(data.ss_leading, data.ss_trailing, out) + out.write("\n") + + with open("normalization_tests.rs", "w", newline = "\n") as out: + out.write(PREAMBLE) + gen_tests(data.norm_tests, out) |
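# Optional spot check, not part of the generator: compare a few fully
# decomposed mappings against Python's own unicodedata module. The stdlib may
# track a different UCD version than UNICODE_VERSION above, so disagreement on
# recently assigned characters is expected and not by itself a generator bug.
import unicodedata

def spot_check(data, sample=(0x00C5, 0x212B, 0x2460)):
    for cp in sample:
        mapping = (data.compat_fully_decomp.get(cp)
                   or data.canon_fully_decomp.get(cp)
                   or [cp])
        computed = "".join(map(chr, mapping))
        expected = unicodedata.normalize("NFKD", chr(cp))
        print("U+%04X %s" % (cp, "ok" if computed == expected else "differs"))

# e.g. spot_check(data) could be called at the end of __main__, once the
# UnicodeData() object has been built.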