diff options
Diffstat (limited to 'library/core/src/unicode/printable.py')
-rwxr-xr-x | library/core/src/unicode/printable.py | 243 |
1 files changed, 243 insertions, 0 deletions
diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py new file mode 100755 index 000000000..7c37f5f09 --- /dev/null +++ b/library/core/src/unicode/printable.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python + +# This script uses the following Unicode tables: +# - UnicodeData.txt + + +from collections import namedtuple +import csv +import os +import subprocess + +NUM_CODEPOINTS=0x110000 + +def to_ranges(iter): + current = None + for i in iter: + if current is None or i != current[1] or i in (0x10000, 0x20000): + if current is not None: + yield tuple(current) + current = [i, i + 1] + else: + current[1] += 1 + if current is not None: + yield tuple(current) + +def get_escaped(codepoints): + for c in codepoints: + if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '): + yield c.value + +def get_file(f): + try: + return open(os.path.basename(f)) + except FileNotFoundError: + subprocess.run(["curl", "-O", f], check=True) + return open(os.path.basename(f)) + +Codepoint = namedtuple('Codepoint', 'value class_') + +def get_codepoints(f): + r = csv.reader(f, delimiter=";") + prev_codepoint = 0 + class_first = None + for row in r: + codepoint = int(row[0], 16) + name = row[1] + class_ = row[2] + + if class_first is not None: + if not name.endswith("Last>"): + raise ValueError("Missing Last after First") + + for c in range(prev_codepoint + 1, codepoint): + yield Codepoint(c, class_first) + + class_first = None + if name.endswith("First>"): + class_first = class_ + + yield Codepoint(codepoint, class_) + prev_codepoint = codepoint + + if class_first is not None: + raise ValueError("Missing Last after First") + + for c in range(prev_codepoint + 1, NUM_CODEPOINTS): + yield Codepoint(c, None) + +def compress_singletons(singletons): + uppers = [] # (upper, # items in lowers) + lowers = [] + + for i in singletons: + upper = i >> 8 + lower = i & 0xff + if len(uppers) == 0 or uppers[-1][0] != upper: + uppers.append((upper, 1)) + else: + upper, count = uppers[-1] + uppers[-1] = upper, count + 1 + lowers.append(lower) + + return uppers, lowers + +def compress_normal(normal): + # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f + # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff + compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)] + + prev_start = 0 + for start, count in normal: + truelen = start - prev_start + falselen = count + prev_start = start + count + + assert truelen < 0x8000 and falselen < 0x8000 + entry = [] + if truelen > 0x7f: + entry.append(0x80 | (truelen >> 8)) + entry.append(truelen & 0xff) + else: + entry.append(truelen & 0x7f) + if falselen > 0x7f: + entry.append(0x80 | (falselen >> 8)) + entry.append(falselen & 0xff) + else: + entry.append(falselen & 0x7f) + + compressed.append(entry) + + return compressed + +def print_singletons(uppers, lowers, uppersname, lowersname): + print("#[rustfmt::skip]") + print("const {}: &[(u8, u8)] = &[".format(uppersname)) + for u, c in uppers: + print(" ({:#04x}, {}),".format(u, c)) + print("];") + print("#[rustfmt::skip]") + print("const {}: &[u8] = &[".format(lowersname)) + for i in range(0, len(lowers), 8): + print(" {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8]))) + print("];") + +def print_normal(normal, normalname): + print("#[rustfmt::skip]") + print("const {}: &[u8] = &[".format(normalname)) + for v in normal: + print(" {}".format(" ".join("{:#04x},".format(i) for i in v))) + print("];") + +def main(): + file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") + + codepoints = get_codepoints(file) + + CUTOFF=0x10000 + singletons0 = [] + singletons1 = [] + normal0 = [] + normal1 = [] + extra = [] + + for a, b in to_ranges(get_escaped(codepoints)): + if a > 2 * CUTOFF: + extra.append((a, b - a)) + elif a == b - 1: + if a & CUTOFF: + singletons1.append(a & ~CUTOFF) + else: + singletons0.append(a) + elif a == b - 2: + if a & CUTOFF: + singletons1.append(a & ~CUTOFF) + singletons1.append((a + 1) & ~CUTOFF) + else: + singletons0.append(a) + singletons0.append(a + 1) + else: + if a >= 2 * CUTOFF: + extra.append((a, b - a)) + elif a & CUTOFF: + normal1.append((a & ~CUTOFF, b - a)) + else: + normal0.append((a, b - a)) + + singletons0u, singletons0l = compress_singletons(singletons0) + singletons1u, singletons1l = compress_singletons(singletons1) + normal0 = compress_normal(normal0) + normal1 = compress_normal(normal1) + + print("""\ +// NOTE: The following code was generated by "library/core/src/unicode/printable.py", +// do not edit directly! + +fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { + let xupper = (x >> 8) as u8; + let mut lowerstart = 0; + for &(upper, lowercount) in singletonuppers { + let lowerend = lowerstart + lowercount as usize; + if xupper == upper { + for &lower in &singletonlowers[lowerstart..lowerend] { + if lower == x as u8 { + return false; + } + } + } else if xupper < upper { + break; + } + lowerstart = lowerend; + } + + let mut x = x as i32; + let mut normal = normal.iter().cloned(); + let mut current = true; + while let Some(v) = normal.next() { + let len = if v & 0x80 != 0 { + ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 + } else { + v as i32 + }; + x -= len; + if x < 0 { + break; + } + current = !current; + } + current +} + +pub(crate) fn is_printable(x: char) -> bool { + let x = x as u32; + let lower = x as u16; + + if x < 32 { + // ASCII fast path + false + } else if x < 127 { + // ASCII fast path + true + } else if x < 0x10000 { + check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) + } else if x < 0x20000 { + check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) + } else {\ +""") + for a, b in extra: + print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b)) + print(" return false;") + print(" }") + print("""\ + true + } +}\ +""") + print() + print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L') + print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L') + print_normal(normal0, 'NORMAL0') + print_normal(normal1, 'NORMAL1') + +if __name__ == '__main__': + main() |