diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/jsparagus/update_unicode.py | |
parent | Initial commit. (diff) | |
download | firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip |
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/jsparagus/update_unicode.py')
-rw-r--r-- | third_party/rust/jsparagus/update_unicode.py | 354 |
1 files changed, 354 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus/update_unicode.py b/third_party/rust/jsparagus/update_unicode.py new file mode 100644 index 0000000000..3f2bba0956 --- /dev/null +++ b/third_party/rust/jsparagus/update_unicode.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python3 + +""" Generate Unicode data table for parser +""" + +import argparse +import io +import re +import sys +from contextlib import closing +from itertools import tee, zip_longest +from urllib.request import urlopen +from zipfile import ZipFile + + +# These are also part of IdentifierPart ยง11.6 Names and Keywords +compatibility_identifier_part = [ + ord(u'\N{ZERO WIDTH NON-JOINER}'), + ord(u'\N{ZERO WIDTH JOINER}'), +] + +FLAG_ID_START = 1 << 0 +FLAG_ID_CONTINUE = 1 << 1 + + +def download_derived_core_properties(version): + """Downloads UCD.zip for given version, and return the content of + DerivedCoreProperties.txt. """ + + baseurl = 'https://unicode.org/Public' + if version == 'UNIDATA': + url = '%s/%s' % (baseurl, version) + else: + url = '%s/%s/ucd' % (baseurl, version) + + request_url = '{}/UCD.zip'.format(url) + with closing(urlopen(request_url)) as downloaded_file: + downloaded_data = io.BytesIO(downloaded_file.read()) + + with ZipFile(downloaded_data) as zip_file: + return zip_file.read('DerivedCoreProperties.txt').decode() + + +def read_derived_core_properties(derived_core_properties): + """Read DerivedCoreProperties.txt content and yield each item. """ + for line in derived_core_properties.split('\n'): + if line == '' or line.startswith('#'): + continue + row = line.split('#')[0].split(';') + char_range = row[0].strip() + char_property = row[1].strip() + if '..' not in char_range: + yield (int(char_range, 16), char_property) + else: + [start, end] = char_range.split('..') + for char in range(int(start, 16), int(end, 16) + 1): + yield (char, char_property) + + +def process_derived_core_properties(derived_core_properties): + """Parse DerivedCoreProperties.txt and returns its version, + and set of characters with ID_Start and ID_Continue. """ + id_start = set() + id_continue = set() + + m = re.match('# DerivedCoreProperties-([0-9\.]+).txt', derived_core_properties) + assert m + version = m.group(1) + + for (char, prop) in read_derived_core_properties(derived_core_properties): + if prop == 'ID_Start': + id_start.add(char) + if prop == 'ID_Continue': + id_continue.add(char) + + return (version, id_start, id_continue) + + +def int_ranges(ints): + """ Yields consecutive ranges (inclusive) from integer values. """ + (a, b) = tee(sorted(ints)) + start = next(b) + for (curr, succ) in zip_longest(a, b): + if curr + 1 != succ: + yield (start, curr) + start = succ + + +def process_unicode_data(derived_core_properties): + MAX_BMP = 0xffff + + dummy = 0 + table = [dummy] + cache = {dummy: 0} + index = [0] * (MAX_BMP + 1) + non_bmp_id_start_set = {} + non_bmp_id_continue_set = {} + + (version, id_start, id_continue) = process_derived_core_properties(derived_core_properties) + codes = id_start.union(id_continue) + + for code in codes: + if code > MAX_BMP: + if code in id_start: + non_bmp_id_start_set[code] = 1 + if code in id_continue: + non_bmp_id_continue_set[code] = 1 + continue + + flags = 0 + if code in id_start: + flags |= FLAG_ID_START + if code in id_continue or code in compatibility_identifier_part: + flags |= FLAG_ID_CONTINUE + + i = cache.get(flags) + if i is None: + assert flags not in table + cache[flags] = i = len(table) + table.append(flags) + index[code] = i + + return ( + version, + table, + index, + id_start, + id_continue, + non_bmp_id_start_set, + non_bmp_id_continue_set, + ) + + +def getsize(data): + """ return smallest possible integer size for the given array """ + maxdata = max(data) + assert maxdata < 2**32 + + if maxdata < 256: + return 1 + elif maxdata < 65536: + return 2 + else: + return 4 + + +def splitbins(t): + """t -> (t1, t2, shift). Split a table to save space. + + t is a sequence of ints. This function can be useful to save space if + many of the ints are the same. t1 and t2 are lists of ints, and shift + is an int, chosen to minimize the combined size of t1 and t2 (in C + code), and where for each i in range(len(t)), + t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + where mask is a bitmask isolating the last "shift" bits. + """ + + def dump(t1, t2, shift, bytes): + print("%d+%d bins at shift %d; %d bytes" % ( + len(t1), len(t2), shift, bytes), file=sys.stderr) + print("Size of original table:", len(t) * getsize(t), + "bytes", file=sys.stderr) + + + n = len(t)-1 # last valid index + maxshift = 0 # the most we can shift n and still have something left + if n > 0: + while n >> 1: + n >>= 1 + maxshift += 1 + del n + bytes = sys.maxsize # smallest total size so far + t = tuple(t) # so slices can be dict keys + for shift in range(maxshift + 1): + t1 = [] + t2 = [] + size = 2**shift + bincache = {} + + for i in range(0, len(t), size): + bin = t[i:i + size] + + index = bincache.get(bin) + if index is None: + index = len(t2) + bincache[bin] = index + t2.extend(bin) + t1.append(index >> shift) + + # determine memory size + b = len(t1) * getsize(t1) + len(t2) * getsize(t2) + if b < bytes: + best = t1, t2, shift + bytes = b + t1, t2, shift = best + + print("Best:", end=' ', file=sys.stderr) + dump(t1, t2, shift, bytes) + + # exhaustively verify that the decomposition is correct + mask = 2**shift - 1 + for i in range(len(t)): + assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + return best + + +def write_table(f, name, type, table, formatter, per_line): + f.write(f""" +pub const {name}: &'static [{type}] = &[ +""") + + i = 0 + for item in table: + if i == 0: + f.write(' ') + f.write(f'{formatter(item)},') + i += 1 + if i == per_line: + i = 0 + f.write(""" +""") + + f.write("""\ +]; +""") + + +def write_func(f, name, group_set): + f.write(f""" +pub fn {name}(c: char) -> bool {{""") + + for (from_code, to_code) in int_ranges(group_set.keys()): + f.write(f""" + if c >= \'\\u{{{from_code:X}}}\' && c <= \'\\u{{{to_code:X}}}\' {{ + return true; + }}""") + + f.write(""" + false +} +""") + + +def make_unicode_file(version, table, index, + id_start, id_continue, + non_bmp_id_start_set, non_bmp_id_continue_set): + index1, index2, shift = splitbins(index) + + # verify correctness + for char in index: + test = table[index[char]] + + idx = index1[char >> shift] + idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] + + assert test == table[idx] + + with open('crates/parser/src/unicode_data.rs', 'w') as f: + f.write(f"""\ +// Generated by update_unicode.py DO NOT MODIFY +// Unicode version: {version} +""") + + f.write(f""" +const FLAG_ID_START: u8 = {FLAG_ID_START}; +const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE}; +""") + + f.write(""" +pub struct CharInfo { + flags: u8, +} + +impl CharInfo { + pub fn is_id_start(&self) -> bool { + self.flags & FLAG_ID_START != 0 + } + + pub fn is_id_continue(&self) -> bool { + self.flags & FLAG_ID_CONTINUE != 0 + } +} +""") + + write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table, + lambda flag: f"CharInfo {{ flags: {flag} }}", + 1) + write_table(f, 'INDEX1', 'u8', index1, + lambda i: f'{i:4d}', 8) + write_table(f, 'INDEX2', 'u8', index2, + lambda i: f'{i:4d}', 8) + + f.write(f""" +const SHIFT: usize = {shift}; +""") + + f.write(""" +pub fn char_info(c: char) -> &'static CharInfo { + let code = c as usize; + let index = INDEX1[code >> SHIFT] as usize; + let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize; + + &CHAR_INFO_TABLE[index] +} +""") + + def format_bool(b): + if b: + return 'true ' + else: + return 'false' + + write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128), + lambda code: format_bool(code in id_start), 8) + write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128), + lambda code: format_bool(code in id_continue), 8) + + write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set) + write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set) + + + +parser = argparse.ArgumentParser(description='Generate Unicode data table for parser') +parser.add_argument('VERSION', + help='Unicode version number to download from\ + <https://unicode.org/Public>. The number must match\ + a published Unicode version, e.g. use\ + "--version=8.0.0" to download Unicode 8 files. Alternatively use\ + "--version=UNIDATA" to download the latest published version.') +parser.add_argument('PATH_TO_JSPARAGUS', + help='Path to jsparagus') +args = parser.parse_args() + +derived_core_properties = download_derived_core_properties(args.VERSION) + +( + version, + table, + index, + id_start, + id_continue, + non_bmp_id_start_set, + non_bmp_id_continue_set, +) = process_unicode_data(derived_core_properties) + +make_unicode_file( + version, + table, + index, + id_start, + id_continue, + non_bmp_id_start_set, + non_bmp_id_continue_set, +) |