#!/usr/bin/env python3

""" Generate Unicode data table for parser
"""

import argparse
import io
import os
import re
import sys
from contextlib import closing
from itertools import tee, zip_longest
from urllib.request import urlopen
from zipfile import ZipFile

# These are also part of IdentifierPart §11.6 Names and Keywords
compatibility_identifier_part = [
    ord(u'\N{ZERO WIDTH NON-JOINER}'),
    ord(u'\N{ZERO WIDTH JOINER}'),
]

FLAG_ID_START = 1 << 0
FLAG_ID_CONTINUE = 1 << 1

# Location of the generated Rust file, relative to the jsparagus checkout.
DEFAULT_OUTPUT_PATH = 'crates/parser/src/unicode_data.rs'


def download_derived_core_properties(version):
    """Downloads UCD.zip for given version, and return the content of
    DerivedCoreProperties.txt.

    `version` is either a version number such as '8.0.0' or the literal
    'UNIDATA', which lives directly under the base URL rather than in a
    per-version 'ucd' directory.
    """
    baseurl = 'https://unicode.org/Public'
    if version == 'UNIDATA':
        url = '%s/%s' % (baseurl, version)
    else:
        url = '%s/%s/ucd' % (baseurl, version)

    request_url = '{}/UCD.zip'.format(url)
    with closing(urlopen(request_url)) as downloaded_file:
        # ZipFile needs a seekable file object, so buffer the whole archive.
        downloaded_data = io.BytesIO(downloaded_file.read())

    with ZipFile(downloaded_data) as zip_file:
        return zip_file.read('DerivedCoreProperties.txt').decode('utf-8')


def read_derived_core_properties(derived_core_properties):
    """Read DerivedCoreProperties.txt content and yield each item.

    Yields `(code_point, property_name)` pairs; a `start..end` range is
    expanded into one pair per code point.  Blank lines and comments
    (including trailing `# ...` comments) are ignored.

    Raises ValueError for a data line that has no ';' separator.
    """
    for raw_line in derived_core_properties.split('\n'):
        # Drop the trailing comment and surrounding whitespace (this also
        # removes a stray '\r' when the file uses CRLF line endings).
        data = raw_line.split('#', 1)[0].strip()
        if not data:
            continue

        fields = data.split(';')
        if len(fields) < 2:
            raise ValueError(
                'malformed DerivedCoreProperties line: %r' % (raw_line,))

        char_range = fields[0].strip()
        char_property = fields[1].strip()

        if '..' in char_range:
            start, end = char_range.split('..')
            for char in range(int(start, 16), int(end, 16) + 1):
                yield (char, char_property)
        else:
            yield (int(char_range, 16), char_property)


def process_derived_core_properties(derived_core_properties):
    """Parse DerivedCoreProperties.txt and returns its version,
    and set of characters with ID_Start and ID_Continue.

    The version is taken from the '# DerivedCoreProperties-X.Y.Z.txt' header
    that must open the text; ValueError is raised when it is missing.
    """
    header = re.match(r'# DerivedCoreProperties-([0-9.]+)\.txt',
                      derived_core_properties)
    if header is None:
        raise ValueError(
            'DerivedCoreProperties.txt does not start with a version header')
    version = header.group(1)

    id_start = set()
    id_continue = set()
    for (char, prop) in read_derived_core_properties(derived_core_properties):
        if prop == 'ID_Start':
            id_start.add(char)
        elif prop == 'ID_Continue':
            id_continue.add(char)

    return (version, id_start, id_continue)


def int_ranges(ints):
    """ Yields consecutive ranges (inclusive) from integer values.

    The input may be in any order and may be empty, in which case nothing
    is yielded.
    """
    ordered = sorted(ints)
    if not ordered:
        return

    start = prev = ordered[0]
    for value in ordered[1:]:
        if value != prev + 1:
            yield (start, prev)
            start = value
        prev = value
    yield (start, prev)


def process_unicode_data(derived_core_properties):
    """Build the BMP flag table/index and the non-BMP membership maps.

    Returns a 7-tuple `(version, table, index, id_start, id_continue,
    non_bmp_id_start_set, non_bmp_id_continue_set)` where `table` holds the
    distinct flag combinations (entry 0 is "no flags"), `index[code]` is the
    position in `table` for each BMP code point, and the two `non_bmp_*`
    values are dicts keyed by the code points above U+FFFF.
    """
    MAX_BMP = 0xffff

    dummy = 0
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * (MAX_BMP + 1)
    non_bmp_id_start_set = {}
    non_bmp_id_continue_set = {}

    (version, id_start, id_continue) = process_derived_core_properties(
        derived_core_properties)

    # The compatibility characters must be visited even when the data file
    # does not list them as ID_Continue, otherwise they never get flagged.
    compat = set(compatibility_identifier_part)
    codes = id_start | id_continue | compat

    # Sorted so that the order of `table` does not depend on set iteration.
    for code in sorted(codes):
        if code > MAX_BMP:
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_continue_set[code] = 1
            continue

        flags = 0
        if code in id_start:
            flags |= FLAG_ID_START
        if code in id_continue or code in compat:
            flags |= FLAG_ID_CONTINUE

        i = cache.get(flags)
        if i is None:
            assert flags not in table
            cache[flags] = i = len(table)
            table.append(flags)
        index[code] = i

    return (
        version,
        table,
        index,
        id_start,
        id_continue,
        non_bmp_id_start_set,
        non_bmp_id_continue_set,
    )


def getsize(data):
    """ return smallest possible integer size for the given array """
    maxdata = max(data)
    assert maxdata < 2**32

    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t):
    """t -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.
    """

    def dump(t1, t2, shift, total_bytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(t1), len(t2), shift, total_bytes), file=sys.stderr)
        print("Size of original table:", len(t) * getsize(t),
              "bytes", file=sys.stderr)

    n = len(t) - 1  # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)            # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}

        for i in range(0, len(t), size):
            chunk = t[i:i + size]

            index = bincache.get(chunk)
            if index is None:
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)

        # determine memory size
        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if b < smallest:
            best = t1, t2, shift
            smallest = b
    t1, t2, shift = best

    print("Best:", end=' ', file=sys.stderr)
    dump(t1, t2, shift, smallest)

    # exhaustively verify that the decomposition is correct
    mask = 2**shift - 1
    for i in range(len(t)):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


def write_table(f, name, type, table, formatter, per_line):
    """Write `table` to `f` as a Rust `pub const` slice named `name`.

    `type` is the Rust element type, `formatter` turns one item into its
    Rust source text, and `per_line` is the number of items per output row.
    """
    f.write(f"""
pub const {name}: &'static [{type}] = &[
""")

    i = 0
    for item in table:
        if i == 0:
            f.write('    ')
        f.write(f'{formatter(item)},')
        i += 1
        if i == per_line:
            i = 0
            f.write("""
""")

    # Close a partially filled last row so that "];" starts on its own line.
    if i != 0:
        f.write("""
""")

    f.write("""\
];
""")


def write_func(f, name, group_set):
    """Write to `f` a Rust predicate `name(c: char) -> bool` that is true
    for the code points in `group_set` (any iterable of ints; iterating a
    dict uses its keys), expressed as a chain of inclusive range checks.
    """
    f.write(f"""
pub fn {name}(c: char) -> bool {{""")

    for (from_code, to_code) in int_ranges(group_set):
        f.write(f"""
    if c >= '\\u{{{from_code:X}}}' && c <= '\\u{{{to_code:X}}}' {{
        return true;
    }}""")

    f.write("""
    false
}
""")


def _rust_uint_type(data):
    """Return the narrowest Rust unsigned type that can hold every value."""
    return {1: 'u8', 2: 'u16', 4: 'u32'}[getsize(data)]


def make_unicode_file(version, table, index,
                      id_start, id_continue,
                      non_bmp_id_start_set, non_bmp_id_continue_set,
                      output_path=DEFAULT_OUTPUT_PATH):
    """Write the generated Rust Unicode data module to `output_path`.

    The arguments are the values returned by process_unicode_data().  The
    BMP `index` is compressed into two levels with splitbins() before being
    written out.
    """
    index1, index2, shift = splitbins(index)

    # verify correctness: the two-level lookup must agree with the flat
    # index for every BMP code point.
    mask = (1 << shift) - 1
    for char, expected in enumerate(index):
        bin_index = index1[char >> shift]
        actual = index2[(bin_index << shift) + (char & mask)]
        assert table[expected] == table[actual]

    with open(output_path, 'w') as f:
        f.write(f"""\
// Generated by update_unicode.py DO NOT MODIFY
// Unicode version: {version}
""")
        f.write(f"""
const FLAG_ID_START: u8 = {FLAG_ID_START};
const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE};
""")
        f.write("""
pub struct CharInfo {
    flags: u8,
}

impl CharInfo {
    pub fn is_id_start(&self) -> bool {
        self.flags & FLAG_ID_START != 0
    }

    pub fn is_id_continue(&self) -> bool {
        self.flags & FLAG_ID_CONTINUE != 0
    }
}
""")

        write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table,
                    lambda flag: f"CharInfo {{ flags: {flag} }}",
                    1)
        # The element type follows the data so that an index that outgrows
        # u8 still produces Rust that compiles.
        write_table(f, 'INDEX1', _rust_uint_type(index1), index1,
                    lambda i: f'{i:4d}', 8)
        write_table(f, 'INDEX2', _rust_uint_type(index2), index2,
                    lambda i: f'{i:4d}', 8)

        f.write(f"""
const SHIFT: usize = {shift};
""")
        f.write("""
pub fn char_info(c: char) -> &'static CharInfo {
    let code = c as usize;
    let index = INDEX1[code >> SHIFT] as usize;
    let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize;

    &CHAR_INFO_TABLE[index]
}
""")

        def format_bool(b):
            # Padded so that 'true' and 'false' line up in columns.
            return 'true ' if b else 'false'

        write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_start), 8)
        write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128),
                    lambda code: format_bool(code in id_continue), 8)

        write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set)
        write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set)


def main(argv=None):
    """Command-line entry point: download the data and write the Rust file.

    `argv` is the argument list without the program name; None means
    sys.argv[1:].
    """
    parser = argparse.ArgumentParser(
        description='Generate Unicode data table for parser')
    parser.add_argument(
        'VERSION',
        help='Unicode version number to download from '
             '<https://unicode.org/Public>. The number must match '
             'a published Unicode version, e.g. use "8.0.0" to download '
             'Unicode 8 files. Alternatively use "UNIDATA" to download '
             'the latest published version.')
    parser.add_argument('PATH_TO_JSPARAGUS', help='Path to jsparagus')
    args = parser.parse_args(argv)

    derived_core_properties = download_derived_core_properties(args.VERSION)

    (
        version,
        table,
        index,
        id_start,
        id_continue,
        non_bmp_id_start_set,
        non_bmp_id_continue_set,
    ) = process_unicode_data(derived_core_properties)

    make_unicode_file(
        version,
        table,
        index,
        id_start,
        id_continue,
        non_bmp_id_start_set,
        non_bmp_id_continue_set,
        output_path=os.path.join(args.PATH_TO_JSPARAGUS, DEFAULT_OUTPUT_PATH),
    )


if __name__ == '__main__':
    main()