path: root/third_party/rust/jsparagus/update_unicode.py
Diffstat (limited to 'third_party/rust/jsparagus/update_unicode.py')
-rw-r--r--  third_party/rust/jsparagus/update_unicode.py  354
1 file changed, 354 insertions(+), 0 deletions(-)
diff --git a/third_party/rust/jsparagus/update_unicode.py b/third_party/rust/jsparagus/update_unicode.py
new file mode 100644
index 0000000000..3f2bba0956
--- /dev/null
+++ b/third_party/rust/jsparagus/update_unicode.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+
+""" Generate Unicode data table for parser
+"""
+
+import argparse
+import io
+import re
+import sys
+from contextlib import closing
+from itertools import tee, zip_longest
+from urllib.request import urlopen
+from zipfile import ZipFile
+
+
+# These are also part of IdentifierPart (§11.6 Names and Keywords)
+compatibility_identifier_part = [
+ ord(u'\N{ZERO WIDTH NON-JOINER}'),
+ ord(u'\N{ZERO WIDTH JOINER}'),
+]
+
+FLAG_ID_START = 1 << 0
+FLAG_ID_CONTINUE = 1 << 1
+
+
+def download_derived_core_properties(version):
+ """Downloads UCD.zip for given version, and return the content of
+ DerivedCoreProperties.txt. """
+
+ baseurl = 'https://unicode.org/Public'
+ if version == 'UNIDATA':
+ url = '%s/%s' % (baseurl, version)
+ else:
+ url = '%s/%s/ucd' % (baseurl, version)
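+    # For example (derived from the URL construction above):
+    #   '13.0.0'  -> https://unicode.org/Public/13.0.0/ucd/UCD.zip
+    #   'UNIDATA' -> https://unicode.org/Public/UNIDATA/UCD.zip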
+
+ request_url = '{}/UCD.zip'.format(url)
+ with closing(urlopen(request_url)) as downloaded_file:
+ downloaded_data = io.BytesIO(downloaded_file.read())
+
+ with ZipFile(downloaded_data) as zip_file:
+ return zip_file.read('DerivedCoreProperties.txt').decode()
+
+
+def read_derived_core_properties(derived_core_properties):
+ """Read DerivedCoreProperties.txt content and yield each item. """
+ for line in derived_core_properties.split('\n'):
+ if line == '' or line.startswith('#'):
+ continue
+ row = line.split('#')[0].split(';')
+ char_range = row[0].strip()
+ char_property = row[1].strip()
+ if '..' not in char_range:
+ yield (int(char_range, 16), char_property)
+ else:
+ [start, end] = char_range.split('..')
+ for char in range(int(start, 16), int(end, 16) + 1):
+ yield (char, char_property)
+
+
+def process_derived_core_properties(derived_core_properties):
+ """Parse DerivedCoreProperties.txt and returns its version,
+ and set of characters with ID_Start and ID_Continue. """
+ id_start = set()
+ id_continue = set()
+
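+    # The file starts with a version header, e.g.:
+    #   # DerivedCoreProperties-13.0.0.txt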
+    m = re.match(r'# DerivedCoreProperties-([0-9.]+)\.txt', derived_core_properties)
+ assert m
+ version = m.group(1)
+
+ for (char, prop) in read_derived_core_properties(derived_core_properties):
+ if prop == 'ID_Start':
+ id_start.add(char)
+ if prop == 'ID_Continue':
+ id_continue.add(char)
+
+ return (version, id_start, id_continue)
+
+
+def int_ranges(ints):
+ """ Yields consecutive ranges (inclusive) from integer values. """
+ (a, b) = tee(sorted(ints))
+ start = next(b)
+ for (curr, succ) in zip_longest(a, b):
+ if curr + 1 != succ:
+ yield (start, curr)
+ start = succ
+
+
+def process_unicode_data(derived_core_properties):
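+    """Build the identifier-lookup structures from DerivedCoreProperties.txt.
+
+    Returns the Unicode version, a deduplicated table of flag combinations,
+    an index mapping each BMP code point to an entry in that table, the raw
+    ID_Start and ID_Continue sets, and dicts of non-BMP ID_Start and
+    ID_Continue code points.
+    """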
+    MAX_BMP = 0xffff  # highest code point in the Basic Multilingual Plane
+
+ dummy = 0
+ table = [dummy]
+ cache = {dummy: 0}
+ index = [0] * (MAX_BMP + 1)
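+    # The two dicts below are used as sets: only the keys (non-BMP code
+    # points) matter; write_func later iterates over .keys().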
+ non_bmp_id_start_set = {}
+ non_bmp_id_continue_set = {}
+
+ (version, id_start, id_continue) = process_derived_core_properties(derived_core_properties)
+ codes = id_start.union(id_continue)
+
+ for code in codes:
+ if code > MAX_BMP:
+ if code in id_start:
+ non_bmp_id_start_set[code] = 1
+ if code in id_continue:
+ non_bmp_id_continue_set[code] = 1
+ continue
+
+ flags = 0
+ if code in id_start:
+ flags |= FLAG_ID_START
+ if code in id_continue or code in compatibility_identifier_part:
+ flags |= FLAG_ID_CONTINUE
+
+ i = cache.get(flags)
+ if i is None:
+ assert flags not in table
+ cache[flags] = i = len(table)
+ table.append(flags)
+ index[code] = i
+
+ return (
+ version,
+ table,
+ index,
+ id_start,
+ id_continue,
+ non_bmp_id_start_set,
+ non_bmp_id_continue_set,
+ )
+
+
+def getsize(data):
+ """ return smallest possible integer size for the given array """
+ maxdata = max(data)
+ assert maxdata < 2**32
+
+ if maxdata < 256:
+ return 1
+ elif maxdata < 65536:
+ return 2
+ else:
+ return 4
+
+
+def splitbins(t):
+ """t -> (t1, t2, shift). Split a table to save space.
+
+ t is a sequence of ints. This function can be useful to save space if
+ many of the ints are the same. t1 and t2 are lists of ints, and shift
+ is an int, chosen to minimize the combined size of t1 and t2 (in C
+ code), and where for each i in range(len(t)),
+ t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+ where mask is a bitmask isolating the last "shift" bits.
+ """
+
+ def dump(t1, t2, shift, bytes):
+ print("%d+%d bins at shift %d; %d bytes" % (
+ len(t1), len(t2), shift, bytes), file=sys.stderr)
+ print("Size of original table:", len(t) * getsize(t),
+ "bytes", file=sys.stderr)
+
+ n = len(t)-1 # last valid index
+ maxshift = 0 # the most we can shift n and still have something left
+ if n > 0:
+ while n >> 1:
+ n >>= 1
+ maxshift += 1
+ del n
+ bytes = sys.maxsize # smallest total size so far
+ t = tuple(t) # so slices can be dict keys
+ for shift in range(maxshift + 1):
+ t1 = []
+ t2 = []
+ size = 2**shift
+ bincache = {}
+
+ for i in range(0, len(t), size):
+ bin = t[i:i + size]
+
+ index = bincache.get(bin)
+ if index is None:
+ index = len(t2)
+ bincache[bin] = index
+ t2.extend(bin)
+ t1.append(index >> shift)
+
+ # determine memory size
+ b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
+ if b < bytes:
+ best = t1, t2, shift
+ bytes = b
+ t1, t2, shift = best
+
+ print("Best:", end=' ', file=sys.stderr)
+ dump(t1, t2, shift, bytes)
+
+ # exhaustively verify that the decomposition is correct
+ mask = 2**shift - 1
+ for i in range(len(t)):
+ assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+ return best
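+
+# A minimal usage sketch (hypothetical input, for illustration only):
+#
+#   t = [7] * 1000 + [3] + [7] * 1000
+#   t1, t2, shift = splitbins(t)
+#   mask = (1 << shift) - 1
+#   assert all(t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+#              for i in range(len(t)))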
+
+
+def write_table(f, name, type, table, formatter, per_line):
+ f.write(f"""
+pub const {name}: &'static [{type}] = &[
+""")
+
+ i = 0
+ for item in table:
+ if i == 0:
+ f.write(' ')
+ f.write(f'{formatter(item)},')
+ i += 1
+ if i == per_line:
+ i = 0
+ f.write("""
+""")
+
+ f.write("""\
+];
+""")
+
+
+def write_func(f, name, group_set):
+ f.write(f"""
+pub fn {name}(c: char) -> bool {{""")
+
+ for (from_code, to_code) in int_ranges(group_set.keys()):
+ f.write(f"""
+ if c >= \'\\u{{{from_code:X}}}\' && c <= \'\\u{{{to_code:X}}}\' {{
+ return true;
+ }}""")
+
+ f.write("""
+ false
+}
+""")
+
+
+def make_unicode_file(version, table, index,
+ id_start, id_continue,
+ non_bmp_id_start_set, non_bmp_id_continue_set):
+ index1, index2, shift = splitbins(index)
+
+    # verify correctness for every BMP code point
+    for char in range(len(index)):
+ test = table[index[char]]
+
+ idx = index1[char >> shift]
+ idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
+
+ assert test == table[idx]
+
+ with open('crates/parser/src/unicode_data.rs', 'w') as f:
+ f.write(f"""\
+// Generated by update_unicode.py DO NOT MODIFY
+// Unicode version: {version}
+""")
+
+ f.write(f"""
+const FLAG_ID_START: u8 = {FLAG_ID_START};
+const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE};
+""")
+
+ f.write("""
+pub struct CharInfo {
+ flags: u8,
+}
+
+impl CharInfo {
+ pub fn is_id_start(&self) -> bool {
+ self.flags & FLAG_ID_START != 0
+ }
+
+ pub fn is_id_continue(&self) -> bool {
+ self.flags & FLAG_ID_CONTINUE != 0
+ }
+}
+""")
+
+ write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table,
+ lambda flag: f"CharInfo {{ flags: {flag} }}",
+ 1)
+ write_table(f, 'INDEX1', 'u8', index1,
+ lambda i: f'{i:4d}', 8)
+ write_table(f, 'INDEX2', 'u8', index2,
+ lambda i: f'{i:4d}', 8)
+
+ f.write(f"""
+const SHIFT: usize = {shift};
+""")
+
+ f.write("""
+pub fn char_info(c: char) -> &'static CharInfo {
+ let code = c as usize;
+ let index = INDEX1[code >> SHIFT] as usize;
+ let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize;
+
+ &CHAR_INFO_TABLE[index]
+}
+""")
+
+ def format_bool(b):
+ if b:
+ return 'true '
+ else:
+ return 'false'
+
+ write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128),
+ lambda code: format_bool(code in id_start), 8)
+ write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128),
+ lambda code: format_bool(code in id_continue), 8)
+
+ write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set)
+ write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set)
+
+
+
+parser = argparse.ArgumentParser(description='Generate Unicode data tables for the parser')
+parser.add_argument('VERSION',
+                    help='Unicode version number to download from\
+                    <https://unicode.org/Public>. The number must match\
+                    a published Unicode version, e.g. use\
+                    "8.0.0" to download Unicode 8 files. Alternatively use\
+                    "UNIDATA" to download the latest published version.')
+parser.add_argument('PATH_TO_JSPARAGUS',
+                    help='Path to jsparagus (currently unused by this script)')
+args = parser.parse_args()
+
+derived_core_properties = download_derived_core_properties(args.VERSION)
+
+(
+ version,
+ table,
+ index,
+ id_start,
+ id_continue,
+ non_bmp_id_start_set,
+ non_bmp_id_continue_set,
+) = process_unicode_data(derived_core_properties)
+
+make_unicode_file(
+ version,
+ table,
+ index,
+ id_start,
+ id_continue,
+ non_bmp_id_start_set,
+ non_bmp_id_continue_set,
+)