#!/usr/bin/env python3 # # Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. # This script uses the following Unicode tables: # - EastAsianWidth.txt # - ReadMe.txt # - UnicodeData.txt # # Since this should not require frequent updates, we just store this # out-of-line and check the generated module into git. import enum import math import os import re import sys NUM_CODEPOINTS = 0x110000 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1)) """The maximum number of bits required to represent a Unicode codepoint.""" class OffsetType(enum.IntEnum): """Represents the data type of a lookup table's offsets. Each variant's value represents the number of bits required to represent that variant's type.""" U2 = 2 """Offsets are 2-bit unsigned integers, packed four-per-byte.""" U4 = 4 """Offsets are 4-bit unsigned integers, packed two-per-byte.""" U8 = 8 """Each offset is a single byte (u8).""" TABLE_CFGS = [ (13, MAX_CODEPOINT_BITS, OffsetType.U8), (6, 13, OffsetType.U8), (0, 6, OffsetType.U2), ] """Represents the format of each level of the multi-level lookup table. A level's entry is of the form `(low_bit, cap_bit, offset_type)`. This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the codepoint and those tables offsets are stored according to `offset_type`. If this is edited, you must ensure that `emit_module` reflects your changes.""" MODULE_FILENAME = "tables.rs" """The filename of the emitted Rust module (will be created in the working directory)""" Codepoint = int BitPos = int def fetch_open(filename: str): """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.""" if not os.path.exists(os.path.basename(filename)): os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") try: return open(filename, encoding="utf-8") except OSError: sys.stderr.write(f"cannot load {filename}") sys.exit(1) def load_unicode_version() -> "tuple[int, int, int]": """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" with fetch_open("ReadMe.txt") as readme: pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" return tuple(map(int, re.search(pattern, readme.read()).groups())) class EffectiveWidth(enum.IntEnum): """Represents the width of a Unicode character. All East Asian Width classes resolve into either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.""" ZERO = 0 """ Zero columns wide. """ NARROW = 1 """ One column wide. """ WIDE = 2 """ Two columns wide. """ AMBIGUOUS = 3 """ Two columns wide in a CJK context. One column wide in all other contexts. """ def load_east_asian_widths() -> "list[EffectiveWidth]": """Return a list of effective widths, indexed by codepoint. Widths are determined by fetching and parsing `EastAsianWidth.txt`. `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`.""" with fetch_open("EastAsianWidth.txt") as eaw: # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." single = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)") # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)") # map between width category code and condensed width width_codes = { **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]}, **{c: EffectiveWidth.WIDE for c in ["W", "F"]}, "A": EffectiveWidth.AMBIGUOUS, } width_map = [] current = 0 for line in eaw.readlines(): raw_data = None # (low, high, width) if match := single.match(line): raw_data = (match.group(1), match.group(1), match.group(2)) elif match := multiple.match(line): raw_data = (match.group(1), match.group(2), match.group(3)) else: continue low = int(raw_data[0], 16) high = int(raw_data[1], 16) width = width_codes[raw_data[2]] assert current <= high while current <= high: # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt. # All such codepoints are implicitly given Neural width (resolves to narrow) width_map.append(EffectiveWidth.NARROW if current < low else width) current += 1 while len(width_map) < NUM_CODEPOINTS: # Catch any leftover codepoints and assign them implicit Neutral/narrow width. width_map.append(EffectiveWidth.NARROW) return width_map def load_zero_widths() -> "list[bool]": """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width character. `c` is considered a zero-width character if `c` is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`).""" with fetch_open("UnicodeData.txt") as categories: zw_map = [] current = 0 for line in categories.readlines(): if len(raw_data := line.split(";")) != 15: continue [codepoint, name, cat_code] = [ int(raw_data[0], 16), raw_data[1], raw_data[2], ] zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"] assert current <= codepoint while current <= codepoint: if name.endswith(", Last>") or current == codepoint: # if name ends with Last, we backfill the width value to all codepoints since # the previous codepoint (aka the start of the range) zw_map.append(zero_width) else: # unassigned characters are implicitly given Neutral width, which is nonzero zw_map.append(False) current += 1 while len(zw_map) < NUM_CODEPOINTS: # Catch any leftover codepoints. They must be unassigned (so nonzero width) zw_map.append(False) return zw_map class Bucket: """A bucket contains a group of codepoints and an ordered width list. If one bucket's width list overlaps with another's width list, those buckets can be merged via `try_extend`.""" def __init__(self): """Creates an empty bucket.""" self.entry_set = set() self.widths = [] def append(self, codepoint: Codepoint, width: EffectiveWidth): """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" self.entry_set.add((codepoint, width)) self.widths.append(width) def try_extend(self, attempt: "Bucket") -> bool: """If either `self` or `attempt`'s width list starts with the other bucket's width list, set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints into `self`, and return `True`. Otherwise, return `False`.""" (less, more) = (self.widths, attempt.widths) if len(self.widths) > len(attempt.widths): (less, more) = (attempt.widths, self.widths) if less != more[: len(less)]: return False self.entry_set |= attempt.entry_set self.widths = more return True def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]": """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" result = list(self.entry_set) result.sort() return result def width(self) -> "EffectiveWidth": """If all codepoints in this bucket have the same width, return that width; otherwise, return `None`.""" if len(self.widths) == 0: return None potential_width = self.widths[0] for width in self.widths[1:]: if potential_width != width: return None return potential_width def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]": """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the same bucket. Returns a list of the buckets in increasing order of those bits.""" num_bits = cap_bit - low_bit assert num_bits > 0 buckets = [Bucket() for _ in range(0, 2 ** num_bits)] mask = (1 << num_bits) - 1 for (codepoint, width) in entries: buckets[(codepoint >> low_bit) & mask].append(codepoint, width) return buckets class Table: """Represents a lookup table. Each table contains a certain number of subtables; each subtable is indexed by a contiguous bit range of the codepoint and contains a list of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should be indexed by sub-table `i` in the next-level lookup table. The entries of this table are indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The key to compression is that two different buckets in two different sub-tables may have the same width list, which means that they can be merged into the same bucket. If no bucket contains two codepoints with different widths, calling `indices_to_widths` will discard the buckets and convert the entries into `EffectiveWidth` values.""" def __init__( self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType ): """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, and each table entry is represented in the format specified by `offset_type`. Asserts that this table is actually representable with `offset_type`.""" self.low_bit = low_bit self.cap_bit = cap_bit self.offset_type = offset_type self.entries = [] self.indexed = [] buckets = [] for entries in entry_groups: buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) for bucket in buckets: for (i, existing) in enumerate(self.indexed): if existing.try_extend(bucket): self.entries.append(i) break else: self.entries.append(len(self.indexed)) self.indexed.append(bucket) # Validate offset type for index in self.entries: assert index < (1 << int(self.offset_type)) def indices_to_widths(self): """Destructively converts the indices in this table to the `EffectiveWidth` values of their buckets. Assumes that no bucket contains codepoints with different widths.""" self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) del self.indexed def buckets(self): """Returns an iterator over this table's buckets.""" return self.indexed def to_bytes(self) -> "list[int]": """Returns this table's entries as a list of bytes. The bytes are formatted according to the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will contain four packed 2-bit entries.""" entries_per_byte = 8 // int(self.offset_type) byte_array = [] for i in range(0, len(self.entries), entries_per_byte): byte = 0 for j in range(0, entries_per_byte): byte |= self.entries[i + j] << (j * int(self.offset_type)) byte_array.append(byte) return byte_array def make_tables( table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries ) -> "list[Table]": """Creates a table for each configuration in `table_cfgs`, with the first config corresponding to the top-level lookup table, the second config corresponding to the second-level lookup table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs to include in the top-level table.""" tables = [] entry_groups = [entries] for (low_bit, cap_bit, offset_type) in table_cfgs: table = Table(entry_groups, low_bit, cap_bit, offset_type) entry_groups = map(lambda bucket: bucket.entries(), table.buckets()) tables.append(table) return tables def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.""" if os.path.exists(out_name): os.remove(out_name) with open(out_name, "w", newline="\n", encoding="utf-8") as module: module.write( """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly """ ) module.write( f""" /// The version of [Unicode](http://www.unicode.org/) /// that this version of unicode-width is based on. pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; """ ) module.write( """ pub mod charwidth { use core::option::Option::{self, None, Some}; /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by /// consulting a multi-level lookup table. /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, /// they're treated as single width. /// /// # Maintenance /// The tables themselves are autogenerated but this function is hardcoded. You should have /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes. #[inline] fn lookup_width(c: char, is_cjk: bool) -> usize { let cp = c as usize; let t1_offset = TABLES_0[cp >> 13 & 0xFF]; // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte, // so each sub-table is 128 bytes in size. // (Sub-tables are selected using the computed offset from the previous table.) let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)]; // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits. // This is accomplished by packing four stored entries into one byte. // So each sub-table is 2**(6-2) == 16 bytes in size. // Since this is the last table, each entry represents an encoded width. let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)]; // Extract the packed width let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; // A width of 3 signifies that the codepoint is ambiguous width. if width == 3 { if is_cjk { 2 } else { 1 } } else { width.into() } } """ ) module.write( """ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or /// `None` if `c` is a control character other than `'\\x00'`. /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, /// they're treated as single width. #[inline] pub fn width(c: char, is_cjk: bool) -> Option { if c < '\\u{7F}' { if c >= '\\u{20}' { // U+0020 to U+007F (exclusive) are single-width ASCII codepoints Some(1) } else if c == '\\0' { // U+0000 *is* a control code, but it's special-cased Some(0) } else { // U+0001 to U+0020 (exclusive) are control codes None } } else if c >= '\\u{A0}' { // No characters >= U+00A0 are control codes, so we can consult the lookup tables Some(lookup_width(c, is_cjk)) } else { // U+007F to U+00A0 (exclusive) are control codes None } } """ ) subtable_count = 1 for (i, table) in enumerate(tables): new_subtable_count = len(table.buckets()) if i == len(tables) - 1: table.indices_to_widths() # for the last table, indices == widths byte_array = table.to_bytes() module.write( f""" /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info. static TABLES_{i}: [u8; {len(byte_array)}] = [""" ) for (j, byte) in enumerate(byte_array): # Add line breaks for every 15th entry (chosen to match what rustfmt does) if j % 15 == 0: module.write("\n ") module.write(f" 0x{byte:02X},") module.write("\n ];\n") subtable_count = new_subtable_count module.write("}\n") def main(module_filename: str): """Obtain character data from the latest version of Unicode, transform it into a multi-level lookup table for character width, and write a Rust module utilizing that table to `module_filename`. We obey the following rules in decreasing order of importance: - The soft hyphen (`U+00AD`) is single-width. - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width. - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width. - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. These rules are based off of Markus Kuhn's free `wcwidth()` implementation: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c""" version = load_unicode_version() print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") eaw_map = load_east_asian_widths() zw_map = load_zero_widths() # Characters marked as zero-width in zw_map should be zero-width in the final map width_map = list( map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map)) ) # Override for soft hyphen width_map[0x00AD] = EffectiveWidth.NARROW # Override for Hangul Jamo medial vowels & final consonants for i in range(0x1160, 0x11FF + 1): width_map[i] = EffectiveWidth.ZERO tables = make_tables(TABLE_CFGS, enumerate(width_map)) print("------------------------") total_size = 0 for (i, table) in enumerate(tables): size_bytes = len(table.to_bytes()) print(f"Table {i} Size: {size_bytes} bytes") total_size += size_bytes print("------------------------") print(f" Total Size: {total_size} bytes") emit_module(module_filename, version, tables) print(f'Wrote to "{module_filename}"') if __name__ == "__main__": main(MODULE_FILENAME)