diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:11:38 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:13:23 +0000 |
commit | 20431706a863f92cb37dc512fef6e48d192aaf2c (patch) | |
tree | 2867f13f5fd5437ba628c67d7f87309ccadcd286 /vendor/unicode-width/scripts/unicode.py | |
parent | Releasing progress-linux version 1.65.0+dfsg1-2~progress7.99u1. (diff) | |
download | rustc-20431706a863f92cb37dc512fef6e48d192aaf2c.tar.xz rustc-20431706a863f92cb37dc512fef6e48d192aaf2c.zip |
Merging upstream version 1.66.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/unicode-width/scripts/unicode.py')
-rwxr-xr-x | vendor/unicode-width/scripts/unicode.py | 748 |
1 files changed, 466 insertions, 282 deletions
diff --git a/vendor/unicode-width/scripts/unicode.py b/vendor/unicode-width/scripts/unicode.py index 7f5959d4b..2efb0b63f 100755 --- a/vendor/unicode-width/scripts/unicode.py +++ b/vendor/unicode-width/scripts/unicode.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT +# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # @@ -16,11 +16,322 @@ # - UnicodeData.txt # # Since this should not require frequent updates, we just store this -# out-of-line and check the unicode.rs file into git. - -import fileinput, re, os, sys, operator +# out-of-line and check the generated module into git. + +import enum +import math +import os +import re +import sys + +NUM_CODEPOINTS = 0x110000 +"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" + +MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1)) +"""The maximum number of bits required to represent a Unicode codepoint.""" + + +class OffsetType(enum.IntEnum): + """Represents the data type of a lookup table's offsets. Each variant's value represents the + number of bits required to represent that variant's type.""" + + U2 = 2 + """Offsets are 2-bit unsigned integers, packed four-per-byte.""" + U4 = 4 + """Offsets are 4-bit unsigned integers, packed two-per-byte.""" + U8 = 8 + """Each offset is a single byte (u8).""" + + +TABLE_CFGS = [ + (13, MAX_CODEPOINT_BITS, OffsetType.U8), + (6, 13, OffsetType.U8), + (0, 6, OffsetType.U2), +] +"""Represents the format of each level of the multi-level lookup table. +A level's entry is of the form `(low_bit, cap_bit, offset_type)`. +This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the +codepoint and those tables offsets are stored according to `offset_type`. 
+ +If this is edited, you must ensure that `emit_module` reflects your changes.""" + +MODULE_FILENAME = "tables.rs" +"""The filename of the emitted Rust module (will be created in the working directory)""" + +Codepoint = int +BitPos = int + + +def fetch_open(filename: str): + """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, + fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.""" + if not os.path.exists(os.path.basename(filename)): + os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") + try: + return open(filename, encoding="utf-8") + except OSError: + sys.stderr.write(f"cannot load {filename}") + sys.exit(1) + + +def load_unicode_version() -> "tuple[int, int, int]": + """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" + with fetch_open("ReadMe.txt") as readme: + pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" + return tuple(map(int, re.search(pattern, readme.read()).groups())) + + +class EffectiveWidth(enum.IntEnum): + """Represents the width of a Unicode character. All East Asian Width classes resolve into + either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.""" + + ZERO = 0 + """ Zero columns wide. """ + NARROW = 1 + """ One column wide. """ + WIDE = 2 + """ Two columns wide. """ + AMBIGUOUS = 3 + """ Two columns wide in a CJK context. One column wide in all other contexts. """ + + +def load_east_asian_widths() -> "list[EffectiveWidth]": + """Return a list of effective widths, indexed by codepoint. + Widths are determined by fetching and parsing `EastAsianWidth.txt`. + + `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. + + `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. 
+ + `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`.""" + with fetch_open("EastAsianWidth.txt") as eaw: + # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." + single = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)") + # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." + multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)") + # map between width category code and condensed width + width_codes = { + **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]}, + **{c: EffectiveWidth.WIDE for c in ["W", "F"]}, + "A": EffectiveWidth.AMBIGUOUS, + } -preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT + width_map = [] + current = 0 + for line in eaw.readlines(): + raw_data = None # (low, high, width) + if match := single.match(line): + raw_data = (match.group(1), match.group(1), match.group(2)) + elif match := multiple.match(line): + raw_data = (match.group(1), match.group(2), match.group(3)) + else: + continue + low = int(raw_data[0], 16) + high = int(raw_data[1], 16) + width = width_codes[raw_data[2]] + + assert current <= high + while current <= high: + # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt. + # All such codepoints are implicitly given Neutral width (resolves to narrow) + width_map.append(EffectiveWidth.NARROW if current < low else width) + current += 1 + + while len(width_map) < NUM_CODEPOINTS: + # Catch any leftover codepoints and assign them implicit Neutral/narrow width. + width_map.append(EffectiveWidth.NARROW) + + return width_map + + +def load_zero_widths() -> "list[bool]": + """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width + character. 
`c` is considered a zero-width character if `c` is in general categories + `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`).""" + with fetch_open("UnicodeData.txt") as categories: + zw_map = [] + current = 0 + for line in categories.readlines(): + if len(raw_data := line.split(";")) != 15: + continue + [codepoint, name, cat_code] = [ + int(raw_data[0], 16), + raw_data[1], + raw_data[2], + ] + zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"] + + assert current <= codepoint + while current <= codepoint: + if name.endswith(", Last>") or current == codepoint: + # if name ends with Last, we backfill the width value to all codepoints since + # the previous codepoint (aka the start of the range) + zw_map.append(zero_width) + else: + # unassigned characters are implicitly given Neutral width, which is nonzero + zw_map.append(False) + current += 1 + + while len(zw_map) < NUM_CODEPOINTS: + # Catch any leftover codepoints. They must be unassigned (so nonzero width) + zw_map.append(False) + + return zw_map + + +class Bucket: + """A bucket contains a group of codepoints and an ordered width list. If one bucket's width + list overlaps with another's width list, those buckets can be merged via `try_extend`.""" + + def __init__(self): + """Creates an empty bucket.""" + self.entry_set = set() + self.widths = [] + + def append(self, codepoint: Codepoint, width: EffectiveWidth): + """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" + self.entry_set.add((codepoint, width)) + self.widths.append(width) + + def try_extend(self, attempt: "Bucket") -> bool: + """If either `self` or `attempt`'s width list starts with the other bucket's width list, + set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints + into `self`, and return `True`. 
Otherwise, return `False`.""" + (less, more) = (self.widths, attempt.widths) + if len(self.widths) > len(attempt.widths): + (less, more) = (attempt.widths, self.widths) + if less != more[: len(less)]: + return False + self.entry_set |= attempt.entry_set + self.widths = more + return True + + def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]": + """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" + result = list(self.entry_set) + result.sort() + return result + + def width(self) -> "EffectiveWidth": + """If all codepoints in this bucket have the same width, return that width; otherwise, + return `None`.""" + if len(self.widths) == 0: + return None + potential_width = self.widths[0] + for width in self.widths[1:]: + if potential_width != width: + return None + return potential_width + + +def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]": + """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All + codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the + same bucket. Returns a list of the buckets in increasing order of those bits.""" + num_bits = cap_bit - low_bit + assert num_bits > 0 + buckets = [Bucket() for _ in range(0, 2 ** num_bits)] + mask = (1 << num_bits) - 1 + for (codepoint, width) in entries: + buckets[(codepoint >> low_bit) & mask].append(codepoint, width) + return buckets + + +class Table: + """Represents a lookup table. Each table contains a certain number of subtables; each + subtable is indexed by a contiguous bit range of the codepoint and contains a list + of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) + + Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should + be indexed by sub-table `i` in the next-level lookup table. 
The entries of this table are + indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The + key to compression is that two different buckets in two different sub-tables may have the + same width list, which means that they can be merged into the same bucket. + + If no bucket contains two codepoints with different widths, calling `indices_to_widths` will + discard the buckets and convert the entries into `EffectiveWidth` values.""" + + def __init__( + self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType + ): + """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator + in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, + and each table entry is represented in the format specified by `offset_type`. Asserts + that this table is actually representable with `offset_type`.""" + self.low_bit = low_bit + self.cap_bit = cap_bit + self.offset_type = offset_type + self.entries = [] + self.indexed = [] + + buckets = [] + for entries in entry_groups: + buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) + + for bucket in buckets: + for (i, existing) in enumerate(self.indexed): + if existing.try_extend(bucket): + self.entries.append(i) + break + else: + self.entries.append(len(self.indexed)) + self.indexed.append(bucket) + + # Validate offset type + for index in self.entries: + assert index < (1 << int(self.offset_type)) + + def indices_to_widths(self): + """Destructively converts the indices in this table to the `EffectiveWidth` values of + their buckets. Assumes that no bucket contains codepoints with different widths.""" + self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) + del self.indexed + + def buckets(self): + """Returns an iterator over this table's buckets.""" + return self.indexed + + def to_bytes(self) -> "list[int]": + """Returns this table's entries as a list of bytes. 
The bytes are formatted according to + the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries + to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will + contain four packed 2-bit entries.""" + entries_per_byte = 8 // int(self.offset_type) + byte_array = [] + for i in range(0, len(self.entries), entries_per_byte): + byte = 0 + for j in range(0, entries_per_byte): + byte |= self.entries[i + j] << (j * int(self.offset_type)) + byte_array.append(byte) + return byte_array + + +def make_tables( + table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries +) -> "list[Table]": + """Creates a table for each configuration in `table_cfgs`, with the first config corresponding + to the top-level lookup table, the second config corresponding to the second-level lookup + table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs + to include in the top-level table.""" + tables = [] + entry_groups = [entries] + for (low_bit, cap_bit, offset_type) in table_cfgs: + table = Table(entry_groups, low_bit, cap_bit, offset_type) + entry_groups = map(lambda bucket: bucket.entries(), table.buckets()) + tables.append(table) + return tables + + +def emit_module( + out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" +): + """Outputs a Rust module to `out_name` using table data from `tables`. + If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.""" + if os.path.exists(out_name): + os.remove(out_name) + with open(out_name, "w", newline="\n", encoding="utf-8") as module: + module.write( + """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -31,291 +342,164 @@ preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRI // except according to those terms. 
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly - -#![allow(missing_docs, non_upper_case_globals, non_snake_case)] -''' - -# Mapping taken from Table 12 from: -# http://www.unicode.org/reports/tr44/#General_Category_Values -expanded_categories = { - 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], - 'Lm': ['L'], 'Lo': ['L'], - 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], - 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], - 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], - 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], - 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], - 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], - 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], -} - -# these are the surrogate codepoints, which are not valid rust characters -surrogate_codepoints = (0xd800, 0xdfff) - -def fetch(f): - if not os.path.exists(os.path.basename(f)): - os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" - % f) - - if not os.path.exists(os.path.basename(f)): - sys.stderr.write("cannot load %s" % f) - exit(1) - -def is_surrogate(n): - return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] - -def load_unicode_data(f): - fetch(f) - gencats = {} - - udict = {} - range_start = -1 - for line in fileinput.input(f): - data = line.split(';') - if len(data) != 15: - continue - cp = int(data[0], 16) - if is_surrogate(cp): - continue - if range_start >= 0: - for i in range(range_start, cp): - udict[i] = data - range_start = -1 - if data[1].endswith(", First>"): - range_start = cp - continue - udict[cp] = data - - for code in udict: - [code_org, name, gencat, combine, bidi, - decomp, deci, digit, num, mirror, - old, iso, upcase, lowcase, titlecase ] = udict[code] - - # place letter in categories as appropriate - for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): - if cat not in gencats: - gencats[cat] = [] - gencats[cat].append(code) - - gencats = group_cats(gencats) - - return gencats - -def 
group_cats(cats): - cats_out = {} - for cat in cats: - cats_out[cat] = group_cat(cats[cat]) - return cats_out - -def group_cat(cat): - cat_out = [] - letters = sorted(set(cat)) - cur_start = letters.pop(0) - cur_end = cur_start - for letter in letters: - assert letter > cur_end, \ - "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter)) - if letter == cur_end + 1: - cur_end = letter - else: - cat_out.append((cur_start, cur_end)) - cur_start = cur_end = letter - cat_out.append((cur_start, cur_end)) - return cat_out - -def format_table_content(f, content, indent): - line = " "*indent - first = True - for chunk in content.split(","): - if len(line) + len(chunk) < 98: - if first: - line += chunk - else: - line += ", " + chunk - first = False - else: - f.write(line + ",\n") - line = " "*indent + chunk - f.write(line) - -# load all widths of want_widths, except those in except_cats -def load_east_asian_width(want_widths, except_cats): - f = "EastAsianWidth.txt" - fetch(f) - widths = {} - re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)") - re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)") - - for line in fileinput.input(f): - width = None - d_lo = 0 - d_hi = 0 - cat = None - m = re1.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(1) - width = m.group(2) - cat = m.group(3) - else: - m = re2.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(2) - width = m.group(3) - cat = m.group(4) - else: - continue - if cat in except_cats or width not in want_widths: - continue - d_lo = int(d_lo, 16) - d_hi = int(d_hi, 16) - if width not in widths: - widths[width] = [] - widths[width].append((d_lo, d_hi)) - return widths - -def escape_char(c): - return "'\\u{%x}'" % c - -def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, - pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True): - pub_string = "const" - if not is_const: - pub_string = "let" - if is_pub: - pub_string = "pub " + pub_string - f.write(" %s 
%s: %s = &[\n" % (pub_string, name, t_type)) - data = "" - first = True - for dat in t_data: - if not first: - data += "," - first = False - data += pfun(dat) - format_table_content(f, data, 8) - f.write("\n ];\n\n") - -def emit_charwidth_module(f, width_table): - f.write("pub mod charwidth {") - f.write(""" - use core::option::Option::{self, Some, None}; - use core::result::Result::{Ok, Err}; - +""" + ) + module.write( + f""" +/// The version of [Unicode](http://www.unicode.org/) +/// that this version of unicode-width is based on. +pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; +""" + ) + + module.write( + """ +pub mod charwidth { + use core::option::Option::{self, None, Some}; + + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by + /// consulting a multi-level lookup table. + /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, + /// they're treated as single width. + /// + /// # Maintenance + /// The tables themselves are autogenerated but this function is hardcoded. You should have + /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) + /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the + /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes. 
#[inline] - fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 { - use core::cmp::Ordering::{Equal, Less, Greater}; - match r.binary_search_by(|&(lo, hi, _, _)| { - if lo <= c && c <= hi { Equal } - else if hi < c { Less } - else { Greater } - }) { - Ok(idx) => { - let (_, _, r_ncjk, r_cjk) = r[idx]; - if is_cjk { r_cjk } else { r_ncjk } + fn lookup_width(c: char, is_cjk: bool) -> usize { + let cp = c as usize; + + let t1_offset = TABLES_0[cp >> 13 & 0xFF]; + + // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte, + // so each sub-table is 128 bytes in size. + // (Sub-tables are selected using the computed offset from the previous table.) + let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)]; + + // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits. + // This is accomplished by packing four stored entries into one byte. + // So each sub-table is 2**(6-2) == 16 bytes in size. + // Since this is the last table, each entry represents an encoded width. + let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)]; + + // Extract the packed width + let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; + + // A width of 3 signifies that the codepoint is ambiguous width. + if width == 3 { + if is_cjk { + 2 + } else { + 1 } - Err(_) => 1 + } else { + width.into() } } -""") - - f.write(""" +""" + ) + + module.write( + """ + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or + /// `None` if `c` is a control character other than `'\\x00'`. + /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, + /// they're treated as single width. 
#[inline] pub fn width(c: char, is_cjk: bool) -> Option<usize> { - match c as usize { - _c @ 0 => Some(0), // null is zero width - cu if cu < 0x20 => None, // control sequences have no width - cu if cu < 0x7F => Some(1), // ASCII - cu if cu < 0xA0 => None, // more control sequences - _ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize) + if c < '\\u{7F}' { + if c >= '\\u{20}' { + // U+0020 to U+007F (exclusive) are single-width ASCII codepoints + Some(1) + } else if c == '\\0' { + // U+0000 *is* a control code, but it's special-cased + Some(0) + } else { + // U+0001 to U+0020 (exclusive) are control codes + None + } + } else if c >= '\\u{A0}' { + // No characters >= U+00A0 are control codes, so we can consult the lookup tables + Some(lookup_width(c, is_cjk)) + } else { + // U+007F to U+00A0 (exclusive) are control codes + None } } +""" + ) + + subtable_count = 1 + for (i, table) in enumerate(tables): + new_subtable_count = len(table.buckets()) + if i == len(tables) - 1: + table.indices_to_widths() # for the last table, indices == widths + byte_array = table.to_bytes() + module.write( + f""" + /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info. + static TABLES_{i}: [u8; {len(byte_array)}] = [""" + ) + for (j, byte) in enumerate(byte_array): + # Add line breaks for every 15th entry (chosen to match what rustfmt does) + if j % 15 == 0: + module.write("\n ") + module.write(f" 0x{byte:02X},") + module.write("\n ];\n") + subtable_count = new_subtable_count + module.write("}\n") + + +def main(module_filename: str): + """Obtain character data from the latest version of Unicode, transform it into a multi-level + lookup table for character width, and write a Rust module utilizing that table to + `module_filename`. + + We obey the following rules in decreasing order of importance: + - The soft hyphen (`U+00AD`) is single-width. + - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width. 
+ - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width. + - All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width. + - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. + - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width + of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. + + These rules are based off of Markus Kuhn's free `wcwidth()` implementation: + http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c""" + version = load_unicode_version() + print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") + + eaw_map = load_east_asian_widths() + zw_map = load_zero_widths() + + # Characters marked as zero-width in zw_map should be zero-width in the final map + width_map = list( + map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map)) + ) + + # Override for soft hyphen + width_map[0x00AD] = EffectiveWidth.NARROW + + # Override for Hangul Jamo medial vowels & final consonants + for i in range(0x1160, 0x11FF + 1): + width_map[i] = EffectiveWidth.ZERO + + tables = make_tables(TABLE_CFGS, enumerate(width_map)) + + print("------------------------") + total_size = 0 + for (i, table) in enumerate(tables): + size_bytes = len(table.to_bytes()) + print(f"Table {i} Size: {size_bytes} bytes") + total_size += size_bytes + print("------------------------") + print(f" Total Size: {total_size} bytes") + + emit_module(module_filename, version, tables) + print(f'Wrote to "{module_filename}"') -""") - - f.write(" // character width table. 
Based on Markus Kuhn's free wcwidth() implementation,\n") - f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n") - emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False, - pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3])) - f.write("}\n\n") - -def remove_from_wtable(wtable, val): - wtable_out = [] - while wtable: - if wtable[0][1] < val: - wtable_out.append(wtable.pop(0)) - elif wtable[0][0] > val: - break - else: - (wt_lo, wt_hi, width, width_cjk) = wtable.pop(0) - if wt_lo == wt_hi == val: - continue - elif wt_lo == val: - wtable_out.append((wt_lo+1, wt_hi, width, width_cjk)) - elif wt_hi == val: - wtable_out.append((wt_lo, wt_hi-1, width, width_cjk)) - else: - wtable_out.append((wt_lo, val-1, width, width_cjk)) - wtable_out.append((val+1, wt_hi, width, width_cjk)) - if wtable: - wtable_out.extend(wtable) - return wtable_out - - - -def optimize_width_table(wtable): - wtable_out = [] - w_this = wtable.pop(0) - while wtable: - if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]: - w_tmp = wtable.pop(0) - w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3]) - else: - wtable_out.append(w_this) - w_this = wtable.pop(0) - wtable_out.append(w_this) - return wtable_out if __name__ == "__main__": - r = "tables.rs" - if os.path.exists(r): - os.remove(r) - with open(r, "w") as rf: - # write the file's preamble - rf.write(preamble) - - # download and parse all the data - fetch("ReadMe.txt") - with open("ReadMe.txt") as readme: - pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" - unicode_version = re.search(pattern, readme.read()).groups() - rf.write(""" -/// The version of [Unicode](http://www.unicode.org/) -/// that this version of unicode-width is based on. 
-pub const UNICODE_VERSION: (u8, u8, u8) = (%s, %s, %s); - -""" % unicode_version) - gencats = load_unicode_data("UnicodeData.txt") - - ### character width module - width_table = [] - for zwcat in ["Me", "Mn", "Cf"]: - width_table.extend([(lo_hi[0], lo_hi[1], 0, 0) for lo_hi in gencats[zwcat]]) - width_table.append((4448, 4607, 0, 0)) - - # get widths, except those that are explicitly marked zero-width above - ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"]) - # these are doublewidth - for dwcat in ["W", "F"]: - width_table.extend([(lo_hi1[0], lo_hi1[1], 2, 2) for lo_hi1 in ea_widths[dwcat]]) - width_table.extend([(lo_hi2[0], lo_hi2[1], 1, 2) for lo_hi2 in ea_widths["A"]]) - - width_table.sort(key=lambda w: w[0]) - - # soft hyphen is not zero width in preformatted text; it's used to indicate - # a hyphen inserted to facilitate a linebreak. - width_table = remove_from_wtable(width_table, 173) - - # optimize the width table by collapsing adjacent entities when possible - width_table = optimize_width_table(width_table) - emit_charwidth_module(rf, width_table) + main(MODULE_FILENAME) |