Merging upstream version 1.66.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:11:38 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:13:23 +0000
commit: 20431706a863f92cb37dc512fef6e48d192aaf2c (patch)
tree: 2867f13f5fd5437ba628c67d7f87309ccadcd286 /vendor/unicode-width/scripts/unicode.py
parent: Releasing progress-linux version 1.65.0+dfsg1-2~progress7.99u1. (diff)
download: rustc-20431706a863f92cb37dc512fef6e48d192aaf2c.tar.xz
rustc-20431706a863f92cb37dc512fef6e48d192aaf2c.zip
1 files changed, 466 insertions, 282 deletions
diff --git a/vendor/unicode-width/scripts/unicode.py b/vendor/unicode-width/scripts/unicode.py
index 7f5959d4b..2efb0b63f 100755
--- a/vendor/unicode-width/scripts/unicode.py
+++ b/vendor/unicode-width/scripts/unicode.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
+# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
 # file at the top-level directory of this distribution and at
 # http://rust-lang.org/COPYRIGHT.
 #
@@ -16,11 +16,322 @@
 # - UnicodeData.txt
 #
 # Since this should not require frequent updates, we just store this
-# out-of-line and check the unicode.rs file into git.
-
-import fileinput, re, os, sys, operator
+# out-of-line and check the generated module into git.
+
+import enum
+import math
+import os
+import re
+import sys
+
+NUM_CODEPOINTS = 0x110000
+"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
+
+MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
+"""The maximum number of bits required to represent a Unicode codepoint."""
+
+
+class OffsetType(enum.IntEnum):
+    """Represents the data type of a lookup table's offsets. Each variant's value represents the
+    number of bits required to represent that variant's type."""
+
+    U2 = 2
+    """Offsets are 2-bit unsigned integers, packed four-per-byte."""
+    U4 = 4
+    """Offsets are 4-bit unsigned integers, packed two-per-byte."""
+    U8 = 8
+    """Each offset is a single byte (u8)."""
+
+
+TABLE_CFGS = [
+    (13, MAX_CODEPOINT_BITS, OffsetType.U8),
+    (6, 13, OffsetType.U8),
+    (0, 6, OffsetType.U2),
+]
+"""Represents the format of each level of the multi-level lookup table.
+A level's entry is of the form `(low_bit, cap_bit, offset_type)`.
+This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the
+codepoint and those tables' offsets are stored according to `offset_type`.
+
+If this is edited, you must ensure that `emit_module` reflects your changes."""
+
+MODULE_FILENAME = "tables.rs"
+"""The filename of the emitted Rust module (will be created in the working directory)"""
+
+Codepoint = int
+BitPos = int
+
+
+def fetch_open(filename: str):
+    """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
+    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
+    if not os.path.exists(os.path.basename(filename)):
+        os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
+    try:
+        return open(filename, encoding="utf-8")
+    except OSError:
+        sys.stderr.write(f"cannot load {filename}")
+        sys.exit(1)
+
+
+def load_unicode_version() -> "tuple[int, int, int]":
+    """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
+    with fetch_open("ReadMe.txt") as readme:
+        pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
+        return tuple(map(int, re.search(pattern, readme.read()).groups()))
+
+
+class EffectiveWidth(enum.IntEnum):
+    """Represents the width of a Unicode character. All East Asian Width classes resolve into
+    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
+
+    ZERO = 0
+    """ Zero columns wide. """
+    NARROW = 1
+    """ One column wide. """
+    WIDE = 2
+    """ Two columns wide. """
+    AMBIGUOUS = 3
+    """ Two columns wide in a CJK context. One column wide in all other contexts. """
+
+
+def load_east_asian_widths() -> "list[EffectiveWidth]":
+    """Return a list of effective widths, indexed by codepoint.
+    Widths are determined by fetching and parsing `EastAsianWidth.txt`.
+
+    `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
+
+    `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
+
+    `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
+    with fetch_open("EastAsianWidth.txt") as eaw:
+        # matches a width assignment for a single codepoint, i.e. "1F336;N  # ..."
+        single = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)")
+        # matches a width assignment for a range of codepoints, i.e. "3001..3003;W  # ..."
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
+        # map between width category code and condensed width
+        width_codes = {
+            **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
+            **{c: EffectiveWidth.WIDE for c in ["W", "F"]},
+            "A": EffectiveWidth.AMBIGUOUS,
+        }
 
-preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
+        width_map = []
+        current = 0
+        for line in eaw.readlines():
+            raw_data = None  # (low, high, width)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1), match.group(2))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2), match.group(3))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            width = width_codes[raw_data[2]]
+
+            assert current <= high
+            while current <= high:
+                # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
+                # All such codepoints are implicitly given Neutral width (resolves to narrow)
+                width_map.append(EffectiveWidth.NARROW if current < low else width)
+                current += 1
+
+        while len(width_map) < NUM_CODEPOINTS:
+            # Catch any leftover codepoints and assign them implicit Neutral/narrow width.
+            width_map.append(EffectiveWidth.NARROW)
+
+        return width_map
+
+
+def load_zero_widths() -> "list[bool]":
+    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
+    character. `c` is considered a zero-width character if `c` is in general categories
+    `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
+    with fetch_open("UnicodeData.txt") as categories:
+        zw_map = []
+        current = 0
+        for line in categories.readlines():
+            if len(raw_data := line.split(";")) != 15:
+                continue
+            [codepoint, name, cat_code] = [
+                int(raw_data[0], 16),
+                raw_data[1],
+                raw_data[2],
+            ]
+            zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
+
+            assert current <= codepoint
+            while current <= codepoint:
+                if name.endswith(", Last>") or current == codepoint:
+                    # if name ends with Last, we backfill the width value to all codepoints since
+                    # the previous codepoint (aka the start of the range)
+                    zw_map.append(zero_width)
+                else:
+                    # unassigned characters are implicitly given Neutral width, which is nonzero
+                    zw_map.append(False)
+                current += 1
+
+        while len(zw_map) < NUM_CODEPOINTS:
+            # Catch any leftover codepoints. They must be unassigned (so nonzero width)
+            zw_map.append(False)
+
+        return zw_map
+
+
+class Bucket:
+    """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
+    list overlaps with another's width list, those buckets can be merged via `try_extend`."""
+
+    def __init__(self):
+        """Creates an empty bucket."""
+        self.entry_set = set()
+        self.widths = []
+
+    def append(self, codepoint: Codepoint, width: EffectiveWidth):
+        """Adds a codepoint/width pair to the bucket, and appends `width` to the width list."""
+        self.entry_set.add((codepoint, width))
+        self.widths.append(width)
+
+    def try_extend(self, attempt: "Bucket") -> bool:
+        """If either `self` or `attempt`'s width list starts with the other bucket's width list,
+        set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints
+        into `self`, and return `True`. Otherwise, return `False`."""
+        (less, more) = (self.widths, attempt.widths)
+        if len(self.widths) > len(attempt.widths):
+            (less, more) = (attempt.widths, self.widths)
+        if less != more[: len(less)]:
+            return False
+        self.entry_set |= attempt.entry_set
+        self.widths = more
+        return True
+
+    def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
+        """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
+        result = list(self.entry_set)
+        result.sort()
+        return result
+
+    def width(self) -> "EffectiveWidth":
+        """If all codepoints in this bucket have the same width, return that width; otherwise,
+        return `None`."""
+        if len(self.widths) == 0:
+            return None
+        potential_width = self.widths[0]
+        for width in self.widths[1:]:
+            if potential_width != width:
+                return None
+        return potential_width
+
+
+def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
+    """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
+    codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
+    same bucket. Returns a list of the buckets in increasing order of those bits."""
+    num_bits = cap_bit - low_bit
+    assert num_bits > 0
+    buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
+    mask = (1 << num_bits) - 1
+    for (codepoint, width) in entries:
+        buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
+    return buckets
+
+
+class Table:
+    """Represents a lookup table. Each table contains a certain number of subtables; each
+    subtable is indexed by a contiguous bit range of the codepoint and contains a list
+    of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.)
+
+    Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should
+    be indexed by sub-table `i` in the next-level lookup table. The entries of this table are
+    indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The
+    key to compression is that two different buckets in two different sub-tables may have the
+    same width list, which means that they can be merged into the same bucket.
+
+    If no bucket contains two codepoints with different widths, calling `indices_to_widths` will
+    discard the buckets and convert the entries into `EffectiveWidth` values."""
+
+    def __init__(
+        self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType
+    ):
+        """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator
+        in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`,
+        and each table entry is represented in the format specified by  `offset_type`. Asserts
+        that this table is actually representable with `offset_type`."""
+        self.low_bit = low_bit
+        self.cap_bit = cap_bit
+        self.offset_type = offset_type
+        self.entries = []
+        self.indexed = []
+
+        buckets = []
+        for entries in entry_groups:
+            buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))
+
+        for bucket in buckets:
+            for (i, existing) in enumerate(self.indexed):
+                if existing.try_extend(bucket):
+                    self.entries.append(i)
+                    break
+            else:
+                self.entries.append(len(self.indexed))
+                self.indexed.append(bucket)
+
+        # Validate offset type
+        for index in self.entries:
+            assert index < (1 << int(self.offset_type))
+
+    def indices_to_widths(self):
+        """Destructively converts the indices in this table to the `EffectiveWidth` values of
+        their buckets. Assumes that no bucket contains codepoints with different widths."""
+        self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
+        del self.indexed
+
+    def buckets(self):
+        """Returns an iterator over this table's buckets."""
+        return self.indexed
+
+    def to_bytes(self) -> "list[int]":
+        """Returns this table's entries as a list of bytes. The bytes are formatted according to
+        the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
+        to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
+        contain four packed 2-bit entries."""
+        entries_per_byte = 8 // int(self.offset_type)
+        byte_array = []
+        for i in range(0, len(self.entries), entries_per_byte):
+            byte = 0
+            for j in range(0, entries_per_byte):
+                byte |= self.entries[i + j] << (j * int(self.offset_type))
+            byte_array.append(byte)
+        return byte_array
+
+
+def make_tables(
+    table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
+) -> "list[Table]":
+    """Creates a table for each configuration in `table_cfgs`, with the first config corresponding
+    to the top-level lookup table, the second config corresponding to the second-level lookup
+    table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
+    to include in the top-level table."""
+    tables = []
+    entry_groups = [entries]
+    for (low_bit, cap_bit, offset_type) in table_cfgs:
+        table = Table(entry_groups, low_bit, cap_bit, offset_type)
+        entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
+        tables.append(table)
+    return tables
+
+
+def emit_module(
+    out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
+):
+    """Outputs a Rust module to `out_name` using table data from `tables`.
+    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
+    if os.path.exists(out_name):
+        os.remove(out_name)
+    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
+        module.write(
+            """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@@ -31,291 +342,164 @@ preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRI
 // except according to those terms.
 
 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
-
-#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
-'''
-
-# Mapping taken from Table 12 from:
-# http://www.unicode.org/reports/tr44/#General_Category_Values
-expanded_categories = {
-    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
-    'Lm': ['L'], 'Lo': ['L'],
-    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
-    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
-    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
-    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
-    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
-    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
-    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
-}
-
-# these are the surrogate codepoints, which are not valid rust characters
-surrogate_codepoints = (0xd800, 0xdfff)
-
-def fetch(f):
-    if not os.path.exists(os.path.basename(f)):
-        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
-                  % f)
-
-    if not os.path.exists(os.path.basename(f)):
-        sys.stderr.write("cannot load %s" % f)
-        exit(1)
-
-def is_surrogate(n):
-    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
-
-def load_unicode_data(f):
-    fetch(f)
-    gencats = {}
-
-    udict = {}
-    range_start = -1
-    for line in fileinput.input(f):
-        data = line.split(';')
-        if len(data) != 15:
-            continue
-        cp = int(data[0], 16)
-        if is_surrogate(cp):
-            continue
-        if range_start >= 0:
-            for i in range(range_start, cp):
-                udict[i] = data
-            range_start = -1
-        if data[1].endswith(", First>"):
-            range_start = cp
-            continue
-        udict[cp] = data
-
-    for code in udict:
-        [code_org, name, gencat, combine, bidi,
-         decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcase, titlecase ] = udict[code]
-
-        # place letter in categories as appropriate
-        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
-            if cat not in gencats:
-                gencats[cat] = []
-            gencats[cat].append(code)
-
-    gencats = group_cats(gencats)
-
-    return gencats
-
-def group_cats(cats):
-    cats_out = {}
-    for cat in cats:
-        cats_out[cat] = group_cat(cats[cat])
-    return cats_out
-
-def group_cat(cat):
-    cat_out = []
-    letters = sorted(set(cat))
-    cur_start = letters.pop(0)
-    cur_end = cur_start
-    for letter in letters:
-        assert letter > cur_end, \
-            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
-        if letter == cur_end + 1:
-            cur_end = letter
-        else:
-            cat_out.append((cur_start, cur_end))
-            cur_start = cur_end = letter
-    cat_out.append((cur_start, cur_end))
-    return cat_out
-
-def format_table_content(f, content, indent):
-    line = " "*indent
-    first = True
-    for chunk in content.split(","):
-        if len(line) + len(chunk) < 98:
-            if first:
-                line += chunk
-            else:
-                line += ", " + chunk
-            first = False
-        else:
-            f.write(line + ",\n")
-            line = " "*indent + chunk
-    f.write(line)
-
-# load all widths of want_widths, except those in except_cats
-def load_east_asian_width(want_widths, except_cats):
-    f = "EastAsianWidth.txt"
-    fetch(f)
-    widths = {}
-    re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
-    re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
-
-    for line in fileinput.input(f):
-        width = None
-        d_lo = 0
-        d_hi = 0
-        cat = None
-        m = re1.match(line)
-        if m:
-            d_lo = m.group(1)
-            d_hi = m.group(1)
-            width = m.group(2)
-            cat = m.group(3)
-        else:
-            m = re2.match(line)
-            if m:
-                d_lo = m.group(1)
-                d_hi = m.group(2)
-                width = m.group(3)
-                cat = m.group(4)
-            else:
-                continue
-        if cat in except_cats or width not in want_widths:
-            continue
-        d_lo = int(d_lo, 16)
-        d_hi = int(d_hi, 16)
-        if width not in widths:
-            widths[width] = []
-        widths[width].append((d_lo, d_hi))
-    return widths
-
-def escape_char(c):
-    return "'\\u{%x}'" % c
-
-def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
-        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
-    pub_string = "const"
-    if not is_const:
-        pub_string = "let"
-    if is_pub:
-        pub_string = "pub " + pub_string
-    f.write("    %s %s: %s = &[\n" % (pub_string, name, t_type))
-    data = ""
-    first = True
-    for dat in t_data:
-        if not first:
-            data += ","
-        first = False
-        data += pfun(dat)
-    format_table_content(f, data, 8)
-    f.write("\n    ];\n\n")
-
-def emit_charwidth_module(f, width_table):
-    f.write("pub mod charwidth {")
-    f.write("""
-    use core::option::Option::{self, Some, None};
-    use core::result::Result::{Ok, Err};
-
+"""
+        )
+        module.write(
+            f"""
+/// The version of [Unicode](http://www.unicode.org/)
+/// that this version of unicode-width is based on.
+pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
+"""
+        )
+
+        module.write(
+            """
+pub mod charwidth {
+    use core::option::Option::{self, None, Some};
+
+    /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
+    /// consulting a multi-level lookup table.
+    /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
+    /// they're treated as single width.
+    ///
+    /// # Maintenance
+    /// The tables themselves are autogenerated but this function is hardcoded. You should have
+    /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
+    /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
+    /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
     #[inline]
-    fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
-        use core::cmp::Ordering::{Equal, Less, Greater};
-        match r.binary_search_by(|&(lo, hi, _, _)| {
-            if lo <= c && c <= hi { Equal }
-            else if hi < c { Less }
-            else { Greater }
-        }) {
-            Ok(idx) => {
-                let (_, _, r_ncjk, r_cjk) = r[idx];
-                if is_cjk { r_cjk } else { r_ncjk }
+    fn lookup_width(c: char, is_cjk: bool) -> usize {
+        let cp = c as usize;
+
+        let t1_offset = TABLES_0[cp >> 13 & 0xFF];
+
+        // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
+        // so each sub-table is 128 bytes in size.
+        // (Sub-tables are selected using the computed offset from the previous table.)
+        let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
+
+        // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
+        // This is accomplished by packing four stored entries into one byte.
+        // So each sub-table is 2**(6-2) == 16 bytes in size.
+        // Since this is the last table, each entry represents an encoded width.
+        let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
+
+        // Extract the packed width
+        let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
+
+        // A width of 3 signifies that the codepoint is ambiguous width.
+        if width == 3 {
+            if is_cjk {
+                2
+            } else {
+                1
             }
-            Err(_) => 1
+        } else {
+            width.into()
         }
     }
-""")
-
-    f.write("""
+"""
+        )
+
+        module.write(
+            """
+    /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
+    /// `None` if `c` is a control character other than `'\\x00'`.
+    /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
+    /// they're treated as single width.
     #[inline]
     pub fn width(c: char, is_cjk: bool) -> Option<usize> {
-        match c as usize {
-            _c @ 0 => Some(0),          // null is zero width
-            cu if cu < 0x20 => None,    // control sequences have no width
-            cu if cu < 0x7F => Some(1), // ASCII
-            cu if cu < 0xA0 => None,    // more control sequences
-            _ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize)
+        if c < '\\u{7F}' {
+            if c >= '\\u{20}' {
+                // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
+                Some(1)
+            } else if c == '\\0' {
+                // U+0000 *is* a control code, but it's special-cased
+                Some(0)
+            } else {
+                // U+0001 to U+0020 (exclusive) are control codes
+                None
+            }
+        } else if c >= '\\u{A0}' {
+            // No characters >= U+00A0 are control codes, so we can consult the lookup tables
+            Some(lookup_width(c, is_cjk))
+        } else {
+            // U+007F to U+00A0 (exclusive) are control codes
+            None
         }
     }
+"""
+        )
+
+        subtable_count = 1
+        for (i, table) in enumerate(tables):
+            new_subtable_count = len(table.buckets())
+            if i == len(tables) - 1:
+                table.indices_to_widths()  # for the last table, indices == widths
+            byte_array = table.to_bytes()
+            module.write(
+                f"""
+    /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
+    static TABLES_{i}: [u8; {len(byte_array)}] = ["""
+            )
+            for (j, byte) in enumerate(byte_array):
+                # Add line breaks for every 15th entry (chosen to match what rustfmt does)
+                if j % 15 == 0:
+                    module.write("\n       ")
+                module.write(f" 0x{byte:02X},")
+            module.write("\n    ];\n")
+            subtable_count = new_subtable_count
+        module.write("}\n")
+
+
+def main(module_filename: str):
+    """Obtain character data from the latest version of Unicode, transform it into a multi-level
+    lookup table for character width, and write a Rust module utilizing that table to
+    `module_filename`.
+
+    We obey the following rules in decreasing order of importance:
+    - The soft hyphen (`U+00AD`) is single-width.
+    - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
+    - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
+    - All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
+    - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
+    - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
+    of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
+
+    These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
+    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
+    version = load_unicode_version()
+    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
+
+    eaw_map = load_east_asian_widths()
+    zw_map = load_zero_widths()
+
+    # Characters marked as zero-width in zw_map should be zero-width in the final map
+    width_map = list(
+        map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map))
+    )
+
+    # Override for soft hyphen
+    width_map[0x00AD] = EffectiveWidth.NARROW
+
+    # Override for Hangul Jamo medial vowels & final consonants
+    for i in range(0x1160, 0x11FF + 1):
+        width_map[i] = EffectiveWidth.ZERO
+
+    tables = make_tables(TABLE_CFGS, enumerate(width_map))
+
+    print("------------------------")
+    total_size = 0
+    for (i, table) in enumerate(tables):
+        size_bytes = len(table.to_bytes())
+        print(f"Table {i} Size: {size_bytes} bytes")
+        total_size += size_bytes
+    print("------------------------")
+    print(f"  Total Size: {total_size} bytes")
+
+    emit_module(module_filename, version, tables)
+    print(f'Wrote to "{module_filename}"')
 
-""")
-
-    f.write("    // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
-    f.write("    //     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
-    emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
-            pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
-    f.write("}\n\n")
-
-def remove_from_wtable(wtable, val):
-    wtable_out = []
-    while wtable:
-        if wtable[0][1] < val:
-            wtable_out.append(wtable.pop(0))
-        elif wtable[0][0] > val:
-            break
-        else:
-            (wt_lo, wt_hi, width, width_cjk) = wtable.pop(0)
-            if wt_lo == wt_hi == val:
-                continue
-            elif wt_lo == val:
-                wtable_out.append((wt_lo+1, wt_hi, width, width_cjk))
-            elif wt_hi == val:
-                wtable_out.append((wt_lo, wt_hi-1, width, width_cjk))
-            else:
-                wtable_out.append((wt_lo, val-1, width, width_cjk))
-                wtable_out.append((val+1, wt_hi, width, width_cjk))
-    if wtable:
-        wtable_out.extend(wtable)
-    return wtable_out
-
-
-
-def optimize_width_table(wtable):
-    wtable_out = []
-    w_this = wtable.pop(0)
-    while wtable:
-        if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]:
-            w_tmp = wtable.pop(0)
-            w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3])
-        else:
-            wtable_out.append(w_this)
-            w_this = wtable.pop(0)
-    wtable_out.append(w_this)
-    return wtable_out
 
 if __name__ == "__main__":
-    r = "tables.rs"
-    if os.path.exists(r):
-        os.remove(r)
-    with open(r, "w") as rf:
-        # write the file's preamble
-        rf.write(preamble)
-
-        # download and parse all the data
-        fetch("ReadMe.txt")
-        with open("ReadMe.txt") as readme:
-            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
-            unicode_version = re.search(pattern, readme.read()).groups()
-        rf.write("""
-/// The version of [Unicode](http://www.unicode.org/)
-/// that this version of unicode-width is based on.
-pub const UNICODE_VERSION: (u8, u8, u8) = (%s, %s, %s);
-
-""" % unicode_version)
-        gencats = load_unicode_data("UnicodeData.txt")
-
-        ### character width module
-        width_table = []
-        for zwcat in ["Me", "Mn", "Cf"]:
-            width_table.extend([(lo_hi[0], lo_hi[1], 0, 0) for lo_hi in gencats[zwcat]])
-        width_table.append((4448, 4607, 0, 0))
-
-        # get widths, except those that are explicitly marked zero-width above
-        ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
-        # these are doublewidth
-        for dwcat in ["W", "F"]:
-            width_table.extend([(lo_hi1[0], lo_hi1[1], 2, 2) for lo_hi1 in ea_widths[dwcat]])
-        width_table.extend([(lo_hi2[0], lo_hi2[1], 1, 2) for lo_hi2 in ea_widths["A"]])
-
-        width_table.sort(key=lambda w: w[0])
-
-        # soft hyphen is not zero width in preformatted text; it's used to indicate
-        # a hyphen inserted to facilitate a linebreak.
-        width_table = remove_from_wtable(width_table, 173)
-
-        # optimize the width table by collapsing adjacent entities when possible
-        width_table = optimize_width_table(width_table)
-        emit_charwidth_module(rf, width_table)
+    main(MODULE_FILENAME)
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:11:38 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:13:23 +0000
commit	20431706a863f92cb37dc512fef6e48d192aaf2c (patch)
tree	2867f13f5fd5437ba628c67d7f87309ccadcd286 /vendor/unicode-width/scripts/unicode.py
parent	Releasing progress-linux version 1.65.0+dfsg1-2~progress7.99u1. (diff)
download	rustc-20431706a863f92cb37dc512fef6e48d192aaf2c.tar.xz rustc-20431706a863f92cb37dc512fef6e48d192aaf2c.zip