diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/idna/src/make_uts46_mapping_table.py | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/idna/src/make_uts46_mapping_table.py')
-rw-r--r-- | vendor/idna/src/make_uts46_mapping_table.py | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/vendor/idna/src/make_uts46_mapping_table.py b/vendor/idna/src/make_uts46_mapping_table.py new file mode 100644 index 000000000..4f72bbd09 --- /dev/null +++ b/vendor/idna/src/make_uts46_mapping_table.py @@ -0,0 +1,185 @@ +# Copyright 2013-2014 The rust-url developers. +# +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs +# You can get the latest idna table from +# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt + +import collections +import itertools + +print('''\ +// Copyright 2013-2020 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// Generated by make_idna_table.py +''') + +txt = open("IdnaMappingTable.txt") + +def escape_char(c): + return "\\u{%x}" % ord(c[0]) + +def char(s): + return chr(int(s, 16)) + +strtab = collections.OrderedDict() +strtab_offset = 0 + +def strtab_slice(s): + global strtab, strtab_offset + + if s in strtab: + return strtab[s] + else: + utf8_len = len(s.encode('utf8')) + c = (strtab_offset, utf8_len) + strtab[s] = c + strtab_offset += utf8_len + return c + +def rust_slice(s): + start = s[0] + length = s[1] + start_lo = start & 0xff + start_hi = start >> 8 + assert length <= 255 + assert start_hi <= 255 + return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length) + +ranges = [] + +for line in txt: + # remove comments + line, _, _ = line.partition('#') + # skip empty lines + if len(line.strip()) == 0: + continue + fields = line.split(';') + if fields[0].strip() == 'D800..DFFF': + continue # Surrogates don't occur in Rust strings. + first, _, last = fields[0].strip().partition('..') + if not last: + last = first + mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '') + unicode_str = None + if len(fields) > 2: + if fields[2].strip(): + unicode_str = u''.join(char(c) for c in fields[2].strip().split(' ')) + elif mapping == "Deviation": + unicode_str = u'' + + if len(fields) > 3: + assert fields[3].strip() in ('NV8', 'XV8'), fields[3] + assert mapping == 'Valid', mapping + mapping = 'DisallowedIdna2008' + + ranges.append((first, last, mapping, unicode_str)) + +def mergeable_key(r): + mapping = r[2] + + # These types have associated data, so we should not merge them. + if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'): + return r + assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008') + return mapping + +grouped_ranges = itertools.groupby(ranges, key=mergeable_key) + +optimized_ranges = [] + +for (k, g) in grouped_ranges: + group = list(g) + if len(group) == 1: + optimized_ranges.append(group[0]) + continue + # Assert that nothing in the group has an associated unicode string. + for g in group: + if g[3] is not None and len(g[3]) > 2: + assert not g[3][2].strip() + # Assert that consecutive members of the group don't leave gaps in + # the codepoint space. + a, b = itertools.tee(group) + next(b, None) + for (g1, g2) in zip(a, b): + last_char = int(g1[1], 16) + next_char = int(g2[0], 16) + if last_char + 1 == next_char: + continue + # There's a gap where surrogates would appear, but we don't have to + # worry about that gap, as surrogates never appear in Rust strings. + # Assert we're seeing the surrogate case here. + assert last_char == 0xd7ff + assert next_char == 0xe000 + optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:]) + +def is_single_char_range(r): + (first, last, _, _) = r + return first == last + +# We can reduce the size of the character range table and the index table to about 1/4 +# by merging runs of single character ranges and using character offsets from the start +# of that range to retrieve the correct `Mapping` value +def merge_single_char_ranges(ranges): + current = [] + for r in ranges: + if not current or is_single_char_range(current[-1]) and is_single_char_range(r): + current.append(r) + continue + if len(current) != 0: + ret = current + current = [r] + yield ret + continue + current.append(r) + ret = current + current = [] + yield ret + yield current + +optimized_ranges = list(merge_single_char_ranges(optimized_ranges)) + +SINGLE_MARKER = 1 << 15 + +print("static TABLE: &[(char, u16)] = &[") + +offset = 0 +for ranges in optimized_ranges: + assert offset < SINGLE_MARKER + + block_len = len(ranges) + single = SINGLE_MARKER if block_len == 1 else 0 + index = offset | single + offset += block_len + + start = escape_char(char(ranges[0][0])) + print(" ('%s', %s)," % (start, index)) + +print("];\n") + +print("static MAPPING_TABLE: &[Mapping] = &[") + +for ranges in optimized_ranges: + for (first, last, mapping, unicode_str) in ranges: + if unicode_str is not None: + mapping += rust_slice(strtab_slice(unicode_str)) + print(" %s," % mapping) + +print("];\n") + +def escape_str(s): + return [escape_char(c) for c in s] + +print("static STRING_TABLE: &str = \"%s\";" + % '\\\n '.join(itertools.chain(*[escape_str(s) for s in strtab.keys()]))) |