summaryrefslogtreecommitdiffstats
path: root/third_party/rust/idna/src/make_uts46_mapping_table.py
blob: 4f72bbd095d926c42fbd798ce9ede026c4f95118 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# Copyright 2013-2014 The rust-url developers.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

import collections
import itertools

print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_idna_table.py
''')

txt = open("IdnaMappingTable.txt")

def escape_char(c):
    return "\\u{%x}" % ord(c[0])

def char(s):
    return chr(int(s, 16))

strtab = collections.OrderedDict()
strtab_offset = 0

def strtab_slice(s):
    global strtab, strtab_offset

    if s in strtab:
        return strtab[s]
    else:
        utf8_len = len(s.encode('utf8'))
        c = (strtab_offset, utf8_len)
        strtab[s] = c
        strtab_offset += utf8_len
        return c

def rust_slice(s):
    start = s[0]
    length = s[1]
    start_lo = start & 0xff
    start_hi = start >> 8
    assert length <= 255
    assert start_hi <= 255
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)

ranges = []

for line in txt:
    # remove comments
    line, _, _ = line.partition('#')
    # skip empty lines
    if len(line.strip()) == 0:
        continue
    fields = line.split(';')
    if fields[0].strip() == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.
    first, _, last = fields[0].strip().partition('..')
    if not last:
        last = first
    mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
    unicode_str = None
    if len(fields) > 2:
        if fields[2].strip():
            unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
        elif mapping == "Deviation":
            unicode_str = u''

    if len(fields) > 3:
        assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
        assert mapping == 'Valid', mapping
        mapping = 'DisallowedIdna2008'

    ranges.append((first, last, mapping, unicode_str))

def mergeable_key(r):
    mapping = r[2]

    # These types have associated data, so we should not merge them.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
    assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
    return mapping

grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    for g in group:
        if g[3] is not None and len(g[3]) > 2:
            assert not g[3][2].strip()
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    a, b = itertools.tee(group)
    next(b, None)
    for (g1, g2) in zip(a, b):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])

def is_single_char_range(r):
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    current = []
    for r in ranges:
        if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
            current.append(r)
            continue
        if len(current) != 0:
            ret = current
            current = [r]
            yield ret
            continue
        current.append(r)
        ret = current
        current = []
        yield ret
    yield current

optimized_ranges = list(merge_single_char_ranges(optimized_ranges))

SINGLE_MARKER = 1 << 15

print("static TABLE: &[(char, u16)] = &[")

offset = 0
for ranges in optimized_ranges:
    assert offset < SINGLE_MARKER

    block_len = len(ranges)
    single = SINGLE_MARKER if block_len == 1 else 0
    index = offset | single
    offset += block_len

    start = escape_char(char(ranges[0][0]))
    print("    ('%s', %s)," % (start, index))

print("];\n")

print("static MAPPING_TABLE: &[Mapping] = &[")

for ranges in optimized_ranges:
    for (first, last, mapping, unicode_str) in ranges:
        if unicode_str is not None:
            mapping += rust_slice(strtab_slice(unicode_str))
        print("    %s," % mapping)

print("];\n")

def escape_str(s):
    return [escape_char(c) for c in s]

print("static STRING_TABLE: &str = \"%s\";"
      % '\\\n  '.join(itertools.chain(*[escape_str(s) for s in strtab.keys()])))