1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
# Copyright 2013-2014 The rust-url developers.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
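# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# Note: the script as written always opens "IdnaMappingTable.txt" in the
# current working directory; the command-line argument is not consulted.
# You can get the latest IDNA table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt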
import collections
import itertools
# Emit the license banner that heads the generated Rust source.
# NOTE(review): the banner names "make_idna_table.py" while the usage comment
# at the top of this script names "make_uts46_mapping_table.py" -- confirm
# which is intended before touching the emitted text.
print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// Generated by make_idna_table.py
''')

# Read every line up front so the file handle is closed deterministically;
# the parsing loop further down only needs to iterate over the lines.
# The encoding is pinned so decoding does not depend on the platform locale.
with open("IdnaMappingTable.txt", encoding="utf-8") as table_file:
    txt = table_file.readlines()
def escape_char(c):
    r"""Return the Rust ``\u{...}`` escape for the first character of *c*.

    Only ``c[0]`` is examined. The code point is written in lowercase
    hexadecimal without zero padding. An empty *c* raises ``IndexError``.
    """
    return "\\u{%x}" % ord(c[0])
def char(s):
    """Return the character whose code point is the hexadecimal string *s*.

    ``int(s, 16)`` does the parsing, so a malformed *s* raises ``ValueError``.
    """
    return chr(int(s, 16))
# Interned mapping strings: each distinct string maps to its
# (byte offset, UTF-8 byte length) pair. Insertion order is the order in
# which offsets were handed out, so it must be preserved.
strtab = collections.OrderedDict()
# Byte offset that will be assigned to the next string not yet in `strtab`.
strtab_offset = 0


def strtab_slice(s):
    """Intern *s* in the string table and return its ``(offset, length)``.

    A string seen before returns the pair recorded the first time. A new
    string is placed at the current end of the table: its offset is the
    running ``strtab_offset`` and its length is its UTF-8 encoded size,
    after which ``strtab_offset`` advances by that length.
    """
    # Only the offset is rebound; `strtab` is mutated in place and needs no
    # `global` declaration.
    global strtab_offset

    if s in strtab:
        return strtab[s]

    utf8_len = len(s.encode('utf8'))
    entry = (strtab_offset, utf8_len)
    strtab[s] = entry
    strtab_offset += utf8_len
    return entry
def rust_slice(s):
    """Format a string-table slice as a Rust ``StringTableSlice`` literal.

    *s* is an ``(offset, length)`` pair; only ``s[0]`` and ``s[1]`` are read.
    The offset is split into a low byte and the remaining high part. The
    asserts check that the high part and the length are each at most 255,
    and raise ``AssertionError`` otherwise.
    """
    start = s[0]
    length = s[1]
    start_lo = start & 0xff
    start_hi = start >> 8
    assert length <= 255, "string table entry too long: %d bytes" % length
    assert start_hi <= 255, "string table offset too large: %d" % start
    return (
        "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })"
        % (start_lo, start_hi, length)
    )
# Parsed table rows, in file order, as (first, last, mapping, unicode_str):
#   first, last  -- hexadecimal code point strings (equal for a single char)
#   mapping      -- status name converted to CamelCase, e.g. "DisallowedStd3Mapped"
#   unicode_str  -- the replacement string, or None when the row carries none
ranges = []

for line in txt:
    # Everything from '#' onwards is a comment.
    line, _, _ = line.partition('#')
    # Skip lines that are empty once the comment is removed.
    if not line.strip():
        continue
    fields = [field.strip() for field in line.split(';')]
    if fields[0] == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.
    # "XXXX..YYYY" is a range; a bare "XXXX" is a one-character range.
    first, _, last = fields[0].partition('..')
    if not last:
        last = first
    # e.g. "disallowed_STD3_mapped" -> "DisallowedStd3Mapped"
    mapping = fields[1].replace('_', ' ').title().replace(' ', '')
    unicode_str = None
    if len(fields) > 2:
        if fields[2]:
            # Whitespace-separated hex code points; split() with no argument
            # tolerates repeated spaces between them.
            unicode_str = u''.join(char(c) for c in fields[2].split())
        elif mapping == "Deviation":
            # A deviation row with an empty third field maps to the empty string.
            unicode_str = u''
    if len(fields) > 3:
        # A fourth field is only expected on "valid" rows; such rows are
        # re-tagged as DisallowedIdna2008.
        assert fields[3] in ('NV8', 'XV8'), fields[3]
        assert mapping == 'Valid', mapping
        mapping = 'DisallowedIdna2008'
    ranges.append((first, last, mapping, unicode_str))
def mergeable_key(r):
    """Return the grouping key that decides whether row *r* may be merged.

    *r* is a ``(first, last, mapping, unicode_str)`` tuple. Rows whose
    mapping carries associated data return the whole row as the key, so they
    only compare equal to an identical row. All other rows return just the
    mapping name, so adjacent rows with the same mapping share a key.

    Raises ``AssertionError`` for a mapping name outside the known set.
    """
    mapping = r[2]

    # These types have associated data, so we should not merge them.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
    assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
    return mapping
# groupby only coalesces *adjacent* rows that share a key, so rows are merged
# only with their immediate neighbours.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

# One (first, last, mapping, unicode_str) tuple per merged group.
optimized_ranges = []

for (_, members) in grouped_ranges:
    group = list(members)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string:
    # only the first member's mapping and string survive the merge below, so
    # any string on a member would be silently dropped.
    for member in group:
        assert member[3] is None, member
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    # The merged range spans from the first member's start to the last
    # member's end and keeps the first member's mapping and string.
    optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
def is_single_char_range(r):
    """Return True when range *r* starts and ends on the same code point.

    *r* must be a 4-tuple ``(first, last, mapping, unicode_str)``.
    """
    (first, last, _, _) = r
    return first == last


# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value.
def merge_single_char_ranges(ranges):
    """Yield *ranges* regrouped into blocks (lists of ranges).

    A run of consecutive single-character ranges becomes one block. Every
    range covering more than one character becomes a block of its own.
    Blocks come out in input order and are never empty, so empty input
    yields nothing.
    """
    current = []
    for r in ranges:
        # A block may only grow while both its last member and the incoming
        # range are single-character ranges; otherwise flush it first.
        extends_run = (
            current
            and is_single_char_range(current[-1])
            and is_single_char_range(r)
        )
        if current and not extends_run:
            yield current
            current = []
        current.append(r)
    # Flush the trailing block, if any.
    if current:
        yield current
# Regroup the merged ranges into blocks: from here on, each element of
# `optimized_ranges` is a list of ranges.
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))

# Bit 15 of a TABLE index; set when the entry's block holds exactly one range.
SINGLE_MARKER = 1 << 15

print("static TABLE: &[(char, u16)] = &[")

# Number of ranges contained in all blocks emitted so far. MAPPING_TABLE,
# printed further down, emits one entry per range in this same order.
offset = 0
for block in optimized_ranges:
    # The offset shares a u16 with the marker bit, so it must stay below it.
    assert offset < SINGLE_MARKER

    block_len = len(block)
    single = SINGLE_MARKER if block_len == 1 else 0
    index = offset | single
    offset += block_len

    # Each entry is keyed by the first code point of the block's first range.
    start = escape_char(char(block[0][0]))
    print(" ('%s', %s)," % (start, index))

print("];\n")
print("static MAPPING_TABLE: &[Mapping] = &[")

# One entry per range, block by block. Ranges that carry a string get a
# StringTableSlice appended to the mapping name; interning the string here is
# also what populates the string table.
for block in optimized_ranges:
    for (_first, _last, mapping, unicode_str) in block:
        if unicode_str is not None:
            mapping += rust_slice(strtab_slice(unicode_str))
        print(" %s," % mapping)

print("];\n")
def escape_str(s):
    """Return a list holding the ``escape_char`` escape of each character in *s*.

    The result has one element per character, in order; an empty *s* gives
    an empty list.
    """
    return [escape_char(c) for c in s]
# Emit every interned string, in insertion order, as one Rust string literal.
# Insertion order is the order in which byte offsets were assigned.
# Each escaped character is separated from the next by a backslash-newline.
print("static STRING_TABLE: &str = \"%s\";"
      % '\\\n '.join(itertools.chain.from_iterable(escape_str(s) for s in strtab)))
|