1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
|
#!/usr/bin/env python3
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode security tables:
# - IdentifierStatus.txt
# - IdentifierType.txt
# - confusables.txt
# - ReadMe.txt
# This script also uses the following Unicode UCD data:
# - PropertyValueAliases.txt
# - Scripts.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs file into git.
import fileinput, re, os, sys, operator
# Header written verbatim at the top of the generated tables.rs: the license
# notice, a "generated file, do not edit" warning and the lint allowances
# needed by the generated tables.
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
# Unicode version as a 3-tuple. It selects which data files are downloaded and
# is also written into tables.rs as the `UNICODE_VERSION` constant.
UNICODE_VERSION = (13, 0, 0)
# Dotted form of the version above ("13.0.0"), interpolated into download URLs.
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
# Download a Unicode security table file into the current directory.
#
# `f` is the path of the file below the versioned `Public/security/` directory
# on unicode.org. Nothing is downloaded when a file with the same base name
# already exists locally. If the file is still missing after the download
# attempt, an error is written to stderr and the script exits with status 1.
def fetch(f):
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        os.system("curl -O http://www.unicode.org/Public/security/%s/%s"
                  % (UNICODE_VERSION_NUMBER, f))
    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        # Use sys.exit rather than the bare `exit` helper: the latter is
        # injected by the `site` module and is not guaranteed to exist
        # (e.g. under `python -S`).
        sys.exit(1)
# Download a UCD table file into the current directory.
#
# `f` is the path of the file below the versioned `Public/<version>/ucd/`
# directory on unicode.org. Nothing is downloaded when a file with the same
# base name already exists locally. If the file is still missing after the
# download attempt, an error is written to stderr and the script exits with
# status 1.
def fetch_unidata(f):
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                  % (UNICODE_VERSION_NUMBER, f))
    if not os.path.exists(local_name):
        # Terminate the message with a newline, like `fetch` does, so that the
        # shell prompt / following output does not end up on the same line.
        sys.stderr.write("cannot load %s\n" % f)
        # Use sys.exit rather than the bare `exit` helper: the latter is
        # injected by the `site` module and is not guaranteed to exist.
        sys.exit(1)
# Loads code point data from IdentifierStatus.txt and
# IdentifierType.txt
# Implementation from unicode-segmentation
#
# The file is fetched first (if not already present), then read line by line.
# Lines of the form `XXXX ; Prop` or `XXXX..YYYY ; Prop` are collected; all
# other lines are skipped. When `interestingprops` is non-empty, only the
# properties it contains are kept.
# Returns a dict mapping property name -> list of inclusive (lo, hi) ranges,
# in file order.
def load_properties(f, interestingprops = None):
    fetch(f)
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
    ranges_by_prop = {}
    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        single = single_re.match(line)
        if single:
            # a lone code point is stored as a one-element range
            lo_hex = hi_hex = single.group(1)
            prop = single.group(2).strip()
        else:
            ranged = range_re.match(line)
            if not ranged:
                continue
            lo_hex = ranged.group(1)
            hi_hex = ranged.group(2)
            prop = ranged.group(3).strip()
        if interestingprops and prop not in interestingprops:
            continue
        ranges_by_prop.setdefault(prop, []).append((int(lo_hex, 16), int(hi_hex, 16)))
    return ranges_by_prop
# Loads script data from Scripts.txt
#
# The file is fetched from the UCD directory first (if not already present),
# then read line by line. Lines of the form `XXXX ; Name #...` or
# `XXXX..YYYY ; Name #...` are collected; all other lines are skipped. When
# `interestingprops` is non-empty, only the names it contains are kept.
# Returns a dict mapping script name -> list of inclusive (lo, hi) ranges,
# in file order.
def load_script_properties(f, interestingprops):
    fetch_unidata(f)
    props = {}
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
    # Decode explicitly as UTF-8, like the other loaders in this script do,
    # instead of depending on the locale's default encoding.
    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        m = re1.match(line)
        if m:
            # a lone code point is stored as a one-element range
            d_lo = d_hi = m.group(1)
            prop = m.group(2).strip()
        else:
            m = re2.match(line)
            if not m:
                continue
            d_lo = m.group(1)
            d_hi = m.group(2)
            prop = m.group(3).strip()
        if interestingprops and prop not in interestingprops:
            continue
        props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))
    return props
# Loads confusables data from confusables.txt
#
# The file is fetched first (if not already present). Each matching line
# has a source column and a prototype column of space-separated hexadecimal
# code points. Returns a list of (source, [prototype code points]) tuples in
# file order. Raises an Exception if a source column holds more than one
# code point.
def load_confusables(f):
    fetch(f)
    entry_re = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")
    table = []
    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        matched = entry_re.match(line)
        if not matched:
            continue
        sources = matched.group(1).split()
        if len(sources) != 1:
            raise Exception('More than one code point in first column')
        source = int(sources[0].strip(), 16)
        prototype = [int(code_point, 16) for code_point in matched.group(2).split()]
        table.append((source, prototype))
    return table
# Loads Unicode script name correspondence from PropertyValueAliases.txt
#
# Returns a pair (longforms, shortforms): `longforms` maps a script's short
# alias to its long name, and `shortforms` maps the long name back to the
# short alias. Both are built from the `sc ; <short> ; <long>` lines.
def aliases():
    # This function is taken from the `unicode-script` crate. If significant
    # changes are introduced, update accordingly.
    # Note that this file is in UCD directly, not security directory.
    # we use `fetch_unidata` function to download it.
    fetch_unidata("PropertyValueAliases.txt")
    longforms = {}
    shortforms = {}
    script_alias_re = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")):
        found = script_alias_re.match(line)
        if not found:
            continue
        short_name = found.group(1).strip()
        long_name = found.group(2).strip()
        # each alias and each long name must occur only once
        assert(short_name not in longforms)
        assert(long_name not in shortforms)
        longforms[short_name] = long_name
        shortforms[long_name] = short_name
    return (longforms, shortforms)
# Loads Unicode script name list and correspondence mapping
#
# Returns a pair (longforms, script_table): `longforms` is the first mapping
# returned by `aliases()`, and `script_table` is a list of
# (lo, hi, short script alias) tuples covering every range found in `f`,
# sorted by `lo`.
def load_scripts(f):
    # This function is taken from the `unicode-script` crate. If significant
    # changes are introduced, update accordingly.
    (longforms, shortforms) = aliases()
    scripts = load_script_properties(f, [])
    script_table = []
    # Every script is included in the table, "Common", "Inherited" and
    # "Unknown" too, since each code point needs a script to be looked up.
    for script in scripts:
        script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)
# Tells whether `source` is one of the script aliases that are disregarded
# by the mixed-script analysis: 'Zinh', 'Zyyy' or 'Zzzz'.
def is_script_ignored_in_mixedscript(source):
    return source in ('Zinh', 'Zyyy', 'Zzzz')
# Decides whether a code point whose prototype consists of several code
# points should be treated as mixed-script confusable with that prototype.
#
# The guiding principle: replacing the code point by its prototype should
# neither make a non-ignored script appear nor make one disappear.
# The rules below were made up to cover the cases occurring in
# confusables.txt (Unicode 13.0 at the time of writing).
#
# `script_i` is the script of the code point, `proto_lst` its prototype and
# `scripts` the script table used to look up the prototype's scripts.
# `item_i` is accepted for the caller's convenience but not consulted.
#
# Returns a pair (processed, should_add):
#   True, True   -> consider it confusable
#   True, False  -> consider it non-confusable
#   False, False -> no rule applies; a future Unicode version introduced a
#                   case that still needs a rule here
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    proto_scripts = sorted(script_list(proto_lst, scripts))
    count = len(proto_scripts)
    assert(count > 0)
    source_ignored = is_script_ignored_in_mixedscript(script_i)

    if count == 1:
        # Rule: A - A -> Processed, DontAdd
        if proto_scripts[0] == script_i:
            return True, False
        # Rule: B or (Zinh | Zyyy | Zzzz) - A(not ignored) -> Processed, Add
        if not source_ignored:
            return True, True
        return False, False

    # Rule: A ... - A -> Processed, DontAdd
    if script_i in proto_scripts:
        return True, False

    # Rules for exactly two prototype scripts, none equal to the source's
    # non-ignored script: as soon as one of the two is ignored
    # (Zinh | Zyyy | Zzzz) -> Processed, Add
    if count == 2 and not source_ignored:
        if any(is_script_ignored_in_mixedscript(s) for s in proto_scripts):
            return True, True

    # NotProcessed, DontAdd
    return False, False
# Tells whether code point `c` lies inside one of the inclusive (lo, hi)
# ranges listed in `identifier_allowed`.
def is_codepoint_identifier_allowed(c, identifier_allowed):
    return any(entry[0] <= c <= entry[1] for entry in identifier_allowed)
# Loads confusables data and generates a table of all the potentially
# mixed-script confusable characters.
#
# Parameters:
#   f                  - name of the confusables data file, handed to `load_confusables`
#   identifier_allowed - inclusive (lo, hi) ranges of code points allowed in identifiers
#   scripts            - script table handed to `codepoint_script`
#
# Returns a pair (`mixedscript_confusable`, `mixedscript_confusable_unresolved`).
#
# `mixedscript_confusable` is a dict keyed by script name; each value is an
# inner dict keyed by the confusable code point converted to a string with
# `escape_char`, whose values are pairs:
#   pair[0] is the confusable code point itself, as an integer;
#   pair[1] is the list of code points it is mixed-script confusable with,
#           kept for debugging purposes only. The string 'multi' occurs in
#           this list when pair[0] is considered confusable with its
#           multiple code point prototype.
#
# `mixedscript_confusable_unresolved` is keyed by the escaped prototype list
# and holds (prototype, [source code points]) pairs for which
# `process_mixedscript_single_to_multi` had no applicable rule. It is usually
# empty; if a future Unicode version makes it non-empty, more rules need to be
# added to `process_mixedscript_single_to_multi`.
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
    # First, load all confusables data from confusables.txt
    confusables = load_confusables(f)

    # confusables.txt is reductive: it is meant for on-the-fly substitution,
    # and a code point that does not occur in it substitutes to itself. So if
    # the file says A -> C and B -> C (and implicitly C -> C), then A <-> B,
    # A <-> C and B <-> C are all confusable.
    # Here the lhs and rhs (prototype) operands are divided into equivalence
    # classes, with the rhs as the representative element. Since some rhs are
    # a single code point and others are not, they are collected separately
    # into `codepoint_map` and `multicodepoint_map`.
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        # According to the RFC, we'll skip those code points that are restricted from identifier usage.
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            # the escaped representation of the rhs is the key of its class
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # On creating a class, the representative element itself joins
                # it only if it is not restricted from identifier usage.
                if is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
                    codepoint_map[d_proto].append(d_proto_list[0])
            # the code point to be substituted always joins the class
            codepoint_map[d_proto].append(d_source)
        else:
            d_protos = escape_char_list(d_proto_list)
            # Multi code point case: the rhs is not directly usable as a class
            # member, but it is kept (one extra tuple level) for the later
            # examination of each lhs against it.
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    # Returns the debug list of `item` under `script`, creating the entries on first use.
    def confusable_entry_item(confusable, script, item_text, item):
        script_entry = confusable.setdefault(script, {})
        if item_text not in script_entry:
            script_entry[item_text] = (item, [])
        return script_entry[item_text][1]

    # Examines every pair of one equivalence class and records the members
    # that are confusable with a member of a different script.
    def record_cross_script_pairs(members):
        for idx, item_i in enumerate(members):
            for item_j in members[idx + 1:]:
                script_i = codepoint_script(item_i, scripts)
                script_j = codepoint_script(item_j, scripts)
                # If they're in the same script, just skip this pair.
                if script_i == script_j:
                    continue
                # If `item_i` is in a non-ignored script and `item_j` is in a
                # different one (maybe ignored), a use of `item_i` can be
                # suspicious in a document written in `script_j`, so it is
                # considered a mixed-script confusable code point. As much
                # information as possible is stored for later investigation.
                if not is_script_ignored_in_mixedscript(script_i):
                    confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
                # Do the same in reverse from `item_j` to `item_i`
                if not is_script_ignored_in_mixedscript(script_j):
                    confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)

    # First examine the code points having a single code point prototype.
    for members in codepoint_map.values():
        record_cross_script_pairs(members)

    # Then the code points sharing the same multiple code point prototype.
    for _, members in multicodepoint_map.values():
        record_cross_script_pairs(members)

    mixedscript_confusable_unresolved = {}
    # Finally check each code point against its multiple code point prototype.
    for proto_lst, members in multicodepoint_map.values():
        # If the prototype contains one or more restricted code point, then we skip it.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed) for c in proto_lst):
            continue
        for item_i in members:
            script_i = codepoint_script(item_i, scripts)
            # If it's in ignored script, we don't need to do anything here.
            if is_script_ignored_in_mixedscript(script_i):
                continue
            # Principle of the rules: when the substitution happens, no new
            # non-ignored script is introduced and its own script is not lost.
            processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
            if should_add:
                assert(processed)
                # Mark the single code point as confusable.
                confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
            if processed:
                # Finished dealing with this code point.
                continue
            # Not processed: we must be dealing with newer Unicode data that
            # introduced a new case. No exception is thrown here; the case is
            # collected so the caller can print the table before failing.
            proto_lst_text = escape_char_list(proto_lst)
            if proto_lst_text not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i)

    return (mixedscript_confusable, mixedscript_confusable_unresolved)
# Looks up the script of code point `c` in `scripts`, a sequence of
# (lo, hi, script) tuples with inclusive bounds, and returns the script of
# the first range containing `c`.
# Raises an Exception when no range contains `c`.
def codepoint_script(c, scripts):
    found = next((name for (lo, hi, name) in scripts if lo <= c <= hi), None)
    if found is None:
        raise Exception("Not in scripts: " + escape_char(c))
    return found
# Emit some useful information for debugging when further update happens.
#
# Writes a `/* ... */` comment block to `f`: a heading made of `text`, then,
# per script, one line for every confusable code point (in ascending code
# point order) showing the code points it is confusable with and their scripts.
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    f.write("/* " + text + "\n")
    for script, entries in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n")
        for source in sorted(item for (item, _) in entries.values()):
            # entries are keyed by the escaped form of the code point
            source_text = escape_char(source)
            targets = entries[source_text][1]
            f.write(source_text + " => " + escape_char_list(targets) + " // " + escape_script_list(targets, scripts)+ "\n")
    f.write("*/\n")
# Returns the distinct scripts of the items in `char_lst`, in order of first
# occurrence. The placeholder item 'multi' is reported as the pseudo script
# 'Z~multi'; every other item is looked up with `codepoint_script`.
def script_list(char_lst, scripts):
    seen = []
    for c in char_lst:
        script = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if script not in seen:
            seen.append(script)
    return seen
# Returns the textual form (`str` of a list) of the sorted, distinct scripts
# of the items in `char_lst`.
def escape_script_list(char_lst, scripts):
    return str(sorted(script_list(char_lst, scripts)))
# Reports the prototypes for which no mixed-script rule applied.
#
# Does nothing when `map` is empty. Otherwise prints (to stdout, `f` is not
# written to) a heading made of `text` and one line per prototype with its
# source code points and the scripts involved, then raises an Exception
# asking for the script to be updated with new rules.
def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    if len(map) == 0:
        return
    print("// " + text + "\n")
    for prototype_text, (prototype, source) in map.items():
        print(prototype_text + " => " + escape_char_list(source) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(source, scripts) + "\n")
    raise Exception("update the python script to add new rules for new data")
# Writes `content`, a comma-separated list of table items, to `f` as lines
# that each start with `indent` spaces. Items are joined with ", "; a new
# line is started (after writing the current one followed by ",") when adding
# the next item would bring the line to 98 characters or more.
# The last line is written without a trailing newline.
def format_table_content(f, content, indent):
    pad = " "*indent
    current = pad
    at_start = True
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # line is full: flush it and open a new one with this item
            f.write(current + ",\n")
            current = pad + piece
            continue
        current += piece if at_start else ", " + piece
        at_start = False
    f.write(current)
# Formats code point `c` as a Rust char literal of the form '\u{...}'
# with lowercase hexadecimal digits. The placeholder 'multi' is rendered as
# the quoted text "<multiple code points>" instead.
def escape_char(c):
    is_placeholder = (c == 'multi')
    if is_placeholder:
        return '"<multiple code points>"'
    return "'\\u{%x}'" % c
# Formats the items of `l` with `escape_char` and renders them as a
# bracketed list separated by ", ", e.g. ['\u{61}', '\u{62}'].
def escape_char_list(l):
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
# Writes one Rust table definition to `f`.
#
#   name     - identifier of the table
#   t_data   - iterable of table rows
#   t_type   - Rust type of the table
#   is_pub   - prefix the definition with `pub`
#   pfun     - formats one row as Rust source text; the default renders a
#              (lo, hi) pair of code points
#   is_const - declare with `const` (default) or with `let`
#
# The formatted rows are joined with "," and laid out by
# `format_table_content` with an indent of 8.
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    keyword = "const" if is_const else "let"
    if is_pub:
        keyword = "pub " + keyword
    f.write(" %s %s: %s = &[\n" % (keyword, name, t_type))
    rows = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, rows, 8)
    f.write("\n ];\n\n")
# Writes the `identifier` module to `f`: the `IdentifierType` enum, the two
# lookup functions, and the IDENTIFIER_STATUS / IDENTIFIER_TYPE tables built
# from IdentifierStatus.txt and IdentifierType.txt.
def emit_identifier_module(f):
    # Row formatters for the two tables.
    def format_status_row(x):
        return "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))

    def format_type_row(x):
        return "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2])

    f.write("pub mod identifier {")
    f.write("""
#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
#[allow(non_camel_case_types)]
/// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
pub enum IdentifierType {
// Restricted
Not_Character,
Deprecated,
Default_Ignorable,
Not_NFKC,
Not_XID,
Exclusion,
Obsolete,
Technical,
Uncommon_Use,
Limited_Use,
// Allowed
Inclusion,
Recommended
}
#[inline]
pub fn identifier_status_allowed(c: char) -> bool {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS)
}
}
#[inline]
pub fn identifier_type(c: char) -> Option<IdentifierType> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_range_value_table(c, IDENTIFIER_TYPE)
}
}
""")
    f.write(" // Identifier status table:\n")

    # Only the ranges with status `Allowed` go into the status table.
    status_ranges = load_properties("IdentifierStatus.txt")['Allowed']
    emit_table(f, "IDENTIFIER_STATUS", status_ranges, "&'static [(char, char)]", is_pub=False,
               pfun=format_status_row)

    # Flatten {type: [ranges]} into (lo, hi, type) rows ordered by `lo`.
    ranges_by_type = load_properties("IdentifierType.txt")
    type_table = sorted(
        ((lo, hi, ty) for ty in ranges_by_type for (lo, hi) in ranges_by_type[ty]),
        key=lambda row: row[0])
    emit_table(f, "IDENTIFIER_TYPE", type_table, "&'static [(char, char, IdentifierType)]", is_pub=False,
               pfun=format_type_row)
    f.write("}\n\n")
# Writes the `confusable_detection` module to `f`: the prototype lookup
# function and the CONFUSABLES table built from confusables.txt, sorted by
# source code point.
# Raises an Exception if the same source code point occurs more than once.
def emit_confusable_detection_module(f):
    f.write("pub mod confusable_detection {")
    f.write("""
#[inline]
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}
""")
    f.write(" // Confusable table:\n")
    confusable_table = load_confusables("confusables.txt")
    confusable_table.sort(key=lambda w: w[0])

    # The table is sorted, so duplicate keys can only be neighbours.
    for previous, current in zip(confusable_table, confusable_table[1:]):
        if previous[0] == current[0]:
            raise Exception("duplicate keys in confusables table: %s" % current[0])

    def format_row(x):
        return "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1]))

    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
               pfun=format_row)
    f.write("}\n\n")
# Returns the `Script::<LongName>` constant text for the script `name`,
# where the long name is looked up in `longforms` and stripped of
# surrounding whitespace.
def escape_script_constant(name, longforms):
    long_name = longforms[name].strip()
    return "Script::" + long_name
# Writes the `potential_mixed_script_confusable` module to `f`: the lookup
# function and the CONFUSABLES table, which lists every code point found to
# be potentially mixed-script confusable, sorted by code point.
#
# Raises (through `debug_emit_mixedscript_confusable_unresolved`) when the
# data contains prototypes that no mixed-script rule covers.
def emit_potiential_mixed_script_confusable(f):
    f.write("pub mod potential_mixed_script_confusable {")
    f.write("""
#[inline]
pub fn potential_mixed_script_confusable(c: char) -> bool {
match c as usize {
_ => super::util::bsearch_table(c, CONFUSABLES)
}
}
""")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    _, scripts = load_scripts("Scripts.txt")
    identifier_allowed = identifier_status_table['Allowed']
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)

    # Flip to True to also dump the full confusable table into the output
    # as a comment block.
    debug = False
    if debug:
        debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
    # Always run the unresolved check: it is a no-op for an empty table and
    # otherwise prints the uncovered cases and raises, so that new Unicode
    # data needing new rules cannot slip through silently.
    debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts)

    # Flatten {script: {text: (code point, debug list)}} into (code point, script) rows.
    confusable_table = [
        (pair[0], script)
        for script, lst in mixedscript_confusable.items()
        for pair in lst.values()
    ]
    confusable_table.sort(key=lambda w: w[0])
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [char]", is_pub=False,
               pfun=lambda x: "%s" % escape_char(x[0]))
    f.write("}\n\n")
# Writes the `util` module to `f`. It holds the binary-search helpers that
# the other generated modules call through `super::util::...`:
#   bsearch_table             - membership test in a sorted slice of chars
#   bsearch_value_table       - value lookup in a slice of (char, value) pairs
#   bsearch_range_table       - membership test in a slice of (lo, hi) ranges
#   bsearch_range_value_table - value lookup in a slice of (lo, hi, value) triples
def emit_util_mod(f):
    f.write("""
pub mod util {
use core::result::Result::{Ok, Err};
#[inline]
pub fn bsearch_table(c: char, r: &'static [char]) -> bool {
r.binary_search(&c).is_ok()
}
#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
match r.binary_search_by_key(&c, |&(k, _)| k) {
Ok(idx) => {
let (_, v) = r[idx];
Some(v)
}
Err(_) => None
}
}
#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
r.binary_search_by(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}).is_ok()
}
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
Some(cat)
}
Err(_) => None
}
}
}
""")
# Script entry point: regenerates tables.rs in the current directory.
if __name__ == "__main__":
    out_path = "tables.rs"
    # remove a previous output file before writing the new one
    if os.path.exists(out_path):
        os.remove(out_path)
    with open(out_path, "w") as rf:
        # write the file's preamble
        rf.write(preamble)
        version_item = """
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-security is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % UNICODE_VERSION
        rf.write(version_item)
        # Emit the modules in order: util helpers, identifier,
        # confusable_detection, potential_mixed_script_confusable.
        module_emitters = (
            emit_util_mod,
            emit_identifier_module,
            emit_confusable_detection_module,
            emit_potiential_mixed_script_confusable,
        )
        for emit_module in module_emitters:
            emit_module(rf)
|