vendor/unicode_categories/scripts/unicode.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

#!/usr/bin/python

import collections
import re

# Number of character literals emitted per line of a generated table
# (used by output_tables when chunking the escapes).
column_size = 8

# Maps a two-letter general-category abbreviation, as found in the third
# field of UnicodeData.txt, to the [major, minor] name parts.  The parts are
# concatenated by generate_tables to form the table name, so changing any of
# them changes the identifiers of the generated tables.
# NOTE(review): 'Ls' does not look like a standard general-category
# abbreviation (Unicode uses 'LC' for cased letters) -- confirm before
# relying on that entry.
categories = {
    # Other
    "Cc": ["Other", "Control"],
    "Cf": ["Other", "Format"],
    "Cn": ["Other", "NotAssigned"],
    "Co": ["Other", "PrivateUse"],
    "Cs": ["Other", "Surrogate"],
    # Letter
    "Ls": ["Letter", "Cased"],
    "Ll": ["Letter", "Lowercased"],
    "Lm": ["Letter", "Modifier"],
    "Lo": ["Letter", "Other"],
    "Lt": ["Letter", "Titlecase"],
    "Lu": ["Letter", "Uppercase"],
    # Mark
    "Mc": ["Mark", "SpaceCombining"],
    "Me": ["Mark", "Enclosing"],
    "Mn": ["Mark", "Nonspacing"],
    # Number
    "Nd": ["Number", "DecimalDigit"],
    "Nl": ["Number", "Letter"],
    "No": ["Number", "Other"],
    # Punctuation
    "Pc": ["Punctuation", "Connector"],
    "Pd": ["Punctuation", "Dash"],
    "Pe": ["Punctuation", "Close"],
    "Pf": ["Punctuation", "FinalQuote"],
    "Pi": ["Punctuation", "InitialQuote"],
    "Po": ["Punctuation", "Other"],
    "Ps": ["Punctuation", "Open"],
    # Symbol
    "Sc": ["Symbol", "Currency"],
    "Sk": ["Symbol", "Modifier"],
    "Sm": ["Symbol", "Math"],
    "So": ["Symbol", "Other"],
    # Separator
    "Zl": ["Separator", "Line"],
    "Zp": ["Separator", "Paragraph"],
    "Zs": ["Separator", "Space"],
}

def generate_rows():
    """Yield a (code point, category) pair for every character in UnicodeData.txt.

    Reads 'UnicodeData.txt' from the current working directory.  Each line is
    a ';'-separated record whose first field is the code point in hex and
    whose third field is the general-category abbreviation.

    Large blocks are stored in the file as two rows whose names end in
    ', First>' and ', Last>'.  Such a pair is expanded so that every code
    point in the inclusive range is yielded; otherwise only the two end
    points would ever reach the generated tables.

    Yields:
        Tuples of (hex code point string, category abbreviation).
    """
    # (code point, category) of a '<..., First>' row still waiting for its
    # matching '<..., Last>' row.
    pending_first = None
    with open('UnicodeData.txt', 'r') as ucd:
        for line in ucd:
            fields = line.strip().split(';')
            if len(fields) < 3:
                # Blank or truncated line: nothing to classify.
                continue
            char, name, category = fields[0], fields[1], fields[2]

            if pending_first is not None:
                first_char, first_category = pending_first
                pending_first = None
                if name.endswith(', Last>'):
                    for code_point in range(int(first_char, 16),
                                            int(char, 16) + 1):
                        # Same formatting as the file: upper-case hex,
                        # padded to at least four digits.
                        yield ('{:04X}'.format(code_point), first_category)
                    continue
                # Unpaired 'First' row: fall back to treating it as an
                # ordinary single-character record.
                yield (first_char, first_category)

            if name.endswith(', First>'):
                pending_first = (char, category)
                continue
            yield (char, category)

    if pending_first is not None:
        # File ended before the range was closed; keep the lone start row.
        yield pending_first


def generate_dict(rows_gen):
    """Group code points by their general-category abbreviation.

    Args:
        rows_gen: iterable of (code point, category) pairs.

    Returns:
        A defaultdict(list) mapping each category to its code points, in the
        order they were produced.  The 'Cs' category is left out entirely.
    """
    grouped = collections.defaultdict(list)
    for code_point, general_category in rows_gen:
        # Surrogates ('Cs') are dropped: rust does not accept this class of
        # characters as unicode literals.
        if general_category != 'Cs':
            grouped[general_category].append(code_point)
    return grouped

def generate_tables(d):
    """Re-key a category dictionary by full category name.

    Args:
        d: mapping of category abbreviation to a list of code points.

    Returns:
        A defaultdict(list) keyed by the concatenated name parts from the
        module-level `categories` table (e.g. 'Lu' -> 'LetterUppercase'),
        holding the same lists as `d`.

    Raises:
        KeyError: if `d` contains an abbreviation missing from `categories`.
    """
    named = collections.defaultdict(list)
    for abbreviation, code_points in d.items():
        full_name = ''.join(categories[abbreviation])
        named[full_name] = code_points
    return named

def print_header():
    """Print the 'autogenerated' banner comment followed by a blank line."""
    banner = "// This file is autogenerated by scripts/unicode.py.\n"
    print(banner)

def main():
    """Print the generated file: the header, then one table per category.

    The pipeline is: read rows from UnicodeData.txt, group them by category
    abbreviation, re-key by full category name, and print the tables.
    """
    print_header()
    by_abbreviation = generate_dict(generate_rows())
    output_tables(generate_tables(by_abbreviation))

def output_tables(d):
    """Print one Rust `pub static` char-slice table per category.

    Tables are emitted in sorted key order.  Each table is named after the
    upper-cased snake_case form of its key and lists its code points as
    `'\\u{XXXX}'` literals, `column_size` per line.

    Args:
        d: mapping of CamelCase category name to a list of hex code point
            strings.
    """
    for key in sorted(d.keys()):
        name = camel_to_snake_case(key).upper()
        # Build a real list (not a lazy map object) so it can be sliced and
        # measured on Python 3 as well as Python 2.
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        # `range` exists on both Python 2 and 3, unlike `xrange`.
        table_lines = [
            '    ' + ', '.join(rust_unicode_escapes[start:start + column_size])
            for start in range(0, len(rust_unicode_escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))

def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Two substitution passes, per http://stackoverflow.com/a/1176023/1030074:
    the first separates a capitalised word from whatever precedes it, the
    second separates a lower-case letter or digit from a following capital.
    """
    before_capitalised_word = re.compile('(.)([A-Z][a-z]+)')
    lower_then_upper = re.compile('([a-z0-9])([A-Z])')
    partially_split = before_capitalised_word.sub(r'\1_\2', name)
    fully_split = lower_then_upper.sub(r'\1_\2', partially_split)
    return fully_split.lower()

# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()