diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/unicode_categories/scripts | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1. upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/unicode_categories/scripts')
-rw-r--r-- | vendor/unicode_categories/scripts/unicode.py | 93 |
1 files changed, 93 insertions, 0 deletions
#!/usr/bin/python
"""Generate Rust tables of characters grouped by Unicode general category.

Reads ``UnicodeData.txt`` (by default from the current working directory) and
prints to standard output one ``pub static NAME : &'static [char]`` table per
general category found in the data.

This script was added as vendor/unicode_categories/scripts/unicode.py.
"""

import collections
import re

# Number of character literals emitted on each line of a generated table.
column_size = 8

# Maps a two-letter general-category abbreviation to [major, minor] name
# parts.  The parts are concatenated (e.g. 'LetterUppercase') and later
# converted to an upper-case snake-case Rust identifier.
# NOTE(review): 'Ls' does not look like a standard general-category
# abbreviation (UAX #44 uses 'LC' for cased letters) -- confirm.  The names
# are left untouched because they determine the generated identifiers.
categories = {
    'Cc': ['Other', 'Control'],
    'Cf': ['Other', 'Format'],
    'Cn': ['Other', 'NotAssigned'],
    'Co': ['Other', 'PrivateUse'],
    'Cs': ['Other', 'Surrogate'],
    'Ls': ['Letter', 'Cased'],
    'Ll': ['Letter', 'Lowercased'],
    'Lm': ['Letter', 'Modifier'],
    'Lo': ['Letter', 'Other'],
    'Lt': ['Letter', 'Titlecase'],
    'Lu': ['Letter', 'Uppercase'],
    'Mc': ['Mark', 'SpaceCombining'],
    'Me': ['Mark', 'Enclosing'],
    'Mn': ['Mark', 'Nonspacing'],
    'Nd': ['Number', 'DecimalDigit'],
    'Nl': ['Number', 'Letter'],
    'No': ['Number', 'Other'],
    'Pc': ['Punctuation', 'Connector'],
    'Pd': ['Punctuation', 'Dash'],
    'Pe': ['Punctuation', 'Close'],
    'Pf': ['Punctuation', 'FinalQuote'],
    'Pi': ['Punctuation', 'InitialQuote'],
    'Po': ['Punctuation', 'Other'],
    'Ps': ['Punctuation', 'Open'],
    'Sc': ['Symbol', 'Currency'],
    'Sk': ['Symbol', 'Modifier'],
    'Sm': ['Symbol', 'Math'],
    'So': ['Symbol', 'Other'],
    'Zl': ['Separator', 'Line'],
    'Zp': ['Separator', 'Paragraph'],
    'Zs': ['Separator', 'Space']
}


def generate_rows(path='UnicodeData.txt'):
    """Yield ``(code_point, category)`` string pairs from a UnicodeData file.

    ``path`` is the file to read; it defaults to ``UnicodeData.txt`` in the
    current working directory.  Each non-blank line is split on ``';'``; the
    first field is yielded as the code point and the third as the category.

    NOTE(review): rows that describe a range (names ending in ``First>`` /
    ``Last>``) are yielded like any other row, so only the two end points of
    such a range reach the tables -- confirm this is intended.
    """
    with open(path, 'r') as ucd:
        for line in ucd:
            # A blank line has no third field and would raise IndexError.
            if not line.strip():
                continue
            fields = line.split(';')
            yield (fields[0], fields[2])


def generate_dict(rows_gen):
    """Group code points by category abbreviation.

    ``rows_gen`` is an iterable of ``(code_point, category)`` pairs.  Returns
    a ``defaultdict(list)`` mapping each category to its code points in input
    order.  Category ``'Cs'`` is dropped entirely.
    """
    d = collections.defaultdict(list)
    for char, category in rows_gen:
        if category == 'Cs':
            # Rust's `char` is a Unicode scalar value, so surrogate code
            # points cannot be written as char literals.
            continue
        d[category].append(char)
    return d


def generate_tables(d):
    """Re-key ``d`` from category abbreviations to concatenated long names.

    Returns a ``defaultdict(list)`` whose keys are e.g. ``'LetterUppercase'``.
    Raises ``KeyError`` if a key of ``d`` is missing from ``categories``.
    """
    new_dict = collections.defaultdict(list)
    for key in d:
        name = ''.join(categories[key])
        new_dict[name] = d[key]
    return new_dict


def print_header():
    """Print the "autogenerated" banner comment that starts the output."""
    print("// This file is autogenerated by scripts/unicode.py.\n")


def output_tables(d):
    """Print one Rust ``pub static`` char table per key of ``d``.

    ``d`` maps a CamelCase table name to a list of hexadecimal code-point
    strings.  Tables are printed in sorted key order, ``column_size`` literals
    per line.
    """
    for key in sorted(d):
        name = camel_to_snake_case(key).upper()
        # Build a real list: it is sliced below, which a Python 3 `map`
        # object does not support.
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        table_lines = []
        # `range` (not `xrange`) keeps this working on Python 2 and 3.
        for start in range(0, len(rust_unicode_escapes), column_size):
            chunk = rust_unicode_escapes[start:start + column_size]
            # NOTE(review): indent width is as it appears in the collapsed
            # source this was recovered from -- confirm.
            table_lines.append(' ' + ', '.join(chunk))
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))


def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower-case snake_case."""
    # thanks to http://stackoverflow.com/a/1176023/1030074
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()


def main():
    """Read the Unicode data and print the header followed by all tables."""
    print_header()
    row_generator = generate_rows()
    dictionary = generate_dict(row_generator)
    named_table = generate_tables(dictionary)
    output_tables(named_table)


if __name__ == "__main__":
    main()