summary | refs | log | tree | commit | diff | stats
path: root/vendor/unicode_categories/scripts
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
commit: 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree: 173a775858bd501c378080a10dca74132f05bc50 /vendor/unicode_categories/scripts
parent: Initial commit. (diff)
download: rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1. (tag: upstream/1.64.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/unicode_categories/scripts')
-rw-r--r-- vendor/unicode_categories/scripts/unicode.py 93
1 files changed, 93 insertions, 0 deletions
diff --git a/vendor/unicode_categories/scripts/unicode.py b/vendor/unicode_categories/scripts/unicode.py
new file mode 100644
index 000000000..6a35d1742
--- /dev/null
+++ b/vendor/unicode_categories/scripts/unicode.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+
+import collections
+import re
+
# Number of character literals placed on each line of a generated table.
column_size = 8

# Maps a two-letter general-category code (the third ';'-separated field of a
# UnicodeData.txt row) to a [major class, subclass] pair.  The two parts are
# concatenated to form the name of the generated table.
# NOTE(review): 'Ls' does not appear to be a general-category abbreviation
# (UAX #44 calls the cased-letter group 'LC'); it is kept unchanged here --
# confirm against upstream before touching it.
categories = {
    # Other
    "Cc": ["Other", "Control"],
    "Cf": ["Other", "Format"],
    "Cn": ["Other", "NotAssigned"],
    "Co": ["Other", "PrivateUse"],
    "Cs": ["Other", "Surrogate"],
    # Letter
    "Ls": ["Letter", "Cased"],
    "Ll": ["Letter", "Lowercased"],
    "Lm": ["Letter", "Modifier"],
    "Lo": ["Letter", "Other"],
    "Lt": ["Letter", "Titlecase"],
    "Lu": ["Letter", "Uppercase"],
    # Mark
    "Mc": ["Mark", "SpaceCombining"],
    "Me": ["Mark", "Enclosing"],
    "Mn": ["Mark", "Nonspacing"],
    # Number
    "Nd": ["Number", "DecimalDigit"],
    "Nl": ["Number", "Letter"],
    "No": ["Number", "Other"],
    # Punctuation
    "Pc": ["Punctuation", "Connector"],
    "Pd": ["Punctuation", "Dash"],
    "Pe": ["Punctuation", "Close"],
    "Pf": ["Punctuation", "FinalQuote"],
    "Pi": ["Punctuation", "InitialQuote"],
    "Po": ["Punctuation", "Other"],
    "Ps": ["Punctuation", "Open"],
    # Symbol
    "Sc": ["Symbol", "Currency"],
    "Sk": ["Symbol", "Modifier"],
    "Sm": ["Symbol", "Math"],
    "So": ["Symbol", "Other"],
    # Separator
    "Zl": ["Separator", "Line"],
    "Zp": ["Separator", "Paragraph"],
    "Zs": ["Separator", "Space"],
}
+
def generate_rows(path='UnicodeData.txt', expand_ranges=False):
    """Yield ``(char, category)`` pairs read from a UnicodeData.txt file.

    Each data row is split on ';'; the first field (the code point, as the
    hex string found in the file) and the third field (the general-category
    code) are yielded as a tuple.

    Args:
        path: File to read.  Defaults to 'UnicodeData.txt', resolved against
            the current working directory.
        expand_ranges: UnicodeData.txt abbreviates large blocks as a pair of
            consecutive rows whose names end in ", First>" and ", Last>".
            When false (the default) those two rows are yielded like any
            other row, so only the two end points of the block are reported.
            When true, every code point from First to Last inclusive is
            yielded, formatted as upper-case hex padded to at least four
            digits.

    Rows with fewer than three fields (for example a blank line) are skipped
    instead of raising IndexError.
    """
    # Code point of a "<..., First>" row still waiting for its "Last" row.
    range_start = None
    with open(path, 'r') as ucd:
        for line in ucd:
            fields = line.strip().split(';')
            if len(fields) < 3:
                # Blank or truncated row: nothing usable to report.
                continue
            char, name, category = fields[0], fields[1], fields[2]
            if expand_ranges and name.endswith(', First>'):
                range_start = int(char, 16)
                continue
            if range_start is not None and name.endswith(', Last>'):
                for code_point in range(range_start, int(char, 16) + 1):
                    yield ('{:04X}'.format(code_point), category)
                range_start = None
                continue
            yield (char, category)
+
+
def generate_dict(rows_gen):
    """Group chars by their general-category code.

    ``rows_gen`` is an iterable of ``(char, category)`` pairs.  The result is
    a ``collections.defaultdict(list)`` mapping each category code to the
    chars seen for it, in the order they were encountered.  Pairs whose
    category is 'Cs' are left out.
    """
    by_category = collections.defaultdict(list)
    for code_point, general_category in rows_gen:
        # for whatever reason, rust doesn't allow this class of characters
        # as unicode literals.
        if general_category != 'Cs':
            by_category[general_category].append(code_point)
    return by_category
+
def generate_tables(d):
    """Re-key a category-code mapping by its descriptive table name.

    For every key of ``d`` the matching entry of ``categories`` is looked up
    and its parts are joined without a separator (e.g. 'Lu' becomes
    'LetterUppercase').  The value stored under the new name is the same
    object that ``d`` held.  Returns a ``collections.defaultdict(list)``.

    Raises KeyError when ``d`` contains a code missing from ``categories``.
    """
    named = collections.defaultdict(list)
    named.update(
        (''.join(categories[code]), chars) for code, chars in d.items()
    )
    return named
+
def print_header():
    """Print the "autogenerated" banner comment followed by a blank line."""
    banner = "// This file is autogenerated by scripts/unicode.py.\n"
    print(banner)
+
def main():
    """Run the whole generator and print the result.

    Prints the header first, then feeds the rows from ``generate_rows()``
    through ``generate_dict`` and ``generate_tables`` and hands the named
    tables to ``output_tables``.
    """
    print_header()
    output_tables(generate_tables(generate_dict(generate_rows())))
+
def output_tables(d):
    """Print one Rust ``pub static`` char slice for each entry of ``d``.

    ``d`` maps a CamelCase table name to a list of code-point strings.
    Tables are printed in sorted key order; each is named with the
    upper-cased snake_case form of its key and holds one ``'\\u{...}'``
    literal per code point, ``column_size`` literals per line.

    Works on both Python 2.7 and Python 3: the literals are built as a real
    list (a Python 3 ``map`` object can be neither sliced nor measured with
    ``len``) and chunk offsets come from ``range`` (``xrange`` does not
    exist on Python 3).
    """
    for key in sorted(d.keys()):
        name = camel_to_snake_case(key).upper()
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        # NOTE(review): the indent literal below is a single space as it
        # appears in this copy of the script; whitespace in this copy looks
        # collapsed, so verify the intended width against upstream.
        table_lines = [
            ' ' + ', '.join(rust_unicode_escapes[start:start + column_size])
            for start in range(0, len(rust_unicode_escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))
+
def camel_to_snake_case(name):
    """Return ``name`` converted from CamelCase to lower-case snake_case.

    An underscore is inserted before each capitalised word and between a
    lower-case letter or digit and a following capital; the result is then
    lower-cased.
    """
    # thanks to http://stackoverflow.com/a/1176023/1030074
    word_start = re.compile('(.)([A-Z][a-z]+)')
    case_change = re.compile('([a-z0-9])([A-Z])')
    words_split = word_start.sub(r'\1_\2', name)
    fully_split = case_change.sub(r'\1_\2', words_split)
    return fully_split.lower()
+
# Generate the tables only when this file is executed as a script; importing
# it has no side effects.
if __name__ == '__main__':
    main()