From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/unicode-script/scripts/unicode.py | 402 +++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 vendor/unicode-script/scripts/unicode.py (limited to 'vendor/unicode-script/scripts') diff --git a/vendor/unicode-script/scripts/unicode.py b/vendor/unicode-script/scripts/unicode.py new file mode 100644 index 000000000..e40a92c6d --- /dev/null +++ b/vendor/unicode-script/scripts/unicode.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python +# +# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# This script uses the following Unicode tables: +# - PropertyValueAliases.txt +# - ScriptExtensions.txt +# - Scripts.txt +# +# Since this should not require frequent updates, we just store this +# out-of-line and check the unicode.rs file into git. + +import fileinput, re, os, sys + +preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly + +#![allow(missing_docs, non_upper_case_globals, non_snake_case)] + +pub use tables_impl::*; + +#[rustfmt::skip] +mod tables_impl { +use crate::ScriptExtension; +''' + +# Close `mod impl {` +ending=''' +} +''' + +UNICODE_VERSION = (13, 0, 0) + +UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION + +def escape_char(c): + return "'\\u{%x}'" % c + +def fetch(f): + if not os.path.exists(os.path.basename(f)): + if "emoji" in f: + os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s" + % (UNICODE_VERSION[0], UNICODE_VERSION[1], f)) + else: + os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s" + % (UNICODE_VERSION_NUMBER, f)) + + if not os.path.exists(os.path.basename(f)): + sys.stderr.write("cannot load %s" % f) + exit(1) + +def group_cats(cats): + cats_out = {} + for cat in cats: + cats_out[cat] = group_cat(cats[cat]) + return cats_out + +def aliases(): + """ + Fetch the shorthand aliases for each longhand Script name + """ + fetch("PropertyValueAliases.txt") + longforms = {} + shortforms = {} + re1 = re.compile(r"^ *sc *; *(\w+) *; *(\w+)") + for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")): + m = re1.match(line) + if m: + l = m.group(2).strip() + s = m.group(1).strip() + assert(s not in longforms) + assert(l not in shortforms) + longforms[s] = l + shortforms[l] = s + else: + continue + + return (longforms, shortforms) + +def format_table_content(f, content, indent): + line = " "*indent + first = True + for chunk in content.split(","): + if len(line) + len(chunk) < 98: + if first: + line += chunk + else: + line += ", " + chunk + first = False + else: + f.write(line + ",\n") + line = " "*indent + chunk + f.write(line) + +# Implementation from unicode-segmentation +def load_properties(f, interestingprops): + fetch(f) + props = {} + # Note: these regexes are different from those in unicode-segmentation, + # becase we need to handle spaces here + re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#") + re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#") + + for line in fileinput.input(os.path.basename(f)): + prop = None + d_lo = 0 + d_hi = 0 + m = re1.match(line) + if m: + d_lo = m.group(1) + d_hi = m.group(1) + prop = m.group(2).strip() + else: + m = re2.match(line) + if m: + d_lo = m.group(1) + d_hi = m.group(2) + prop = m.group(3).strip() + else: + continue + if interestingprops and prop not in interestingprops: + continue + d_lo = int(d_lo, 16) + d_hi = int(d_hi, 16) + if prop not in props: + props[prop] = [] + props[prop].append((d_lo, d_hi)) + + return props + +# Implementation from unicode-segmentation +def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, + pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True): + pub_string = "const" + if not is_const: + pub_string = "let" + if is_pub: + pub_string = "pub " + pub_string + f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type)) + data = "" + first = True + for dat in t_data: + if not first: + data += "," + first = False + data += pfun(dat) + format_table_content(f, data, 8) + f.write("\n ];\n\n") + +def emit_search(f): + f.write(""" +pub fn bsearch_range_value_table(c: char, r: &'static [(char, char, T)]) -> Option { + use core::cmp::Ordering::{Equal, Less, Greater}; + match r.binary_search_by(|&(lo, hi, _)| { + if lo <= c && c <= hi { Equal } + else if hi < c { Less } + else { Greater } + }) { + Ok(idx) => { + let (_, _, cat) = r[idx]; + Some(cat) + } + Err(_) => None + } +} + +#[inline] +pub fn get_script(c: char) -> Option