diff options
Diffstat (limited to 'vendor/unicode-properties/scripts/unicode.py')
-rw-r--r-- | vendor/unicode-properties/scripts/unicode.py | 542 |
1 files changed, 542 insertions, 0 deletions
#!/usr/bin/env python3
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode UCD data:
# - UnicodeData.txt
# - emoji/emoji-data.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs file into git.

import fileinput
import operator
import os
import re
import subprocess
import sys

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

UNICODE_VERSION = (15, 0, 0)

UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION

# Maps the two-letter General_Category codes found in UnicodeData.txt to the
# Rust enum variant written into the generated table.
GC_VARIANTS = {
    "Lu": "GeneralCategory::UppercaseLetter",
    "Ll": "GeneralCategory::LowercaseLetter",
    "Lt": "GeneralCategory::TitlecaseLetter",
    "Lm": "GeneralCategory::ModifierLetter",
    "Lo": "GeneralCategory::OtherLetter",
    "Mn": "GeneralCategory::NonspacingMark",
    "Mc": "GeneralCategory::SpacingMark",
    "Me": "GeneralCategory::EnclosingMark",
    "Nd": "GeneralCategory::DecimalNumber",
    "Nl": "GeneralCategory::LetterNumber",
    "No": "GeneralCategory::OtherNumber",
    "Pc": "GeneralCategory::ConnectorPunctuation",
    "Pd": "GeneralCategory::DashPunctuation",
    "Ps": "GeneralCategory::OpenPunctuation",
    "Pe": "GeneralCategory::ClosePunctuation",
    "Pi": "GeneralCategory::InitialPunctuation",
    "Pf": "GeneralCategory::FinalPunctuation",
    "Po": "GeneralCategory::OtherPunctuation",
    "Sm": "GeneralCategory::MathSymbol",
    "Sc": "GeneralCategory::CurrencySymbol",
    "Sk": "GeneralCategory::ModifierSymbol",
    "So": "GeneralCategory::OtherSymbol",
    "Zs": "GeneralCategory::SpaceSeparator",
    "Zl": "GeneralCategory::LineSeparator",
    "Zp": "GeneralCategory::ParagraphSeparator",
    "Cc": "GeneralCategory::Control",
    "Cf": "GeneralCategory::Format",
    "Cs": "GeneralCategory::Surrogate",
    "Co": "GeneralCategory::PrivateUse",
    "Cn": "GeneralCategory::Unassigned",
}

# Maps a ";"-joined list of active emoji properties (in the order used by
# emit_emoji_module) to the Rust `EmojiStatus` expression emitted for it.
EMOJI_STATUS_BY_PROPS = {
    "Surrogate": "<Surrogate>",
    "": "EmojiStatus::NonEmoji",
    "Emoji_Component": "EmojiStatus::NonEmojiButEmojiComponent",
    "Emoji;Emoji_Presentation": "EmojiStatus::EmojiPresentation",
    "Emoji;Emoji_Presentation;Emoji_Modifier_Base":
        "EmojiStatus::EmojiPresentationAndModifierBase",
    "Emoji;Emoji_Modifier_Base": "EmojiStatus::EmojiModifierBase",
    "Emoji": "EmojiStatus::EmojiOther",
    "Emoji;Emoji_Presentation;Emoji_Component":
        "EmojiStatus::EmojiPresentationAndEmojiComponent",
    "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component":
        "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent",
    "Emoji;Emoji_Component": "EmojiStatus::EmojiOtherAndEmojiComponent",
}


# Download a UCD table file
def fetch_unidata(f):
    """Ensure the UCD file `f` exists in the current directory.

    `f` is a path relative to the UCD root of UNICODE_VERSION_NUMBER (for
    example "emoji/emoji-data.txt"); it is stored locally under its basename.
    If the file is still missing after the download attempt, an error is
    written to stderr and the script exits with status 1.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = "https://www.unicode.org/Public/%s/ucd/%s" % (UNICODE_VERSION_NUMBER, f)
        try:
            # Argument list instead of a shell string; --fail keeps curl from
            # saving an HTTP error page under the data file's name.
            subprocess.run(["curl", "--fail", "-O", url], check=False)
        except OSError:
            # curl is not installed / not runnable; the existence check
            # below reports the failure.
            pass

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)


# Loads code point data from emoji-data.txt
# Implementation from unicode-segmentation
def load_emoji_properties(f):
    """Parse an emoji-data.txt style file into per-property range lists.

    Returns a dict mapping each property name to a list of inclusive
    `(lo, hi)` code point tuples, in the order the file lists them.
    Lines that match neither the single-code-point nor the range pattern
    (comments, blanks) are skipped.
    """
    fetch_unidata(f)
    kinds = {}
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) *#")

    with open(os.path.basename(f), encoding="utf-8") as data:
        for line in data:
            m = single_re.match(line)
            if m:
                lo_text = hi_text = m.group(1)
                kind = m.group(2).strip()
            else:
                m = range_re.match(line)
                if not m:
                    continue
                lo_text = m.group(1)
                hi_text = m.group(2)
                kind = m.group(3).strip()
            kinds.setdefault(kind, []).append((int(lo_text, 16), int(hi_text, 16)))

    return kinds


def load_general_category_properties(f):
    """Parse a UnicodeData.txt style file into `(lo, hi, gc)` tuples.

    Ordinary rows and single `<...>` rows yield one-code-point entries.
    A `<Name, First>` row followed by a `<Name, Last>` row yields one entry
    spanning the whole range.

    Raises:
        ValueError: if a `Last` row does not pair with the preceding
            `First` row (different name or general category), or a
            `<`-prefixed name has an unrecognized shape.
    """
    fetch_unidata(f)
    general_category_list = []
    row_re = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
    first_re = re.compile(r"^<(.*), First>$")
    last_re = re.compile(r"^<(.*), Last>$")
    bracket_re = re.compile(r"^<(.*)>$")

    range_lo = 0
    range_text = ''
    range_gc = ''
    # Tracks whether a "<..., First>" row is waiting for its "Last" partner.
    range_open = False
    with open(os.path.basename(f), encoding="utf-8") as data:
        for line in data:
            m = row_re.match(line)
            if not m:
                continue

            code_point = int(m.group(1), 16)
            name = m.group(2).strip()
            gc = m.group(3).strip()

            if not name.startswith('<'):
                general_category_list.append((code_point, code_point, gc))
                continue

            m_first = first_re.match(name)
            if m_first:
                range_lo = code_point
                range_text = m_first.group(1)
                range_gc = gc
                range_open = True
                continue

            m_last = last_re.match(name)
            if m_last:
                # Explicit checks rather than `assert`, which is stripped
                # when Python runs with -O.
                if not range_open or range_text != m_last.group(1):
                    raise ValueError(
                        "range end %r does not match range start %r"
                        % (m_last.group(1), range_text))
                if range_gc != gc:
                    raise ValueError(
                        "range %r changes category from %s to %s"
                        % (range_text, range_gc, gc))
                general_category_list.append((range_lo, code_point, gc))
                range_open = False
                continue

            if bracket_re.match(name):
                general_category_list.append((code_point, code_point, gc))
                continue

            raise ValueError("unreachable")
    return general_category_list


def merge_adjacent_ranges(ranges, skip_value=None):
    """Coalesce consecutive `(lo, hi, value)` entries that touch and agree.

    Two neighbouring entries are merged when the second starts exactly one
    past the end of the first and both carry the same value. Entries whose
    value equals `skip_value` are dropped before merging.
    """
    merged = []
    for lo, hi, value in ranges:
        if value == skip_value:
            continue
        if merged and merged[-1][1] + 1 == lo and merged[-1][2] == value:
            merged[-1] = (merged[-1][0], hi, value)
        else:
            merged.append((lo, hi, value))
    return merged


def partition_by_properties(prop_ranges, prop_order, first, last):
    """Split `first..last` into maximal runs with a constant property set.

    `prop_ranges` maps each property name to an ascending list of inclusive
    `(lo, hi)` ranges; `prop_order` fixes both which properties are
    considered and the order of names in the result. Returns a list of
    `(lo, hi, props)` tuples covering `first..last` contiguously, where
    `props` is the ";"-joined names of the properties active on that run
    (empty string when none are).
    """
    cursors = [0] * len(prop_order)
    groups = []
    group_lo = first
    while group_lo <= last:
        active = []
        group_hi = last
        # (property index, range end) for every range covering group_lo.
        covering = []
        for idx, prop in enumerate(prop_order):
            ranges = prop_ranges[prop]
            if cursors[idx] >= len(ranges):
                continue
            range_lo, range_hi = ranges[cursors[idx]]
            if range_lo > group_lo:
                # Property starts later: the run must stop just before it.
                group_hi = min(group_hi, range_lo - 1)
            else:
                active.append(prop)
                group_hi = min(group_hi, range_hi)
                covering.append((idx, range_hi))
        groups.append((group_lo, group_hi, ";".join(active)))
        # Advance past every range that ends exactly where this run ends.
        for idx, range_hi in covering:
            if range_hi == group_hi:
                cursors[idx] += 1
        group_lo = group_hi + 1
    return groups


def emoji_group_text(s):
    """Return the Rust `EmojiStatus` expression for a property combination.

    `s` is the ";"-joined property list produced by partition_by_properties.
    "Surrogate" maps to the "<Surrogate>" marker. An unknown combination
    maps to an `EmojiStatus::NewCombination("...")` expression, a variant
    the emitted enum does not define.
    """
    try:
        return EMOJI_STATUS_BY_PROPS[s]
    except KeyError:
        return "EmojiStatus::NewCombination(\"" + s + "\")"


def format_table_content(f, content, indent):
    """Write comma-separated `content` to `f`, wrapped and indented.

    `content` is split on every "," and the pieces are re-joined with ", ";
    a new line (prefixed with `indent` spaces) is started whenever adding
    the next piece would bring the line to 98 characters or more. No
    trailing newline is written.
    """
    line = " " * indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            f.write(line + ",\n")
            line = " " * indent + chunk
    f.write(line)


def escape_char(c):
    """Return the Rust `char` literal for code point `c`.

    The sentinel string 'multi' yields a quoted placeholder instead.
    """
    if c == 'multi':
        return "\"<multiple code points>\""
    return "'\\u{%x}'" % c


def escape_char_list(l):
    """Return a Rust array literal holding the escaped form of each item."""
    return "[" + ", ".join(escape_char(c) for c in l) + "]"


def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust slice item named `name` to `f`.

    Args:
        f: writable text file object.
        name: Rust identifier of the table.
        t_data: iterable of entries; each is rendered by `pfun`.
        t_type: Rust type of the table.
        is_pub: prefix the declaration with `pub`.
        pfun: renders one entry as Rust source.
        is_const: declare with `const` (otherwise `let`).
    """
    pub_string = "const" if is_const else "let"
    if is_pub:
        pub_string = "pub " + pub_string
    f.write("    %s %s: %s = &[\n" % (pub_string, name, t_type))
    data = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, data, 8)
    f.write("\n    ];\n\n")


def emit_general_category_module(f):
    """Write the `general_category` Rust module to `f`.

    Emits the enums and helper functions verbatim, then a GENERAL_CATEGORY
    table built from UnicodeData.txt with surrogate (`Cs`) entries dropped
    and adjacent same-category ranges merged.
    """
    f.write("""#[cfg(feature = \"general-category\")]
pub mod general_category {""")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    /// The most general classification of a character.
    pub enum GeneralCategory {
        /// `Lu`, an uppercase letter
        UppercaseLetter,
        /// `Ll`, a lowercase letter
        LowercaseLetter,
        /// `Lt`, a digraphic character, with first part uppercase
        TitlecaseLetter,
        /// `Lm`, a modifier letter
        ModifierLetter,
        /// `Lo`, other letters, including syllables and ideographs
        OtherLetter,
        /// `Mn`, a nonspacing combining mark (zero advance width)
        NonspacingMark,
        /// `Mc`, a spacing combining mark (positive advance width)
        SpacingMark,
        /// `Me`, an enclosing combining mark
        EnclosingMark,
        /// `Nd`, a decimal digit
        DecimalNumber,
        /// `Nl`, a letterlike numeric character
        LetterNumber,
        /// `No`, a numeric character of other type
        OtherNumber,
        /// `Pc`, a connecting punctuation mark, like a tie
        ConnectorPunctuation,
        /// `Pd`, a dash or hyphen punctuation mark
        DashPunctuation,
        /// `Ps`, an opening punctuation mark (of a pair)
        OpenPunctuation,
        /// `Pe`, a closing punctuation mark (of a pair)
        ClosePunctuation,
        /// `Pi`, an initial quotation mark
        InitialPunctuation,
        /// `Pf`, a final quotation mark
        FinalPunctuation,
        /// `Po`, a punctuation mark of other type
        OtherPunctuation,
        /// `Sm`, a symbol of mathematical use
        MathSymbol,
        /// `Sc`, a currency sign
        CurrencySymbol,
        /// `Sk`, a non-letterlike modifier symbol
        ModifierSymbol,
        /// `So`, a symbol of other type
        OtherSymbol,
        /// `Zs`, a space character (of various non-zero widths)
        SpaceSeparator,
        /// `Zl`, U+2028 LINE SEPARATOR only
        LineSeparator,
        /// `Zp`, U+2029 PARAGRAPH SEPARATOR only
        ParagraphSeparator,
        /// `Cc`, a C0 or C1 control code
        Control,
        /// `Cf`, a format control character
        Format,
        /// `Cs`, a surrogate code point
        Surrogate,
        /// `Co`, a private-use character
        PrivateUse,
        /// `Cn`, a reserved unassigned code point or a noncharacter
        Unassigned,
    }

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    /// Groupings of the most general classification of a character.
    pub enum GeneralCategoryGroup {
        /// Lu | Ll | Lt | Lm | Lo
        Letter,
        /// Mn | Mc | Me
        Mark,
        /// Nd | Nl | No
        Number,
        /// Pc | Pd | Ps | Pe | Pi | Pf | Po
        Punctuation,
        /// Sm | Sc | Sk | So
        Symbol,
        /// Zs | Zl | Zp
        Separator,
        /// Cc | Cf | Cs | Co | Cn
        Other,
    }

    #[inline]
    pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
        match c as usize {
            _ => super::util::bsearch_range_value_table(c, GENERAL_CATEGORY).unwrap_or(GeneralCategory::Unassigned)
        }
    }

    #[inline]
    pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
        matches!(gc, GeneralCategory::UppercaseLetter | GeneralCategory::LowercaseLetter | GeneralCategory::TitlecaseLetter)
    }

    #[inline]
    pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
        match gc {
            GeneralCategory::UppercaseLetter |
            GeneralCategory::LowercaseLetter |
            GeneralCategory::TitlecaseLetter |
            GeneralCategory::ModifierLetter |
            GeneralCategory::OtherLetter => GeneralCategoryGroup::Letter,
            GeneralCategory::NonspacingMark |
            GeneralCategory::SpacingMark |
            GeneralCategory::EnclosingMark => GeneralCategoryGroup::Mark,
            GeneralCategory::DecimalNumber |
            GeneralCategory::LetterNumber |
            GeneralCategory::OtherNumber => GeneralCategoryGroup::Number,
            GeneralCategory::ConnectorPunctuation |
            GeneralCategory::DashPunctuation |
            GeneralCategory::OpenPunctuation |
            GeneralCategory::ClosePunctuation |
            GeneralCategory::InitialPunctuation |
            GeneralCategory::FinalPunctuation |
            GeneralCategory::OtherPunctuation => GeneralCategoryGroup::Punctuation,
            GeneralCategory::MathSymbol |
            GeneralCategory::CurrencySymbol |
            GeneralCategory::ModifierSymbol |
            GeneralCategory::OtherSymbol => GeneralCategoryGroup::Symbol,
            GeneralCategory::SpaceSeparator |
            GeneralCategory::LineSeparator |
            GeneralCategory::ParagraphSeparator => GeneralCategoryGroup::Separator,
            GeneralCategory::Control |
            GeneralCategory::Format |
            GeneralCategory::Surrogate |
            GeneralCategory::PrivateUse |
            GeneralCategory::Unassigned => GeneralCategoryGroup::Other,
        }
    }
""")

    f.write("    // General category table:\n")
    general_category_char_table = load_general_category_properties("UnicodeData.txt")
    # Surrogates are dropped: the table is keyed by Rust `char`, which
    # cannot hold them.
    general_category_group_table = merge_adjacent_ranges(
        general_category_char_table, skip_value="Cs")
    emit_table(f, "GENERAL_CATEGORY", general_category_group_table,
               "&'static [(char, char, GeneralCategory)]", is_pub=False,
               pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), GC_VARIANTS[x[2]]))
    f.write("}\n\n")


def emit_emoji_module(f):
    """Write the `emoji` Rust module to `f`.

    Emits the `EmojiStatus` enum and helpers verbatim, then an EMOJI_STATUS
    table that partitions U+0000..U+10FFFF (minus surrogates) into runs
    sharing the same combination of emoji properties from emoji-data.txt.
    """
    # NOTE(review): the emitted `is_emoji_status_for_emoji_component` does not
    # list `NonEmojiButEmojiComponent`, although that variant is documented
    # as `Emoji_Component=YES`. Looks unintended - confirm before changing,
    # since it alters the generated crate's behaviour.
    f.write("""#[cfg(feature = \"emoji\")]
pub mod emoji {""")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    #[non_exhaustive]
    /// The emoji character properties of a character.
    pub enum EmojiStatus {
        /// `Emoji=NO`, `Emoji_Component=NO`
        NonEmoji,
        /// `Emoji=NO`, `Emoji_Component=YES`
        NonEmojiButEmojiComponent,
        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`
        EmojiPresentation,
        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Modifier_Base=YES`
        EmojiModifierBase,
        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`, `Emoji_Modifier_Base=YES`
        EmojiPresentationAndModifierBase,
        /// `Emoji=YES`, `Emoji_Component=NO`
        EmojiOther,
        /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`
        EmojiPresentationAndEmojiComponent,
        /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`, `Emoji_Modifier=YES`
        EmojiPresentationAndModifierAndEmojiComponent,
        /// `Emoji=YES`, `Emoji_Component=YES`
        EmojiOtherAndEmojiComponent,
    }
    #[inline]
    pub(crate) fn emoji_status(c: char) -> EmojiStatus {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
        }
    }
    #[inline]
    pub(crate) fn is_emoji_status_for_emoji_char_or_emoji_component(s: EmojiStatus) -> bool {
        !matches!(s, EmojiStatus::NonEmoji)
    }
    #[inline]
    pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
        !matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
    }
    #[inline]
    pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
        matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
            EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
            EmojiStatus::EmojiOtherAndEmojiComponent)
    }
""")

    f.write("    // Emoji status table:\n")
    emoji_status_table = load_emoji_properties("emoji/emoji-data.txt")
    # we combine things together here.

    # `Extended_Pictographic` is only for future proof usages, we ignore it here.
    emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]

    # need to skip surrogates because they're not representable by rust `char`s
    emoji_status_table["Surrogate"] = [(0xD800, 0xDFFF)]
    emoji_prop_list.append("Surrogate")

    emoji_table = []
    for group_first, group_last, props in partition_by_properties(
            emoji_status_table, emoji_prop_list, 0, 0x10FFFF):
        status = emoji_group_text(props)
        if status != "<Surrogate>":
            emoji_table.append((group_first, group_last, status))

    emit_table(f, "EMOJI_STATUS", emoji_table, "&'static [(char, char, EmojiStatus)]", is_pub=False,
               pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    f.write("}\n\n")


def emit_util_mod(f):
    """Write the shared `util` Rust module (range-table binary search) to `f`."""
    f.write("""
#[allow(dead_code)]
pub mod util {
    use core::result::Result::{Ok, Err};

    pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
        use core::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, cat) = r[idx];
                Some(cat)
            }
            Err(_) => None
        }
    }

}

""")


def main():
    """Regenerate tables.rs in the current directory."""
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w", encoding="utf-8") as rf:
        # write the file's preamble
        rf.write(preamble)

        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-properties is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % UNICODE_VERSION)

        emit_util_mod(rf)
        ### general category module
        emit_general_category_module(rf)
        ### emoji module
        emit_emoji_module(rf)


if __name__ == "__main__":
    main()