1 files changed, 542 insertions, 0 deletions
diff --git a/vendor/unicode-properties/scripts/unicode.py b/vendor/unicode-properties/scripts/unicode.py
new file mode 100644
index 000000000..6bb300b97
--- /dev/null
+++ b/vendor/unicode-properties/scripts/unicode.py
@@ -0,0 +1,542 @@
+#!/usr/bin/env python3
+#
+# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# This script uses the following Unicode UCD data:
+# - UnicodeData.txt
+# - emoji/emoji-data.txt
+# Since this should not require frequent updates, we just store this
+# out-of-line and check the tables.rs file into git.
+
+import fileinput, re, os, sys, operator
+
+preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
+
+#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
+'''
+# Unicode version (major, minor, update) the tables are generated from; also written into tables.rs.
+UNICODE_VERSION = (15, 0, 0)
+# Dotted "major.minor.update" form of UNICODE_VERSION, used in the UCD download URL.
+UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
+
+# Download UCD file `f` into the current directory unless a copy is already there; exit with status 1 on failure.
+def fetch_unidata(f):
+    local_name = os.path.basename(f)  # `curl -O` stores the file under its base name
+    if not os.path.exists(local_name):
+        os.system("curl -O https://www.unicode.org/Public/%s/ucd/%s"
+                  % (UNICODE_VERSION_NUMBER, f))
+    if not os.path.exists(local_name):
+        sys.stderr.write("cannot load %s\n" % f)  # newline-terminated so the message ends its own line
+        sys.exit(1)  # sys.exit rather than the `site`-provided exit(), which is missing under `python -S`
+
+# Loads code point data from emoji-data.txt
+# Adapted from unicode-segmentation
+#
+# Fetches `f` if needed, then parses data lines of the two forms
+#     1F600 ; Emoji          (single code point)
+#     1F601..1F64F ; Emoji   (inclusive range)
+# optionally followed by a `# comment`. Returns a dict mapping each
+# property name to a list of (lo, hi) integer tuples in file order; a
+# single code point c is stored as (c, c). Other lines are skipped.
+def load_emoji_properties(f):
+    fetch_unidata(f)
+    kinds = {}
+    # One pattern covers both forms: the `..hi` part is optional. No trailing
+    # `#` is required, so range lines are accepted on the same terms as
+    # single-code-point lines.
+    entry = re.compile(r"^ *([0-9A-F]+)(?:\.\.([0-9A-F]+))? *; *(\w+)")
+
+    # The file is read from the current directory under its base name;
+    # `with` closes it even if parsing raises.
+    with open(os.path.basename(f), encoding="utf-8") as data:
+        for line in data:
+            m = entry.match(line)
+            if not m:
+                continue
+            # hi_hex is None when the line names a single code point.
+            lo_hex, hi_hex, kind = m.groups()
+            d_lo = int(lo_hex, 16)
+            d_hi = int(hi_hex, 16) if hi_hex else d_lo
+            # setdefault creates the property's list on first use.
+            kinds.setdefault(kind, []).append((d_lo, d_hi))
+
+    return kinds
+
+
+def load_general_category_properties(f):
+    # Fetch `f` if needed and parse it as `code;name;general category;...`
+    # lines (the UnicodeData.txt layout). Returns a list of (lo, hi, gc)
+    # tuples in file order: lo and hi are inclusive integer code points, gc
+    # is the category abbreviation string (e.g. "Lu").
+    #
+    # Most lines describe one code point, giving (c, c, gc). A range may be
+    # written as two lines named `<X, First>` and `<X, Last>`; such a pair
+    # yields a single (first, last, gc) entry.
+    #
+    # Raises ValueError when a `Last` line does not match the preceding
+    # `First` line, or when a `<`-prefixed name has an unexpected shape.
+    fetch_unidata(f)
+    general_category_list = []
+    re1 = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
+    re2 = re.compile(r"^<(.*), First>$")
+    re3 = re.compile(r"^<(.*), Last>$")
+    re4 = re.compile(r"^<(.*)>$")
+
+    # State carried from a `First` line to its `Last` line.
+    special_group_lo = 0
+    special_group_text = ''
+    special_group_gc = ''
+    # `with` closes the file even when a malformed line raises.
+    with open(os.path.basename(f), encoding="utf-8") as data:
+        for line in data:
+            m = re1.match(line)
+            if not m:
+                continue  # not a data line
+            d_ch = int(m.group(1), 16)
+            d_name = m.group(2).strip()
+            d_gc = m.group(3).strip()
+
+            m2 = re2.match(d_name)
+            if m2:
+                special_group_lo = d_ch
+                special_group_text = m2.group(1)
+                special_group_gc = d_gc
+                continue
+            m3 = re3.match(d_name)
+            if m3:
+                # Checked explicitly: `assert` is stripped under `python -O`.
+                if (special_group_text, special_group_gc) != (m3.group(1), d_gc):
+                    raise ValueError("range end without matching start: %s" % line.strip())
+                general_category_list.append((special_group_lo, d_ch, d_gc))
+                continue
+            # Any other name is one code point, but a `<`-prefixed name must
+            # at least be of the form `<...>`.
+            if d_name.startswith('<') and not re4.match(d_name):
+                raise ValueError("unreachable")
+            general_category_list.append((d_ch, d_ch, d_gc))
+    return general_category_list
+
+def format_table_content(f, content, indent):
+    # Re-flow comma-separated `content` to `f` as lines indented by `indent` spaces, wrapping
+    # before a chunk that would reach column 98; the last line gets no trailing newline.
+    pad = " " * indent
+    line = None  # text of the line being built; None until the first chunk is placed
+    for chunk in content.split(","):
+        if line is None:
+            line = pad + chunk  # first chunk always stays: never emit a bare "," line
+        elif len(line) + len(chunk) < 98:
+            line += ", " + chunk
+        else:
+            f.write(line + ",\n")
+            line = pad + chunk
+    f.write(line)
+
+def escape_char(c):
+    # Render integer code point `c` as a Rust `char` literal with a lowercase-hex
+    # `\u{...}` escape; the marker string 'multi' maps to a placeholder string literal.
+    return "\"<multiple code points>\"" if c == 'multi' else "'\\u{%x}'" % c
+
+def escape_char_list(l):
+    # Render the items of `l` as a bracketed, comma-separated list of
+    # Rust `char` literals, e.g. [0x41, 0x42] -> ['\u{41}', '\u{42}'].
+    #
+    # Every item is formatted by escape_char, so whatever that function
+    # accepts (an integer code point or the marker 'multi') is valid here.
+    # An empty `l` yields "[]".
+    #
+    # join() puts ", " between items only, never before the first or after the last.
+    body = ", ".join(escape_char(c) for c in l)
+    return "[" + body + "]"
+
+def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
+        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
+    # Write one Rust table item to `f`: a declaration line, the entries of
+    # `t_data` rendered by `pfun` and re-flowed at an 8-space indent, then
+    # the closing "];" and a blank line.
+    #
+    # t_type   -- Rust type written after the name
+    # is_pub   -- prefix the declaration with "pub "
+    # is_const -- declare with "const" (True) or "let" (False)
+    # pfun     -- renders one entry; defaults to a (char, char) pair
+    keyword = "const" if is_const else "let"
+    visibility = "pub " if is_pub else ""
+    f.write("    %s %s: %s = &[\n" % (visibility + keyword, name, t_type))
+    # Entries are joined with bare commas; format_table_content re-splits on
+    # "," and inserts the spacing and line breaks.
+    format_table_content(f, ",".join(pfun(dat) for dat in t_data), 8)
+    f.write("\n    ];\n\n")
+
+def emit_general_category_module(f):
+    # Write the Rust `general_category` module to `f`: the hand-written enums
+    # and helper functions below, followed by the GENERAL_CATEGORY table
+    # built from UnicodeData.txt.
+    f.write("""#[cfg(feature = \"general-category\")]
+pub mod general_category {""")
+    f.write("""
+
+    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
+    /// The most general classification of a character.
+    pub enum GeneralCategory {
+        /// `Lu`, an uppercase letter
+        UppercaseLetter,
+        /// `Ll`, a lowercase letter
+        LowercaseLetter,
+        /// `Lt`, a digraphic character, with first part uppercase
+        TitlecaseLetter,
+        /// `Lm`, a modifier letter
+        ModifierLetter,
+        /// `Lo`, other letters, including syllables and ideographs
+        OtherLetter,
+        /// `Mn`, a nonspacing combining mark (zero advance width)
+        NonspacingMark,
+        /// `Mc`, a spacing combining mark (positive advance width)
+        SpacingMark,
+        /// `Me`, an enclosing combining mark
+        EnclosingMark,
+        /// `Nd`, a decimal digit
+        DecimalNumber,
+        /// `Nl`, a letterlike numeric character
+        LetterNumber,
+        /// `No`, a numeric character of other type
+        OtherNumber,
+        /// `Pc`, a connecting punctuation mark, like a tie
+        ConnectorPunctuation,
+        /// `Pd`, a dash or hyphen punctuation mark
+        DashPunctuation,
+        /// `Ps`, an opening punctuation mark (of a pair)
+        OpenPunctuation,
+        /// `Pe`, a closing punctuation mark (of a pair)
+        ClosePunctuation,
+        /// `Pi`, an initial quotation mark
+        InitialPunctuation,
+        /// `Pf`, a final quotation mark
+        FinalPunctuation,
+        /// `Po`, a punctuation mark of other type
+        OtherPunctuation,
+        /// `Sm`, a symbol of mathematical use
+        MathSymbol,
+        /// `Sc`, a currency sign
+        CurrencySymbol,
+        /// `Sk`, a non-letterlike modifier symbol
+        ModifierSymbol,
+        /// `So`, a symbol of other type
+        OtherSymbol,
+        /// `Zs`, a space character (of various non-zero widths)
+        SpaceSeparator,
+        /// `Zl`, U+2028 LINE SEPARATOR only
+        LineSeparator,
+        /// `Zp`, U+2029 PARAGRAPH SEPARATOR only
+        ParagraphSeparator,
+        /// `Cc`, a C0 or C1 control code
+        Control,
+        /// `Cf`, a format control character
+        Format,
+        /// `Cs`, a surrogate code point
+        Surrogate,
+        /// `Co`, a private-use character
+        PrivateUse,
+        /// `Cn`, a reserved unassigned code point or a noncharacter
+        Unassigned,
+    }
+
+    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
+    /// Groupings of the most general classification of a character.
+    pub enum GeneralCategoryGroup {
+        /// Lu | Ll | Lt | Lm | Lo
+        Letter,
+        /// Mn | Mc | Me
+        Mark,
+        /// Nd | Nl | No
+        Number,
+        /// Pc | Pd | Ps | Pe | Pi | Pf | Po
+        Punctuation,
+        /// Sm | Sc | Sk | So
+        Symbol,        
+        /// Zs | Zl | Zp
+        Separator,
+        /// Cc | Cf | Cs | Co | Cn
+        Other,
+    }
+
+    #[inline]
+    pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
+        match c as usize {
+            _ => super::util::bsearch_range_value_table(c, GENERAL_CATEGORY).unwrap_or(GeneralCategory::Unassigned)
+        }
+    }
+
+    #[inline]
+    pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
+        matches!(gc, GeneralCategory::UppercaseLetter | GeneralCategory::LowercaseLetter | GeneralCategory::TitlecaseLetter)
+    }
+
+    #[inline]
+    pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
+        match gc {
+            GeneralCategory::UppercaseLetter |
+            GeneralCategory::LowercaseLetter |
+            GeneralCategory::TitlecaseLetter |
+            GeneralCategory::ModifierLetter |
+            GeneralCategory::OtherLetter => GeneralCategoryGroup::Letter,
+            GeneralCategory::NonspacingMark |
+            GeneralCategory::SpacingMark |
+            GeneralCategory::EnclosingMark => GeneralCategoryGroup::Mark,
+            GeneralCategory::DecimalNumber |
+            GeneralCategory::LetterNumber |
+            GeneralCategory::OtherNumber => GeneralCategoryGroup::Number,
+            GeneralCategory::ConnectorPunctuation |
+            GeneralCategory::DashPunctuation |
+            GeneralCategory::OpenPunctuation |
+            GeneralCategory::ClosePunctuation |
+            GeneralCategory::InitialPunctuation |
+            GeneralCategory::FinalPunctuation |
+            GeneralCategory::OtherPunctuation => GeneralCategoryGroup::Punctuation,
+            GeneralCategory::MathSymbol |
+            GeneralCategory::CurrencySymbol |
+            GeneralCategory::ModifierSymbol |
+            GeneralCategory::OtherSymbol => GeneralCategoryGroup::Symbol,
+            GeneralCategory::SpaceSeparator |
+            GeneralCategory::LineSeparator |
+            GeneralCategory::ParagraphSeparator => GeneralCategoryGroup::Separator,
+            GeneralCategory::Control |
+            GeneralCategory::Format |
+            GeneralCategory::Surrogate |
+            GeneralCategory::PrivateUse |
+            GeneralCategory::Unassigned => GeneralCategoryGroup::Other,
+        }
+    }
+""")
+    # Category abbreviation as read from UnicodeData.txt -> Rust enum variant path.
+    gc_variants = {
+        "Lu": "GeneralCategory::UppercaseLetter",
+        "Ll": "GeneralCategory::LowercaseLetter",
+        "Lt": "GeneralCategory::TitlecaseLetter",
+        "Lm": "GeneralCategory::ModifierLetter",
+        "Lo": "GeneralCategory::OtherLetter",
+        "Mn": "GeneralCategory::NonspacingMark",
+        "Mc": "GeneralCategory::SpacingMark",
+        "Me": "GeneralCategory::EnclosingMark",
+        "Nd": "GeneralCategory::DecimalNumber",
+        "Nl": "GeneralCategory::LetterNumber",
+        "No": "GeneralCategory::OtherNumber",
+        "Pc": "GeneralCategory::ConnectorPunctuation",
+        "Pd": "GeneralCategory::DashPunctuation",
+        "Ps": "GeneralCategory::OpenPunctuation",
+        "Pe": "GeneralCategory::ClosePunctuation",
+        "Pi": "GeneralCategory::InitialPunctuation",
+        "Pf": "GeneralCategory::FinalPunctuation",
+        "Po": "GeneralCategory::OtherPunctuation",
+        "Sm": "GeneralCategory::MathSymbol",
+        "Sc": "GeneralCategory::CurrencySymbol",
+        "Sk": "GeneralCategory::ModifierSymbol",
+        "So": "GeneralCategory::OtherSymbol",
+        "Zs": "GeneralCategory::SpaceSeparator",
+        "Zl": "GeneralCategory::LineSeparator",
+        "Zp": "GeneralCategory::ParagraphSeparator",
+        "Cc": "GeneralCategory::Control",
+        "Cf": "GeneralCategory::Format",
+        "Cs": "GeneralCategory::Surrogate",
+        "Co": "GeneralCategory::PrivateUse",
+        "Cn": "GeneralCategory::Unassigned",
+    }
+    f.write("    // General category table:\n")
+    # Coalesce neighbouring entries: an entry whose category equals that of the
+    # last kept run and whose start directly follows that run's end extends it.
+    # `Cs` (surrogate) entries are left out of the table entirely.
+    runs = []
+    for lo, hi, gc in load_general_category_properties("UnicodeData.txt"):
+        if gc == "Cs":
+            continue
+        if runs and runs[-1][1] + 1 == lo and runs[-1][2] == gc:
+            runs[-1] = (runs[-1][0], hi, gc)
+        else:
+            runs.append((lo, hi, gc))
+    emit_table(f, "GENERAL_CATEGORY", runs, "&'static [(char, char, GeneralCategory)]", is_pub=False,
+            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), gc_variants[x[2]]))
+    f.write("}\n\n")
+
+# Write the Rust `emoji` module to `f`: the EmojiStatus enum, its helper functions, and the EMOJI_STATUS range table.
+def emit_emoji_module(f):
+    f.write("""#[cfg(feature = \"emoji\")]
+pub mod emoji {""")
+    f.write("""
+
+    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
+    #[non_exhaustive]
+    /// The emoji character properties of a character.
+    pub enum EmojiStatus {
+        /// `Emoji=NO`, `Emoji_Component=NO`
+        NonEmoji,
+        /// `Emoji=NO`, `Emoji_Component=YES`
+        NonEmojiButEmojiComponent,
+        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`
+        EmojiPresentation,
+        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Modifier_Base=YES`
+        EmojiModifierBase,
+        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`, `Emoji_Modifier_Base=YES`
+        EmojiPresentationAndModifierBase,
+        /// `Emoji=YES`, `Emoji_Component=NO`
+        EmojiOther,
+        /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`
+        EmojiPresentationAndEmojiComponent,
+        /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`, `Emoji_Modifier=YES`
+        EmojiPresentationAndModifierAndEmojiComponent,
+        /// `Emoji=YES`, `Emoji_Component=YES`
+        EmojiOtherAndEmojiComponent,
+    }
+    #[inline]
+    pub(crate) fn emoji_status(c: char) -> EmojiStatus {
+        // FIXME: do we want to special case ASCII here?
+        match c as usize {
+            _ => super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
+        }
+    }
+    #[inline]
+    pub(crate) fn is_emoji_status_for_emoji_char_or_emoji_component(s: EmojiStatus) -> bool {
+        !matches!(s, EmojiStatus::NonEmoji)
+    }
+    #[inline]
+    pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
+        !matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
+    }
+    #[inline]
+    pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
+        matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
+            EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
+            EmojiStatus::EmojiOtherAndEmojiComponent)
+    }
+""")
+    # Build EMOJI_STATUS: merge the per-property range lists from emoji-data.txt into one list of disjoint ranges.
+    f.write("    // Emoji status table:\n")
+    emoji_status_table = load_emoji_properties("emoji/emoji-data.txt")
+    # Each output range carries the combination of properties that applies to it; group_text() maps the
+    # ";"-joined property names (in emoji_prop_list order) to the text of an EmojiStatus variant.
+    # `Extended_Pictographic` is only there for future-proofing, so we ignore it here.
+    # emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component", "Extended_Pictographic"]
+    emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]
+
+    # Surrogates are tracked as a pseudo-property so their range can be left out of the table: they are not representable by Rust `char`s.
+    emoji_status_table["Surrogate"] = [(0xD800, 0xDFFF)]
+    emoji_prop_list.append("Surrogate")
+    # Sweep upward from U+0000: each loop step emits the longest range with a constant property set; emoji_prop_list_pos[i] is property i's first range not yet passed (assumes per-property ranges are sorted and disjoint -- TODO confirm).
+    emoji_prop_list_len = [len(emoji_status_table[x]) for x in emoji_prop_list]
+    emoji_prop_count = len(emoji_prop_list)
+    code_point_first = 0
+    code_point_last = 0x10FFFF
+    emoji_prop_list_pos = [0 for _ in emoji_prop_list]
+    cur_group_first = code_point_first
+    emoji_table = []
+    def group_text(s):
+        if s == "Surrogate":
+            return "<Surrogate>"
+        elif s == "":
+            return "EmojiStatus::NonEmoji"
+        elif s == "Emoji_Component":
+            return "EmojiStatus::NonEmojiButEmojiComponent"
+        elif s == "Emoji;Emoji_Presentation":
+            return "EmojiStatus::EmojiPresentation"
+        elif s == "Emoji;Emoji_Presentation;Emoji_Modifier_Base":
+            return "EmojiStatus::EmojiPresentationAndModifierBase"
+        elif s == "Emoji;Emoji_Modifier_Base":
+            return "EmojiStatus::EmojiModifierBase"
+        elif s == "Emoji":
+            return "EmojiStatus::EmojiOther"
+        elif s == "Emoji;Emoji_Presentation;Emoji_Component":
+            return "EmojiStatus::EmojiPresentationAndEmojiComponent"
+        elif s == "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component":
+            return "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent"
+        elif s == "Emoji;Emoji_Component":
+            return "EmojiStatus::EmojiOtherAndEmojiComponent"
+        else:
+            return "EmojiStatus::NewCombination(\"" + s + "\")"
+    while cur_group_first <= code_point_last:
+        cur_group_props = []
+        cur_group_last = code_point_last
+        for prop_list_idx in range(emoji_prop_count):
+            if emoji_prop_list_pos[prop_list_idx] >= emoji_prop_list_len[prop_list_idx]:
+                continue
+            elif emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][0] > cur_group_first:
+                cur_group_last = min(cur_group_last, emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][0] - 1)
+            else:
+                cur_group_props.append(emoji_prop_list[prop_list_idx])
+                cur_group_last = min(cur_group_last, emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][1])
+        cur_group_text = group_text(";".join(cur_group_props))
+        if cur_group_text != "<Surrogate>":
+            emoji_table.append((cur_group_first, cur_group_last, cur_group_text))
+        for prop_list_idx in range(emoji_prop_count):
+            if emoji_prop_list_pos[prop_list_idx] >= emoji_prop_list_len[prop_list_idx]:
+                continue
+            elif emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][0] > cur_group_first:
+                continue
+            else:
+                if cur_group_last == emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][1]:
+                    emoji_prop_list_pos[prop_list_idx] += 1
+        cur_group_first = cur_group_last + 1
+
+    emit_table(f, "EMOJI_STATUS", emoji_table, "&'static [(char, char, EmojiStatus)]", is_pub=False,
+            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
+    f.write("}\n\n")
+# Write the Rust `util` module to `f`: bsearch_range_value_table, the range lookup the generated modules call.
+def emit_util_mod(f):
+    f.write("""
+#[allow(dead_code)]
+pub mod util {
+    use core::result::Result::{Ok, Err};
+
+    pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
+        use core::cmp::Ordering::{Equal, Less, Greater};
+        match r.binary_search_by(|&(lo, hi, _)| {
+            if lo <= c && c <= hi { Equal }
+            else if hi < c { Less }
+            else { Greater }
+        }) {
+            Ok(idx) => {
+                let (_, _, cat) = r[idx];
+                Some(cat)
+            }
+            Err(_) => None
+        }
+    }
+
+}
+
+""")
+
+if __name__ == "__main__":
+    r = "tables.rs"  # generated in the current working directory
+    if os.path.exists(r):
+        os.remove(r)
+    with open(r, "w", encoding="utf-8") as rf:  # explicit encoding: output must not depend on the locale
+        # write the file's preamble
+        rf.write(preamble)
+        # version constant; its doc comment names this crate, unicode-properties
+        rf.write("""
+/// The version of [Unicode](http://www.unicode.org/)
+/// that this version of unicode-properties is based on.
+pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
+
+""" % UNICODE_VERSION)
+
+        emit_util_mod(rf)
+        ### general category module
+        emit_general_category_module(rf)
+        ### emoji module
+        emit_emoji_module(rf)