diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/regex-syntax/src | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-syntax/src')
32 files changed, 50564 insertions, 0 deletions
diff --git a/vendor/regex-syntax/src/ast/mod.rs b/vendor/regex-syntax/src/ast/mod.rs new file mode 100644 index 000000000..9b9127b1f --- /dev/null +++ b/vendor/regex-syntax/src/ast/mod.rs @@ -0,0 +1,1502 @@ +/*! +Defines an abstract syntax for regular expressions. +*/ + +use std::cmp::Ordering; +use std::error; +use std::fmt; + +pub use crate::ast::visitor::{visit, Visitor}; + +pub mod parse; +pub mod print; +mod visitor; + +/// An error that occurred while parsing a regular expression into an abstract +/// syntax tree. +/// +/// Note that note all ASTs represents a valid regular expression. For example, +/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a +/// valid Unicode property name. That particular error is reported when +/// translating an AST to the high-level intermediate representation (`HIR`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the parser generated the error from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } + + /// Return an auxiliary span. This span exists only for some errors that + /// benefit from being able to point to two locations in the original + /// regular expression. For example, "duplicate" errors will have the + /// main error position set to the duplicate occurrence while its + /// auxiliary span will be set to the initial occurrence. 
+ pub fn auxiliary_span(&self) -> Option<&Span> { + use self::ErrorKind::*; + match self.kind { + FlagDuplicate { ref original } => Some(original), + FlagRepeatedNegation { ref original, .. } => Some(original), + GroupNameDuplicate { ref original, .. } => Some(original), + _ => None, + } + } +} + +/// The type of an error that occurred while building an AST. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// The capturing group limit was exceeded. + /// + /// Note that this represents a limit on the total number of capturing + /// groups in a regex and not necessarily the number of nested capturing + /// groups. That is, the nest limit can be low and it is still possible for + /// this error to occur. + CaptureLimitExceeded, + /// An invalid escape sequence was found in a character class set. + ClassEscapeInvalid, + /// An invalid character class range was found. An invalid range is any + /// range where the start is greater than the end. + ClassRangeInvalid, + /// An invalid range boundary was found in a character class. Range + /// boundaries must be a single literal codepoint, but this error indicates + /// that something else was found, such as a nested class. + ClassRangeLiteral, + /// An opening `[` was found with no corresponding closing `]`. + ClassUnclosed, + /// Note that this error variant is no longer used. Namely, a decimal + /// number can only appear as a repetition quantifier. When the number + /// in a repetition quantifier is empty, then it gets its own specialized + /// error, `RepetitionCountDecimalEmpty`. + DecimalEmpty, + /// An invalid decimal number was given where one was expected. + DecimalInvalid, + /// A bracketed hex literal was empty. + EscapeHexEmpty, + /// A bracketed hex literal did not correspond to a Unicode scalar value. + EscapeHexInvalid, + /// An invalid hexadecimal digit was found. + EscapeHexInvalidDigit, + /// EOF was found before an escape sequence was completed. 
+ EscapeUnexpectedEof, + /// An unrecognized escape sequence. + EscapeUnrecognized, + /// A dangling negation was used when setting flags, e.g., `i-`. + FlagDanglingNegation, + /// A flag was used twice, e.g., `i-i`. + FlagDuplicate { + /// The position of the original flag. The error position + /// points to the duplicate flag. + original: Span, + }, + /// The negation operator was used twice, e.g., `-i-s`. + FlagRepeatedNegation { + /// The position of the original negation operator. The error position + /// points to the duplicate negation operator. + original: Span, + }, + /// Expected a flag but got EOF, e.g., `(?`. + FlagUnexpectedEof, + /// Unrecognized flag, e.g., `a`. + FlagUnrecognized, + /// A duplicate capture name was found. + GroupNameDuplicate { + /// The position of the initial occurrence of the capture name. The + /// error position itself points to the duplicate occurrence. + original: Span, + }, + /// A capture group name is empty, e.g., `(?P<>abc)`. + GroupNameEmpty, + /// An invalid character was seen for a capture group name. This includes + /// errors where the first character is a digit (even though subsequent + /// characters are allowed to be digits). + GroupNameInvalid, + /// A closing `>` could not be found for a capture group name. + GroupNameUnexpectedEof, + /// An unclosed group, e.g., `(ab`. + /// + /// The span of this error corresponds to the unclosed parenthesis. + GroupUnclosed, + /// An unopened group, e.g., `ab)`. + GroupUnopened, + /// The nest limit was exceeded. The limit stored here is the limit + /// configured in the parser. + NestLimitExceeded(u32), + /// The range provided in a counted repetition operator is invalid. The + /// range is invalid if the start is greater than the end. + RepetitionCountInvalid, + /// An opening `{` was not followed by a valid decimal value. + /// For example, `x{}` or `x{]}` would fail. + RepetitionCountDecimalEmpty, + /// An opening `{` was found with no corresponding closing `}`. 
+ RepetitionCountUnclosed, + /// A repetition operator was applied to a missing sub-expression. This + /// occurs, for example, in the regex consisting of just a `*` or even + /// `(?i)*`. It is, however, possible to create a repetition operating on + /// an empty sub-expression. For example, `()*` is still considered valid. + RepetitionMissing, + /// The Unicode class is not valid. This typically occurs when a `\p` is + /// followed by something other than a `{`. + UnicodeClassInvalid, + /// When octal support is disabled, this error is produced when an octal + /// escape is used. The octal escape is assumed to be an invocation of + /// a backreference, which is the common case. + UnsupportedBackreference, + /// When syntax similar to PCRE's look-around is used, this error is + /// returned. Some example syntaxes that are rejected include, but are + /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and + /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this + /// error is used to improve the user experience. + UnsupportedLookAround, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. 
+ #[allow(deprecated)] + fn description(&self) -> &str { + use self::ErrorKind::*; + match self.kind { + CaptureLimitExceeded => "capture group limit exceeded", + ClassEscapeInvalid => "invalid escape sequence in character class", + ClassRangeInvalid => "invalid character class range", + ClassRangeLiteral => "invalid range boundary, must be a literal", + ClassUnclosed => "unclosed character class", + DecimalEmpty => "empty decimal literal", + DecimalInvalid => "invalid decimal literal", + EscapeHexEmpty => "empty hexadecimal literal", + EscapeHexInvalid => "invalid hexadecimal literal", + EscapeHexInvalidDigit => "invalid hexadecimal digit", + EscapeUnexpectedEof => "unexpected eof (escape sequence)", + EscapeUnrecognized => "unrecognized escape sequence", + FlagDanglingNegation => "dangling flag negation operator", + FlagDuplicate { .. } => "duplicate flag", + FlagRepeatedNegation { .. } => "repeated negation", + FlagUnexpectedEof => "unexpected eof (flag)", + FlagUnrecognized => "unrecognized flag", + GroupNameDuplicate { .. 
} => "duplicate capture group name", + GroupNameEmpty => "empty capture group name", + GroupNameInvalid => "invalid capture group name", + GroupNameUnexpectedEof => "unclosed capture group name", + GroupUnclosed => "unclosed group", + GroupUnopened => "unopened group", + NestLimitExceeded(_) => "nest limit exceeded", + RepetitionCountInvalid => "invalid repetition count range", + RepetitionCountUnclosed => "unclosed counted repetition", + RepetitionMissing => "repetition operator missing expression", + UnicodeClassInvalid => "invalid Unicode character class", + UnsupportedBackreference => "backreferences are not supported", + UnsupportedLookAround => "look-around is not supported", + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::ErrorKind::*; + match *self { + CaptureLimitExceeded => write!( + f, + "exceeded the maximum number of \ + capturing groups ({})", + ::std::u32::MAX + ), + ClassEscapeInvalid => { + write!(f, "invalid escape sequence found in character class") + } + ClassRangeInvalid => write!( + f, + "invalid character class range, \ + the start must be <= the end" + ), + ClassRangeLiteral => { + write!(f, "invalid range boundary, must be a literal") + } + ClassUnclosed => write!(f, "unclosed character class"), + DecimalEmpty => write!(f, "decimal literal empty"), + DecimalInvalid => write!(f, "decimal literal invalid"), + EscapeHexEmpty => write!(f, "hexadecimal literal empty"), + EscapeHexInvalid => { + write!(f, "hexadecimal literal is not a Unicode scalar value") + } + EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"), + EscapeUnexpectedEof => write!( + f, + "incomplete escape sequence, \ + reached end of pattern prematurely" + ), + EscapeUnrecognized => write!(f, "unrecognized escape sequence"), + 
FlagDanglingNegation => { + write!(f, "dangling flag negation operator") + } + FlagDuplicate { .. } => write!(f, "duplicate flag"), + FlagRepeatedNegation { .. } => { + write!(f, "flag negation operator repeated") + } + FlagUnexpectedEof => { + write!(f, "expected flag but got end of regex") + } + FlagUnrecognized => write!(f, "unrecognized flag"), + GroupNameDuplicate { .. } => { + write!(f, "duplicate capture group name") + } + GroupNameEmpty => write!(f, "empty capture group name"), + GroupNameInvalid => write!(f, "invalid capture group character"), + GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), + GroupUnclosed => write!(f, "unclosed group"), + GroupUnopened => write!(f, "unopened group"), + NestLimitExceeded(limit) => write!( + f, + "exceed the maximum number of \ + nested parentheses/brackets ({})", + limit + ), + RepetitionCountInvalid => write!( + f, + "invalid repetition count range, \ + the start must be <= the end" + ), + RepetitionCountDecimalEmpty => { + write!(f, "repetition quantifier expects a valid decimal") + } + RepetitionCountUnclosed => { + write!(f, "unclosed counted repetition") + } + RepetitionMissing => { + write!(f, "repetition operator missing expression") + } + UnicodeClassInvalid => { + write!(f, "invalid Unicode character class") + } + UnsupportedBackreference => { + write!(f, "backreferences are not supported") + } + UnsupportedLookAround => write!( + f, + "look-around, including look-ahead and look-behind, \ + is not supported" + ), + _ => unreachable!(), + } + } +} + +/// Span represents the position information of a single AST item. +/// +/// All span positions are absolute byte offsets that can be used on the +/// original regular expression that was parsed. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Span { + /// The start byte offset. + pub start: Position, + /// The end byte offset. 
+ pub end: Position, +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Span({:?}, {:?})", self.start, self.end) + } +} + +impl Ord for Span { + fn cmp(&self, other: &Span) -> Ordering { + (&self.start, &self.end).cmp(&(&other.start, &other.end)) + } +} + +impl PartialOrd for Span { + fn partial_cmp(&self, other: &Span) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +/// A single position in a regular expression. +/// +/// A position encodes one half of a span, and include the byte offset, line +/// number and column number. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Position { + /// The absolute offset of this position, starting at `0` from the + /// beginning of the regular expression pattern string. + pub offset: usize, + /// The line number, starting at `1`. + pub line: usize, + /// The approximate column number, starting at `1`. + pub column: usize, +} + +impl fmt::Debug for Position { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Position(o: {:?}, l: {:?}, c: {:?})", + self.offset, self.line, self.column + ) + } +} + +impl Ord for Position { + fn cmp(&self, other: &Position) -> Ordering { + self.offset.cmp(&other.offset) + } +} + +impl PartialOrd for Position { + fn partial_cmp(&self, other: &Position) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl Span { + /// Create a new span with the given positions. + pub fn new(start: Position, end: Position) -> Span { + Span { start: start, end: end } + } + + /// Create a new span using the given position as the start and end. + pub fn splat(pos: Position) -> Span { + Span::new(pos, pos) + } + + /// Create a new span by replacing the starting the position with the one + /// given. + pub fn with_start(self, pos: Position) -> Span { + Span { start: pos, ..self } + } + + /// Create a new span by replacing the ending the position with the one + /// given. 
+ pub fn with_end(self, pos: Position) -> Span { + Span { end: pos, ..self } + } + + /// Returns true if and only if this span occurs on a single line. + pub fn is_one_line(&self) -> bool { + self.start.line == self.end.line + } + + /// Returns true if and only if this span is empty. That is, it points to + /// a single position in the concrete syntax of a regular expression. + pub fn is_empty(&self) -> bool { + self.start.offset == self.end.offset + } +} + +impl Position { + /// Create a new position with the given information. + /// + /// `offset` is the absolute offset of the position, starting at `0` from + /// the beginning of the regular expression pattern string. + /// + /// `line` is the line number, starting at `1`. + /// + /// `column` is the approximate column number, starting at `1`. + pub fn new(offset: usize, line: usize, column: usize) -> Position { + Position { offset: offset, line: line, column: column } + } +} + +/// An abstract syntax tree for a singular expression along with comments +/// found. +/// +/// Comments are not stored in the tree itself to avoid complexity. Each +/// comment contains a span of precisely where it occurred in the original +/// regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct WithComments { + /// The actual ast. + pub ast: Ast, + /// All comments found in the original regular expression. + pub comments: Vec<Comment>, +} + +/// A comment from a regular expression with an associated span. +/// +/// A regular expression can only contain comments when the `x` flag is +/// enabled. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Comment { + /// The span of this comment, including the beginning `#` and ending `\n`. + pub span: Span, + /// The comment text, starting with the first character following the `#` + /// and ending with the last character preceding the `\n`. + pub comment: String, +} + +/// An abstract syntax tree for a single regular expression. 
+/// +/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap +/// space proportional to the size of the `Ast`. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the `Ast`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Ast { + /// An empty regex that matches everything. + Empty(Span), + /// A set of flags, e.g., `(?is)`. + Flags(SetFlags), + /// A single character literal, which includes escape sequences. + Literal(Literal), + /// The "any character" class. + Dot(Span), + /// A single zero-width assertion. + Assertion(Assertion), + /// A single character class. This includes all forms of character classes + /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. + Class(Class), + /// A repetition operator applied to an arbitrary regular expression. + Repetition(Repetition), + /// A grouped regular expression. + Group(Group), + /// An alternation of regular expressions. + Alternation(Alternation), + /// A concatenation of regular expressions. + Concat(Concat), +} + +impl Ast { + /// Return the span of this abstract syntax tree. + pub fn span(&self) -> &Span { + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::Class(ref x) => x.span(), + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + } + } + + /// Return true if and only if this Ast is empty. + pub fn is_empty(&self) -> bool { + match *self { + Ast::Empty(_) => true, + _ => false, + } + } + + /// Returns true if and only if this AST has any (including possibly empty) + /// subexpressions. 
+ fn has_subexprs(&self) -> bool { + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) => false, + Ast::Class(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, + } + } +} + +/// Print a display representation of this Ast. +/// +/// This does not preserve any of the original whitespace formatting that may +/// have originally been present in the concrete syntax from which this Ast +/// was generated. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Ast`. +impl fmt::Display for Ast { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::ast::print::Printer; + Printer::new().print(self, f) + } +} + +/// An alternation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Alternation { + /// The span of this alternation. + pub span: Span, + /// The alternate regular expressions. + pub asts: Vec<Ast>, +} + +impl Alternation { + /// Return this alternation as an AST. + /// + /// If this alternation contains zero ASTs, then Ast::Empty is + /// returned. If this alternation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Alternation(self), + } + } +} + +/// A concatenation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Concat { + /// The span of this concatenation. + pub span: Span, + /// The concatenation regular expressions. + pub asts: Vec<Ast>, +} + +impl Concat { + /// Return this concatenation as an AST. + /// + /// If this concatenation contains zero ASTs, then Ast::Empty is + /// returned. If this concatenation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Concat is returned. 
+ pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Concat(self), + } + } +} + +/// A single literal expression. +/// +/// A literal corresponds to a single Unicode scalar value. Literals may be +/// represented in their literal form, e.g., `a` or in their escaped form, +/// e.g., `\x61`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Literal { + /// The span of this literal. + pub span: Span, + /// The kind of this literal. + pub kind: LiteralKind, + /// The Unicode scalar value corresponding to this literal. + pub c: char, +} + +impl Literal { + /// If this literal was written as a `\x` hex escape, then this returns + /// the corresponding byte value. Otherwise, this returns `None`. + pub fn byte(&self) -> Option<u8> { + let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); + if self.c as u32 <= 255 && self.kind == short_hex { + Some(self.c as u8) + } else { + None + } + } +} + +/// The kind of a single literal expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum LiteralKind { + /// The literal is written verbatim, e.g., `a` or `â`. + Verbatim, + /// The literal is written as an escape because it is punctuation, e.g., + /// `\*` or `\[`. + Punctuation, + /// The literal is written as an octal escape, e.g., `\141`. + Octal, + /// The literal is written as a hex code with a fixed number of digits + /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or + /// `\U00000061`. + HexFixed(HexLiteralKind), + /// The literal is written as a hex code with a bracketed number of + /// digits. The only restriction is that the bracketed hex code must refer + /// to a valid Unicode scalar value. + HexBrace(HexLiteralKind), + /// The literal is written as a specially recognized escape, e.g., `\f` + /// or `\n`. + Special(SpecialLiteralKind), +} + +/// The type of a special literal. 
+/// +/// A special literal is a special escape sequence recognized by the regex +/// parser, e.g., `\f` or `\n`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SpecialLiteralKind { + /// Bell, spelled `\a` (`\x07`). + Bell, + /// Form feed, spelled `\f` (`\x0C`). + FormFeed, + /// Tab, spelled `\t` (`\x09`). + Tab, + /// Line feed, spelled `\n` (`\x0A`). + LineFeed, + /// Carriage return, spelled `\r` (`\x0D`). + CarriageReturn, + /// Vertical tab, spelled `\v` (`\x0B`). + VerticalTab, + /// Space, spelled `\ ` (`\x20`). Note that this can only appear when + /// parsing in verbose mode. + Space, +} + +/// The type of a Unicode hex literal. +/// +/// Note that all variants behave the same when used with brackets. They only +/// differ when used without brackets in the number of hex digits that must +/// follow. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HexLiteralKind { + /// A `\x` prefix. When used without brackets, this form is limited to + /// two digits. + X, + /// A `\u` prefix. When used without brackets, this form is limited to + /// four digits. + UnicodeShort, + /// A `\U` prefix. When used without brackets, this form is limited to + /// eight digits. + UnicodeLong, +} + +impl HexLiteralKind { + /// The number of digits that must be used with this literal form when + /// used without brackets. When used with brackets, there is no + /// restriction on the number of digits. + pub fn digits(&self) -> u32 { + match *self { + HexLiteralKind::X => 2, + HexLiteralKind::UnicodeShort => 4, + HexLiteralKind::UnicodeLong => 8, + } + } +} + +/// A single character class expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. 
+ Bracketed(ClassBracketed), +} + +impl Class { + /// Return the span of this character class. + pub fn span(&self) -> &Span { + match *self { + Class::Perl(ref x) => &x.span, + Class::Unicode(ref x) => &x.span, + Class::Bracketed(ref x) => &x.span, + } + } +} + +/// A Perl character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassPerl { + /// The span of this class. + pub span: Span, + /// The kind of Perl class. + pub kind: ClassPerlKind, + /// Whether the class is negated or not. e.g., `\d` is not negated but + /// `\D` is. + pub negated: bool, +} + +/// The available Perl character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassPerlKind { + /// Decimal numbers. + Digit, + /// Whitespace. + Space, + /// Word characters. + Word, +} + +/// An ASCII character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassAscii { + /// The span of this class. + pub span: Span, + /// The kind of ASCII class. + pub kind: ClassAsciiKind, + /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated + /// but `[[:^alpha:]]` is. + pub negated: bool, +} + +/// The available ASCII character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassAsciiKind { + /// `[0-9A-Za-z]` + Alnum, + /// `[A-Za-z]` + Alpha, + /// `[\x00-\x7F]` + Ascii, + /// `[ \t]` + Blank, + /// `[\x00-\x1F\x7F]` + Cntrl, + /// `[0-9]` + Digit, + /// `[!-~]` + Graph, + /// `[a-z]` + Lower, + /// `[ -~]` + Print, + /// `[!-/:-@\[-`{-~]` + Punct, + /// `[\t\n\v\f\r ]` + Space, + /// `[A-Z]` + Upper, + /// `[0-9A-Za-z_]` + Word, + /// `[0-9A-Fa-f]` + Xdigit, +} + +impl ClassAsciiKind { + /// Return the corresponding ClassAsciiKind variant for the given name. + /// + /// The name given should correspond to the lowercase version of the + /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. + /// + /// If no variant with the corresponding name exists, then `None` is + /// returned. 
+ pub fn from_name(name: &str) -> Option<ClassAsciiKind> { + use self::ClassAsciiKind::*; + match name { + "alnum" => Some(Alnum), + "alpha" => Some(Alpha), + "ascii" => Some(Ascii), + "blank" => Some(Blank), + "cntrl" => Some(Cntrl), + "digit" => Some(Digit), + "graph" => Some(Graph), + "lower" => Some(Lower), + "print" => Some(Print), + "punct" => Some(Punct), + "space" => Some(Space), + "upper" => Some(Upper), + "word" => Some(Word), + "xdigit" => Some(Xdigit), + _ => None, + } + } +} + +/// A Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. + /// + /// Note: be careful when using this attribute. This specifically refers + /// to whether the class is written as `\p` or `\P`, where the latter + /// is `negated = true`. However, it also possible to write something like + /// `\P{scx!=Katakana}` which is actually equivalent to + /// `\p{scx=Katakana}` and is therefore not actually negated even though + /// `negated = true` here. To test whether this class is truly negated + /// or not, use the `is_negated` method. + pub negated: bool, + /// The kind of Unicode class. + pub kind: ClassUnicodeKind, +} + +impl ClassUnicode { + /// Returns true if this class has been negated. + /// + /// Note that this takes the Unicode op into account, if it's present. + /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. + pub fn is_negated(&self) -> bool { + match self.kind { + ClassUnicodeKind::NamedValue { + op: ClassUnicodeOpKind::NotEqual, + .. + } => !self.negated, + _ => self.negated, + } + } +} + +/// The available forms of Unicode character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeKind { + /// A one letter abbreviated class, e.g., `\pN`. + OneLetter(char), + /// A binary property, general category or script. The string may be + /// empty. 
+ Named(String), + /// A property name and an associated value. + NamedValue { + /// The type of Unicode op used to associate `name` with `value`. + op: ClassUnicodeOpKind, + /// The property name (which may be empty). + name: String, + /// The property value (which may be empty). + value: String, + }, +} + +/// The type of op used in a Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeOpKind { + /// A property set to a specific value, e.g., `\p{scx=Katakana}`. + Equal, + /// A property set to a specific value using a colon, e.g., + /// `\p{scx:Katakana}`. + Colon, + /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. + NotEqual, +} + +impl ClassUnicodeOpKind { + /// Whether the op is an equality op or not. + pub fn is_equal(&self) -> bool { + match *self { + ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true, + _ => false, + } + } +} + +/// A bracketed character class, e.g., `[a-z0-9]`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBracketed { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. e.g., `[a]` is not negated but + /// `[^a]` is. + pub negated: bool, + /// The type of this set. A set is either a normal union of things, e.g., + /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. + pub kind: ClassSet, +} + +/// A character class set. +/// +/// This type corresponds to the internal structure of a bracketed character +/// class. That is, every bracketed character is one of two types: a union of +/// items (literals, ranges, other bracketed classes) or a tree of binary set +/// operations. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSet { + /// An item, which can be a single literal, range, nested character class + /// or a union of items. + Item(ClassSetItem), + /// A single binary operation (i.e., &&, -- or ~~). + BinaryOp(ClassSetBinaryOp), +} + +impl ClassSet { + /// Build a set from a union. 
+ pub fn union(ast: ClassSetUnion) -> ClassSet { + ClassSet::Item(ClassSetItem::Union(ast)) + } + + /// Return the span of this character class set. + pub fn span(&self) -> &Span { + match *self { + ClassSet::Item(ref x) => x.span(), + ClassSet::BinaryOp(ref x) => &x.span, + } + } + + /// Return true if and only if this class set is empty. + fn is_empty(&self) -> bool { + match *self { + ClassSet::Item(ClassSetItem::Empty(_)) => true, + _ => false, + } + } +} + +/// A single component of a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSetItem { + /// An empty item. + /// + /// Note that a bracketed character class cannot contain a single empty + /// item. Empty items can appear when using one of the binary operators. + /// For example, `[&&]` is the intersection of two empty classes. + Empty(Span), + /// A single literal. + Literal(Literal), + /// A range between two literals. + Range(ClassSetRange), + /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. + Ascii(ClassAscii), + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(Box<ClassBracketed>), + /// A union of items. + Union(ClassSetUnion), +} + +impl ClassSetItem { + /// Return the span of this character class set item. + pub fn span(&self) -> &Span { + match *self { + ClassSetItem::Empty(ref span) => span, + ClassSetItem::Literal(ref x) => &x.span, + ClassSetItem::Range(ref x) => &x.span, + ClassSetItem::Ascii(ref x) => &x.span, + ClassSetItem::Perl(ref x) => &x.span, + ClassSetItem::Unicode(ref x) => &x.span, + ClassSetItem::Bracketed(ref x) => &x.span, + ClassSetItem::Union(ref x) => &x.span, + } + } +} + +/// A single character class range in a set. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetRange { + /// The span of this range. + pub span: Span, + /// The start of this range. + pub start: Literal, + /// The end of this range. + pub end: Literal, +} + +impl ClassSetRange { + /// Returns true if and only if this character class range is valid. + /// + /// The only case where a range is invalid is if its start is greater than + /// its end. + pub fn is_valid(&self) -> bool { + self.start.c <= self.end.c + } +} + +/// A union of items inside a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetUnion { + /// The span of the items in this operation. e.g., the `a-z0-9` in + /// `[^a-z0-9]` + pub span: Span, + /// The sequence of items that make up this union. + pub items: Vec<ClassSetItem>, +} + +impl ClassSetUnion { + /// Push a new item in this union. + /// + /// The ending position of this union's span is updated to the ending + /// position of the span of the item given. If the union is empty, then + /// the starting position of this union is set to the starting position + /// of this item. + /// + /// In other words, if you only use this method to add items to a union + /// and you set the spans on each item correctly, then you should never + /// need to adjust the span of the union directly. + pub fn push(&mut self, item: ClassSetItem) { + if self.items.is_empty() { + self.span.start = item.span().start; + } + self.span.end = item.span().end; + self.items.push(item); + } + + /// Return this union as a character class set item. + /// + /// If this union contains zero items, then an empty union is + /// returned. If this concatenation contains exactly 1 item, then the + /// corresponding item is returned. Otherwise, ClassSetItem::Union is + /// returned. 
+ pub fn into_item(mut self) -> ClassSetItem { + match self.items.len() { + 0 => ClassSetItem::Empty(self.span), + 1 => self.items.pop().unwrap(), + _ => ClassSetItem::Union(self), + } + } +} + +/// A Unicode character class set operation. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetBinaryOp { + /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. + pub span: Span, + /// The type of this set operation. + pub kind: ClassSetBinaryOpKind, + /// The left hand side of the operation. + pub lhs: Box<ClassSet>, + /// The right hand side of the operation. + pub rhs: Box<ClassSet>, +} + +/// The type of a Unicode character class set operation. +/// +/// Note that this doesn't explicitly represent union since there is no +/// explicit union operator. Concatenation inside a character class corresponds +/// to the union operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ClassSetBinaryOpKind { + /// The intersection of two sets, e.g., `\pN&&[a-z]`. + Intersection, + /// The difference of two sets, e.g., `\pN--[0-9]`. + Difference, + /// The symmetric difference of two sets. The symmetric difference is the + /// set of elements belonging to one but not both sets. + /// e.g., `[\pL~~[:ascii:]]`. + SymmetricDifference, +} + +/// A single zero-width assertion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Assertion { + /// The span of this assertion. + pub span: Span, + /// The assertion kind, e.g., `\b` or `^`. + pub kind: AssertionKind, +} + +/// An assertion kind. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum AssertionKind { + /// `^` + StartLine, + /// `$` + EndLine, + /// `\A` + StartText, + /// `\z` + EndText, + /// `\b` + WordBoundary, + /// `\B` + NotWordBoundary, +} + +/// A repetition operation applied to a regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The span of this operation. + pub span: Span, + /// The actual operation. 
/// The counted form of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionRange {
    /// `{m}`
    Exactly(u32),
    /// `{m,}`
    AtLeast(u32),
    /// `{m,n}`
    Bounded(u32, u32),
}

impl RepetitionRange {
    /// Returns true if and only if this repetition range is valid.
    ///
    /// Only a bounded range can be invalid, and only when its lower bound
    /// exceeds its upper bound.
    pub fn is_valid(&self) -> bool {
        match *self {
            RepetitionRange::Bounded(min, max) => min <= max,
            RepetitionRange::Exactly(_) | RepetitionRange::AtLeast(_) => true,
        }
    }
}
+ pub fn flags(&self) -> Option<&Flags> { + match self.kind { + GroupKind::NonCapturing(ref flags) => Some(flags), + _ => None, + } + } + + /// Returns true if and only if this group is capturing. + pub fn is_capturing(&self) -> bool { + match self.kind { + GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::NonCapturing(_) => false, + } + } + + /// Returns the capture index of this group, if this is a capturing group. + /// + /// This returns a capture index precisely when `is_capturing` is `true`. + pub fn capture_index(&self) -> Option<u32> { + match self.kind { + GroupKind::CaptureIndex(i) => Some(i), + GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::NonCapturing(_) => None, + } + } +} + +/// The kind of a group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// `(a)` + CaptureIndex(u32), + /// `(?P<name>a)` + CaptureName(CaptureName), + /// `(?:a)` and `(?i:a)` + NonCapturing(Flags), +} + +/// A capture name. +/// +/// This corresponds to the name itself between the angle brackets in, e.g., +/// `(?P<foo>expr)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CaptureName { + /// The span of this capture name. + pub span: Span, + /// The capture name. + pub name: String, + /// The capture index. + pub index: u32, +} + +/// A group of flags that is not applied to a particular regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SetFlags { + /// The span of these flags, including the grouping parentheses. + pub span: Span, + /// The actual sequence of flags. + pub flags: Flags, +} + +/// A group of flags. +/// +/// This corresponds only to the sequence of flags themselves, e.g., `is-u`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Flags { + /// The span of this group of flags. + pub span: Span, + /// A sequence of flag items. Each item is either a flag or a negation + /// operator. 
+ pub items: Vec<FlagsItem>, +} + +impl Flags { + /// Add the given item to this sequence of flags. + /// + /// If the item was added successfully, then `None` is returned. If the + /// given item is a duplicate, then `Some(i)` is returned, where + /// `items[i].kind == item.kind`. + pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> { + for (i, x) in self.items.iter().enumerate() { + if x.kind == item.kind { + return Some(i); + } + } + self.items.push(item); + None + } + + /// Returns the state of the given flag in this set. + /// + /// If the given flag is in the set but is negated, then `Some(false)` is + /// returned. + /// + /// If the given flag is in the set and is not negated, then `Some(true)` + /// is returned. + /// + /// Otherwise, `None` is returned. + pub fn flag_state(&self, flag: Flag) -> Option<bool> { + let mut negated = false; + for x in &self.items { + match x.kind { + FlagsItemKind::Negation => { + negated = true; + } + FlagsItemKind::Flag(ref xflag) if xflag == &flag => { + return Some(!negated); + } + _ => {} + } + } + None + } +} + +/// A single item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct FlagsItem { + /// The span of this item. + pub span: Span, + /// The kind of this item. + pub kind: FlagsItemKind, +} + +/// The kind of an item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum FlagsItemKind { + /// A negation operator applied to all subsequent flags in the enclosing + /// group. + Negation, + /// A single flag in a group. + Flag(Flag), +} + +impl FlagsItemKind { + /// Returns true if and only if this item is a negation operator. + pub fn is_negation(&self) -> bool { + match *self { + FlagsItemKind::Negation => true, + _ => false, + } + } +} + +/// A single flag. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Flag { + /// `i` + CaseInsensitive, + /// `m` + MultiLine, + /// `s` + DotMatchesNewLine, + /// `U` + SwapGreed, + /// `u` + Unicode, + /// `x` + IgnoreWhitespace, +} + +/// A custom `Drop` impl is used for `Ast` such that it uses constant stack +/// space but heap space proportional to the depth of the `Ast`. +impl Drop for Ast { + fn drop(&mut self) { + use std::mem; + + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. + | Ast::Class(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, + _ => {} + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_ast = || Ast::Empty(empty_span()); + let mut stack = vec![mem::replace(self, empty_ast())]; + while let Some(mut ast) = stack.pop() { + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. + | Ast::Class(_) => {} + Ast::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Group(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Alternation(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + Ast::Concat(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + } + } + } +} + +/// A custom `Drop` impl is used for `ClassSet` such that it uses constant +/// stack space but heap space proportional to the depth of the `ClassSet`. 
+impl Drop for ClassSet { + fn drop(&mut self) { + use std::mem; + + match *self { + ClassSet::Item(ref item) => match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => return, + ClassSetItem::Bracketed(ref x) => { + if x.kind.is_empty() { + return; + } + } + ClassSetItem::Union(ref x) => { + if x.items.is_empty() { + return; + } + } + }, + ClassSet::BinaryOp(ref op) => { + if op.lhs.is_empty() && op.rhs.is_empty() { + return; + } + } + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); + let mut stack = vec![mem::replace(self, empty_set())]; + while let Some(mut set) = stack.pop() { + match set { + ClassSet::Item(ref mut item) => match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => {} + ClassSetItem::Bracketed(ref mut x) => { + stack.push(mem::replace(&mut x.kind, empty_set())); + } + ClassSetItem::Union(ref mut x) => { + stack.extend(x.items.drain(..).map(ClassSet::Item)); + } + }, + ClassSet::BinaryOp(ref mut op) => { + stack.push(mem::replace(&mut op.lhs, empty_set())); + stack.push(mem::replace(&mut op.rhs, empty_set())); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We use a thread with an explicit stack size to test that our destructor + // for Ast can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. 
+ #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let span = || Span::splat(Position::new(0, 0, 0)); + let mut ast = Ast::Empty(span()); + for i in 0..200 { + ast = Ast::Group(Group { + span: span(), + kind: GroupKind::CaptureIndex(i), + ast: Box::new(ast), + }); + } + assert!(!ast.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + thread::Builder::new() + .stack_size(1 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs new file mode 100644 index 000000000..9824661c9 --- /dev/null +++ b/vendor/regex-syntax/src/ast/parse.rs @@ -0,0 +1,5944 @@ +/*! +This module provides a regular expression parser. +*/ + +use std::borrow::Borrow; +use std::cell::{Cell, RefCell}; +use std::mem; +use std::result; + +use crate::ast::{self, Ast, Position, Span}; +use crate::either::Either; + +use crate::is_meta_character; + +type Result<T> = result::Result<T, ast::Error>; + +/// A primitive is an expression with no sub-expressions. This includes +/// literals, assertions and non-set character classes. This representation +/// is used as intermediate state in the parser. +/// +/// This does not include ASCII character classes, since they can only appear +/// within a set character class. +#[derive(Clone, Debug, Eq, PartialEq)] +enum Primitive { + Literal(ast::Literal), + Assertion(ast::Assertion), + Dot(Span), + Perl(ast::ClassPerl), + Unicode(ast::ClassUnicode), +} + +impl Primitive { + /// Return the span of this primitive. + fn span(&self) -> &Span { + match *self { + Primitive::Literal(ref x) => &x.span, + Primitive::Assertion(ref x) => &x.span, + Primitive::Dot(ref span) => span, + Primitive::Perl(ref x) => &x.span, + Primitive::Unicode(ref x) => &x.span, + } + } + + /// Convert this primitive into a proper AST. 
/// Returns true if the given character is an ASCII hexadecimal digit, i.e.,
/// one of `0-9`, `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}

/// Returns true if the given character is valid in a capture group name.
///
/// ASCII letters and `_` are valid in every position. ASCII digits, `.`,
/// `[` and `]` are valid everywhere except the first position.
///
/// If `first` is true, then `c` is treated as the first character in the
/// group name (which must be alphabetic or underscore).
fn is_capture_char(c: char, first: bool) -> bool {
    c == '_'
        || c.is_ascii_alphabetic()
        || (!first
            && (c.is_ascii_digit() || c == '.' || c == '[' || c == ']'))
}
Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.octal = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. 
Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ignore_whitespace = yes; + self + } +} + +/// A regular expression parser. +/// +/// This parses a string representation of a regular expression into an +/// abstract syntax tree. The size of the tree is proportional to the length +/// of the regular expression pattern. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + /// The current position of the parser. + pos: Cell<Position>, + /// The current capture index. + capture_index: Cell<u32>, + /// The maximum number of open parens/brackets allowed. If the parser + /// exceeds this number, then an error is returned. + nest_limit: u32, + /// Whether to support octal syntax or not. When `false`, the parser will + /// return an error helpfully pointing out that backreferences are not + /// supported. + octal: bool, + /// The initial setting for `ignore_whitespace` as provided by + /// `ParserBuilder`. It is used when resetting the parser's state. + initial_ignore_whitespace: bool, + /// Whether whitespace should be ignored. When enabled, comments are + /// also permitted. + ignore_whitespace: Cell<bool>, + /// A list of comments, in order of appearance. + comments: RefCell<Vec<ast::Comment>>, + /// A stack of grouped sub-expressions, including alternations. + stack_group: RefCell<Vec<GroupState>>, + /// A stack of nested character classes. This is only non-empty when + /// parsing a class. + stack_class: RefCell<Vec<ClassState>>, + /// A sorted sequence of capture names. This is used to detect duplicate + /// capture names and report an error if one is detected. 
+ capture_names: RefCell<Vec<ast::CaptureName>>, + /// A scratch buffer used in various places. Mostly this is used to + /// accumulate relevant characters from parts of a pattern. + scratch: RefCell<String>, +} + +/// ParserI is the internal parser implementation. +/// +/// We use this separate type so that we can carry the provided pattern string +/// along with us. In particular, a `Parser` internal state is not tied to any +/// one pattern, but `ParserI` is. +/// +/// This type also lets us use `ParserI<&Parser>` in production code while +/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes +/// work against the internal interface of the parser. +#[derive(Clone, Debug)] +struct ParserI<'s, P> { + /// The parser state/configuration. + parser: P, + /// The full regular expression provided by the user. + pattern: &'s str, +} + +/// GroupState represents a single stack frame while parsing nested groups +/// and alternations. Each frame records the state up to an opening parenthesis +/// or a alternating bracket `|`. +#[derive(Clone, Debug)] +enum GroupState { + /// This state is pushed whenever an opening group is found. + Group { + /// The concatenation immediately preceding the opening group. + concat: ast::Concat, + /// The group that has been opened. Its sub-AST is always empty. + group: ast::Group, + /// Whether this group has the `x` flag enabled or not. + ignore_whitespace: bool, + }, + /// This state is pushed whenever a new alternation branch is found. If + /// an alternation branch is found and this state is at the top of the + /// stack, then this state should be modified to include the new + /// alternation. + Alternation(ast::Alternation), +} + +/// ClassState represents a single stack frame while parsing character classes. +/// Each frame records the state up to an intersection, difference, symmetric +/// difference or nested class. 
+/// +/// Note that a parser's character class stack is only non-empty when parsing +/// a character class. In all other cases, it is empty. +#[derive(Clone, Debug)] +enum ClassState { + /// This state is pushed whenever an opening bracket is found. + Open { + /// The union of class items immediately preceding this class. + union: ast::ClassSetUnion, + /// The class that has been opened. Typically this just corresponds + /// to the `[`, but it can also include `[^` since `^` indicates + /// negation of the class. + set: ast::ClassBracketed, + }, + /// This state is pushed when a operator is seen. When popped, the stored + /// set becomes the left hand side of the operator. + Op { + /// The type of the operation, i.e., &&, -- or ~~. + kind: ast::ClassSetBinaryOpKind, + /// The left-hand side of the operator. + lhs: ast::ClassSet, + }, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with either the `parse` or `parse_with_comments` + /// methods. The parse methods return an abstract syntax tree. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into an abstract syntax tree. + pub fn parse(&mut self, pattern: &str) -> Result<Ast> { + ParserI::new(self, pattern).parse() + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + pub fn parse_with_comments( + &mut self, + pattern: &str, + ) -> Result<ast::WithComments> { + ParserI::new(self, pattern).parse_with_comments() + } + + /// Reset the internal state of a parser. + /// + /// This is called at the beginning of every parse. This prevents the + /// parser from running with inconsistent state (say, if a previous + /// invocation returned an error and the parser is reused). 
+ fn reset(&self) { + // These settings should be in line with the construction + // in `ParserBuilder::build`. + self.pos.set(Position { offset: 0, line: 1, column: 1 }); + self.ignore_whitespace.set(self.initial_ignore_whitespace); + self.comments.borrow_mut().clear(); + self.stack_group.borrow_mut().clear(); + self.stack_class.borrow_mut().clear(); + } +} + +impl<'s, P: Borrow<Parser>> ParserI<'s, P> { + /// Build an internal parser from a parser configuration and a pattern. + fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { + ParserI { parser: parser, pattern: pattern } + } + + /// Return a reference to the parser state. + fn parser(&self) -> &Parser { + self.parser.borrow() + } + + /// Return a reference to the pattern being parsed. + fn pattern(&self) -> &str { + self.pattern.borrow() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { + ast::Error { + kind: kind, + pattern: self.pattern().to_string(), + span: span, + } + } + + /// Return the current offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn offset(&self) -> usize { + self.parser().pos.get().offset + } + + /// Return the current line number of the parser. + /// + /// The line number starts at `1`. + fn line(&self) -> usize { + self.parser().pos.get().line + } + + /// Return the current column of the parser. + /// + /// The column number starts at `1` and is reset whenever a `\n` is seen. + fn column(&self) -> usize { + self.parser().pos.get().column + } + + /// Return the next capturing index. Each subsequent call increments the + /// internal index. + /// + /// The span given should correspond to the location of the opening + /// parenthesis. + /// + /// If the capture limit is exceeded, then an error is returned. 
+ fn next_capture_index(&self, span: Span) -> Result<u32> { + let current = self.parser().capture_index.get(); + let i = current.checked_add(1).ok_or_else(|| { + self.error(span, ast::ErrorKind::CaptureLimitExceeded) + })?; + self.parser().capture_index.set(i); + Ok(i) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { + let mut names = self.parser().capture_names.borrow_mut(); + match names + .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) + { + Err(i) => { + names.insert(i, cap.clone()); + Ok(()) + } + Ok(i) => Err(self.error( + cap.span, + ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, + )), + } + } + + /// Return whether the parser should ignore whitespace or not. + fn ignore_whitespace(&self) -> bool { + self.parser().ignore_whitespace.get() + } + + /// Return the character at the current position of the parser. + /// + /// This panics if the current position does not point to a valid char. + fn char(&self) -> char { + self.char_at(self.offset()) + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..] + .chars() + .next() + .unwrap_or_else(|| panic!("expected char at offset {}", i)) + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// If the end of the input has been reached, then `false` is returned. 
+ fn bump(&self) -> bool { + if self.is_eof() { + return false; + } + let Position { mut offset, mut line, mut column } = self.pos(); + if self.char() == '\n' { + line = line.checked_add(1).unwrap(); + column = 1; + } else { + column = column.checked_add(1).unwrap(); + } + offset += self.char().len_utf8(); + self.parser().pos.set(Position { + offset: offset, + line: line, + column: column, + }); + self.pattern()[self.offset()..].chars().next().is_some() + } + + /// If the substring starting at the current position of the parser has + /// the given prefix, then bump the parser to the character immediately + /// following the prefix and return true. Otherwise, don't bump the parser + /// and return false. + fn bump_if(&self, prefix: &str) -> bool { + if self.pattern()[self.offset()..].starts_with(prefix) { + for _ in 0..prefix.chars().count() { + self.bump(); + } + true + } else { + false + } + } + + /// Returns true if and only if the parser is positioned at a look-around + /// prefix. The conditions under which this returns true must always + /// correspond to a regular expression that would otherwise be consider + /// invalid. + /// + /// This should only be called immediately after parsing the opening of + /// a group or a set of flags. + fn is_lookaround_prefix(&self) -> bool { + self.bump_if("?=") + || self.bump_if("?!") + || self.bump_if("?<=") + || self.bump_if("?<!") + } + + /// Bump the parser, and if the `x` flag is enabled, bump through any + /// subsequent spaces. Return true if and only if the parser is not at + /// EOF. + fn bump_and_bump_space(&self) -> bool { + if !self.bump() { + return false; + } + self.bump_space(); + !self.is_eof() + } + + /// If the `x` flag is enabled (i.e., whitespace insensitivity with + /// comments), then this will advance the parser through all whitespace + /// and comments to the next non-whitespace non-comment byte. + /// + /// If the `x` flag is disabled, then this is a no-op. 
+ /// + /// This should be used selectively throughout the parser where + /// arbitrary whitespace is permitted when the `x` flag is enabled. For + /// example, `{ 5 , 6}` is equivalent to `{5,6}`. + fn bump_space(&self) { + if !self.ignore_whitespace() { + return; + } + while !self.is_eof() { + if self.char().is_whitespace() { + self.bump(); + } else if self.char() == '#' { + let start = self.pos(); + let mut comment_text = String::new(); + self.bump(); + while !self.is_eof() { + let c = self.char(); + self.bump(); + if c == '\n' { + break; + } + comment_text.push(c); + } + let comment = ast::Comment { + span: Span::new(start, self.pos()), + comment: comment_text, + }; + self.parser().comments.borrow_mut().push(comment); + } else { + break; + } + } + } + + /// Peek at the next character in the input without advancing the parser. + /// + /// If the input has been exhausted, then this returns `None`. + fn peek(&self) -> Option<char> { + if self.is_eof() { + return None; + } + self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() + } + + /// Like peek, but will ignore spaces when the parser is in whitespace + /// insensitive mode. + fn peek_space(&self) -> Option<char> { + if !self.ignore_whitespace() { + return self.peek(); + } + if self.is_eof() { + return None; + } + let mut start = self.offset() + self.char().len_utf8(); + let mut in_comment = false; + for (i, c) in self.pattern()[start..].char_indices() { + if c.is_whitespace() { + continue; + } else if !in_comment && c == '#' { + in_comment = true; + } else if in_comment && c == '\n' { + in_comment = false; + } else { + start += i; + break; + } + } + self.pattern()[start..].chars().next() + } + + /// Returns true if the next call to `bump` would return false. + fn is_eof(&self) -> bool { + self.offset() == self.pattern().len() + } + + /// Return the current position of the parser, which includes the offset, + /// line and column. 
+ fn pos(&self) -> Position { + self.parser().pos.get() + } + + /// Create a span at the current position of the parser. Both the start + /// and end of the span are set. + fn span(&self) -> Span { + Span::splat(self.pos()) + } + + /// Create a span that covers the current character. + fn span_char(&self) -> Span { + let mut next = Position { + offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), + line: self.line(), + column: self.column().checked_add(1).unwrap(), + }; + if self.char() == '\n' { + next.line += 1; + next.column = 1; + } + Span::new(self.pos(), next) + } + + /// Parse and push a single alternation on to the parser's internal stack. + /// If the top of the stack already has an alternation, then add to that + /// instead of pushing a new one. + /// + /// The concatenation given corresponds to a single alternation branch. + /// The concatenation returned starts the next branch and is empty. + /// + /// This assumes the parser is currently positioned at `|` and will advance + /// the parser to the character following `|`. + #[inline(never)] + fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> { + assert_eq!(self.char(), '|'); + concat.span.end = self.pos(); + self.push_or_add_alternation(concat); + self.bump(); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + + /// Pushes or adds the given branch of an alternation to the parser's + /// internal stack of state. + fn push_or_add_alternation(&self, concat: ast::Concat) { + use self::GroupState::*; + + let mut stack = self.parser().stack_group.borrow_mut(); + if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { + alts.asts.push(concat.into_ast()); + return; + } + stack.push(Alternation(ast::Alternation { + span: Span::new(concat.span.start, self.pos()), + asts: vec![concat.into_ast()], + })); + } + + /// Parse and push a group AST (and its parent concatenation) on to the + /// parser's internal stack. 
Return a fresh concatenation corresponding + /// to the group's sub-AST. + /// + /// If a set of flags was found (with no group), then the concatenation + /// is returned with that set of flags added. + /// + /// This assumes that the parser is currently positioned on the opening + /// parenthesis. It advances the parser to the character at the start + /// of the sub-expression (or adjoining expression). + /// + /// If there was a problem parsing the start of the group, then an error + /// is returned. + #[inline(never)] + fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> { + assert_eq!(self.char(), '('); + match self.parse_group()? { + Either::Left(set) => { + let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); + if let Some(v) = ignore { + self.parser().ignore_whitespace.set(v); + } + + concat.asts.push(Ast::Flags(set)); + Ok(concat) + } + Either::Right(group) => { + let old_ignore_whitespace = self.ignore_whitespace(); + let new_ignore_whitespace = group + .flags() + .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) + .unwrap_or(old_ignore_whitespace); + self.parser().stack_group.borrow_mut().push( + GroupState::Group { + concat: concat, + group: group, + ignore_whitespace: old_ignore_whitespace, + }, + ); + self.parser().ignore_whitespace.set(new_ignore_whitespace); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + } + } + + /// Pop a group AST from the parser's internal stack and set the group's + /// AST to the given concatenation. Return the concatenation containing + /// the group. + /// + /// This assumes that the parser is currently positioned on the closing + /// parenthesis and advances the parser to the character following the `)`. + /// + /// If no such group could be popped, then an unopened group error is + /// returned. 
+ #[inline(never)] + fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> { + use self::GroupState::*; + + assert_eq!(self.char(), ')'); + let mut stack = self.parser().stack_group.borrow_mut(); + let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack + .pop() + { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, None) + } + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, Some(alt)) + } + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { + return Err(self + .error(self.span_char(), ast::ErrorKind::GroupUnopened)); + } + }; + self.parser().ignore_whitespace.set(ignore_whitespace); + group_concat.span.end = self.pos(); + self.bump(); + group.span.end = self.pos(); + match alt { + Some(mut alt) => { + alt.span.end = group_concat.span.end; + alt.asts.push(group_concat.into_ast()); + group.ast = Box::new(alt.into_ast()); + } + None => { + group.ast = Box::new(group_concat.into_ast()); + } + } + prior_concat.asts.push(Ast::Group(group)); + Ok(prior_concat) + } + + /// Pop the last state from the parser's internal stack, if it exists, and + /// add the given concatenation to it. There either must be no state or a + /// single alternation item on the stack. Any other scenario produces an + /// error. + /// + /// This assumes that the parser has advanced to the end. + #[inline(never)] + fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> { + concat.span.end = self.pos(); + let mut stack = self.parser().stack_group.borrow_mut(); + let ast = match stack.pop() { + None => Ok(concat.into_ast()), + Some(GroupState::Alternation(mut alt)) => { + alt.span.end = self.pos(); + alt.asts.push(concat.into_ast()); + Ok(Ast::Alternation(alt)) + } + Some(GroupState::Group { group, .. 
}) => { + return Err( + self.error(group.span, ast::ErrorKind::GroupUnclosed) + ); + } + }; + // If we try to pop again, there should be nothing. + match stack.pop() { + None => ast, + Some(GroupState::Alternation(_)) => { + // This unreachable is unfortunate. This case can't happen + // because the only way we can be here is if there were two + // `GroupState::Alternation`s adjacent in the parser's stack, + // which we guarantee to never happen because we never push a + // `GroupState::Alternation` if one is already at the top of + // the stack. + unreachable!() + } + Some(GroupState::Group { group, .. }) => { + Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) + } + } + } + + /// Parse the opening of a character class and push the current class + /// parsing context onto the parser's stack. This assumes that the parser + /// is positioned at an opening `[`. The given union should correspond to + /// the union of set items built up before seeing the `[`. + /// + /// If there was a problem parsing the opening of the class, then an error + /// is returned. Otherwise, a new union of set items for the class is + /// returned (which may be populated with either a `]` or a `-`). + #[inline(never)] + fn push_class_open( + &self, + parent_union: ast::ClassSetUnion, + ) -> Result<ast::ClassSetUnion> { + assert_eq!(self.char(), '['); + + let (nested_set, nested_union) = self.parse_set_class_open()?; + self.parser() + .stack_class + .borrow_mut() + .push(ClassState::Open { union: parent_union, set: nested_set }); + Ok(nested_union) + } + + /// Parse the end of a character class set and pop the character class + /// parser stack. The union given corresponds to the last union built + /// before seeing the closing `]`. The union returned corresponds to the + /// parent character class set with the nested class added to it. + /// + /// This assumes that the parser is positioned at a `]` and will advance + /// the parser to the byte immediately following the `]`. 
+ /// + /// If the stack is empty after popping, then this returns the final + /// "top-level" character class AST (where a "top-level" character class + /// is one that is not nested inside any other character class). + /// + /// If there is no corresponding opening bracket on the parser's stack, + /// then an error is returned. + #[inline(never)] + fn pop_class( + &self, + nested_union: ast::ClassSetUnion, + ) -> Result<Either<ast::ClassSetUnion, ast::Class>> { + assert_eq!(self.char(), ']'); + + let item = ast::ClassSet::Item(nested_union.into_item()); + let prevset = self.pop_class_op(item); + let mut stack = self.parser().stack_class.borrow_mut(); + match stack.pop() { + None => { + // We can never observe an empty stack: + // + // 1) We are guaranteed to start with a non-empty stack since + // the character class parser is only initiated when it sees + // a `[`. + // 2) If we ever observe an empty stack while popping after + // seeing a `]`, then we signal the character class parser + // to terminate. + panic!("unexpected empty character class stack") + } + Some(ClassState::Op { .. }) => { + // This panic is unfortunate, but this case is impossible + // since we already popped the Op state if one exists above. + // Namely, every push to the class parser stack is guarded by + // whether an existing Op is already on the top of the stack. + // If it is, the existing Op is modified. That is, the stack + // can never have consecutive Op states. + panic!("unexpected ClassState::Op") + } + Some(ClassState::Open { mut union, mut set }) => { + self.bump(); + set.span.end = self.pos(); + set.kind = prevset; + if stack.is_empty() { + Ok(Either::Right(ast::Class::Bracketed(set))) + } else { + union.push(ast::ClassSetItem::Bracketed(Box::new(set))); + Ok(Either::Left(union)) + } + } + } + } + + /// Return an "unclosed class" error whose span points to the most + /// recently opened class. + /// + /// This should only be called while parsing a character class. 
+ #[inline(never)] + fn unclosed_class_error(&self) -> ast::Error { + for state in self.parser().stack_class.borrow().iter().rev() { + match *state { + ClassState::Open { ref set, .. } => { + return self + .error(set.span, ast::ErrorKind::ClassUnclosed); + } + _ => {} + } + } + // We are guaranteed to have a non-empty stack with at least + // one open bracket, so we should never get here. + panic!("no open character class found") + } + + /// Push the current set of class items on to the class parser's stack as + /// the left hand side of the given operator. + /// + /// A fresh set union is returned, which should be used to build the right + /// hand side of this operator. + #[inline(never)] + fn push_class_op( + &self, + next_kind: ast::ClassSetBinaryOpKind, + next_union: ast::ClassSetUnion, + ) -> ast::ClassSetUnion { + let item = ast::ClassSet::Item(next_union.into_item()); + let new_lhs = self.pop_class_op(item); + self.parser() + .stack_class + .borrow_mut() + .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); + ast::ClassSetUnion { span: self.span(), items: vec![] } + } + + /// Pop a character class set from the character class parser stack. If the + /// top of the stack is just an item (not an operation), then return the + /// given set unchanged. If the top of the stack is an operation, then the + /// given set will be used as the rhs of the operation on the top of the + /// stack. In that case, the binary operation is returned as a set. + #[inline(never)] + fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { + let mut stack = self.parser().stack_class.borrow_mut(); + let (kind, lhs) = match stack.pop() { + Some(ClassState::Op { kind, lhs }) => (kind, lhs), + Some(state @ ClassState::Open { .. 
}) => { + stack.push(state); + return rhs; + } + None => unreachable!(), + }; + let span = Span::new(lhs.span().start, rhs.span().end); + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: kind, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } +} + +impl<'s, P: Borrow<Parser>> ParserI<'s, P> { + /// Parse the regular expression into an abstract syntax tree. + fn parse(&self) -> Result<Ast> { + self.parse_with_comments().map(|astc| astc.ast) + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + fn parse_with_comments(&self) -> Result<ast::WithComments> { + assert_eq!(self.offset(), 0, "parser can only be used once"); + self.parser().reset(); + let mut concat = ast::Concat { span: self.span(), asts: vec![] }; + loop { + self.bump_space(); + if self.is_eof() { + break; + } + match self.char() { + '(' => concat = self.push_group(concat)?, + ')' => concat = self.pop_group(concat)?, + '|' => concat = self.push_alternate(concat)?, + '[' => { + let class = self.parse_set_class()?; + concat.asts.push(Ast::Class(class)); + } + '?' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::ZeroOrOne, + )?; + } + '*' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::ZeroOrMore, + )?; + } + '+' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::OneOrMore, + )?; + } + '{' => { + concat = self.parse_counted_repetition(concat)?; + } + _ => concat.asts.push(self.parse_primitive()?.into_ast()), + } + } + let ast = self.pop_group_end(concat)?; + NestLimiter::new(self).check(&ast)?; + Ok(ast::WithComments { + ast: ast, + comments: mem::replace( + &mut *self.parser().comments.borrow_mut(), + vec![], + ), + }) + } + + /// Parses an uncounted repetition operation. An uncounted repetition + /// operator includes ?, * and +, but does not include the {m,n} syntax. 
+ /// The given `kind` should correspond to the operator observed by the + /// caller. + /// + /// This assumes that the parser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + #[inline(never)] + fn parse_uncounted_repetition( + &self, + mut concat: ast::Concat, + kind: ast::RepetitionKind, + ) -> Result<ast::Concat> { + assert!( + self.char() == '?' || self.char() == '*' || self.char() == '+' + ); + let op_start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + }; + match ast { + Ast::Empty(_) | Ast::Flags(_) => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + _ => {} + } + let mut greedy = true; + if self.bump() && self.char() == '?' { + greedy = false; + self.bump(); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: Span::new(op_start, self.pos()), + kind: kind, + }, + greedy: greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the {m,n} syntax, and does not include the ?, * or + + /// operators. + /// + /// This assumes that the parser is currently positioned at the opening `{` + /// and advances the parser to the first character after the operator. + /// (Note that the operator may include a single additional `?`, which + /// makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. 
The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + #[inline(never)] + fn parse_counted_repetition( + &self, + mut concat: ast::Concat, + ) -> Result<ast::Concat> { + assert!(self.char() == '{'); + let start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + }; + match ast { + Ast::Empty(_) | Ast::Flags(_) => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + _ => {} + } + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + let count_start = specialize_err( + self.parse_decimal(), + ast::ErrorKind::DecimalEmpty, + ast::ErrorKind::RepetitionCountDecimalEmpty, + )?; + let mut range = ast::RepetitionRange::Exactly(count_start); + if self.is_eof() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() != '}' { + let count_end = specialize_err( + self.parse_decimal(), + ast::ErrorKind::DecimalEmpty, + ast::ErrorKind::RepetitionCountDecimalEmpty, + )?; + range = ast::RepetitionRange::Bounded(count_start, count_end); + } else { + range = ast::RepetitionRange::AtLeast(count_start); + } + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' 
{ + greedy = false; + self.bump(); + } + + let op_span = Span::new(start, self.pos()); + if !range.is_valid() { + return Err( + self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) + ); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: op_span, + kind: ast::RepetitionKind::Range(range), + }, + greedy: greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parse a group (which contains a sub-expression) or a set of flags. + /// + /// If a group was found, then it is returned with an empty AST. If a set + /// of flags is found, then that set is returned. + /// + /// The parser should be positioned at the opening parenthesis. + /// + /// This advances the parser to the character before the start of the + /// sub-expression (in the case of a group) or to the closing parenthesis + /// immediately following the set of flags. + /// + /// # Errors + /// + /// If flags are given and incorrectly specified, then a corresponding + /// error is returned. + /// + /// If a capture name is given and it is incorrectly specified, then a + /// corresponding error is returned. 
+ #[inline(never)] + fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> { + assert_eq!(self.char(), '('); + let open_span = self.span_char(); + self.bump(); + self.bump_space(); + if self.is_lookaround_prefix() { + return Err(self.error( + Span::new(open_span.start, self.span().end), + ast::ErrorKind::UnsupportedLookAround, + )); + } + let inner_span = self.span(); + if self.bump_if("?P<") { + let capture_index = self.next_capture_index(open_span)?; + let cap = self.parse_capture_name(capture_index)?; + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureName(cap), + ast: Box::new(Ast::Empty(self.span())), + })) + } else if self.bump_if("?") { + if self.is_eof() { + return Err( + self.error(open_span, ast::ErrorKind::GroupUnclosed) + ); + } + let flags = self.parse_flags()?; + let char_end = self.char(); + self.bump(); + if char_end == ')' { + // We don't allow empty flags, e.g., `(?)`. We instead + // interpret it as a repetition operator missing its argument. + if flags.items.is_empty() { + return Err(self.error( + inner_span, + ast::ErrorKind::RepetitionMissing, + )); + } + Ok(Either::Left(ast::SetFlags { + span: Span { end: self.pos(), ..open_span }, + flags: flags, + })) + } else { + assert_eq!(char_end, ':'); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::NonCapturing(flags), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } else { + let capture_index = self.next_capture_index(open_span)?; + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureIndex(capture_index), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. + /// + /// The caller must provide the capture index of the group for this name. 
+ #[inline(never)] + fn parse_capture_name( + &self, + capture_index: u32, + ) -> Result<ast::CaptureName> { + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupNameInvalid, + )); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start.offset..end.offset]; + if name.is_empty() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::GroupNameEmpty, + )); + } + let capname = ast::CaptureName { + span: Span::new(start, end), + name: name.to_string(), + index: capture_index, + }; + self.add_capture_name(&capname)?; + Ok(capname) + } + + /// Parse a sequence of flags starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. 
+ #[inline(never)] + fn parse_flags(&self) -> Result<ast::Flags> { + let mut flags = ast::Flags { span: self.span(), items: vec![] }; + let mut last_was_negation = None; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = Some(self.span_char()); + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Negation, + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagRepeatedNegation { + original: flags.items[i].span, + }, + )); + } + } else { + last_was_negation = None; + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Flag(self.parse_flag()?), + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagDuplicate { + original: flags.items[i].span, + }, + )); + } + } + if !self.bump() { + return Err( + self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) + ); + } + } + if let Some(span) = last_was_negation { + return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); + } + flags.span.end = self.pos(); + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. + #[inline(never)] + fn parse_flag(&self) -> Result<ast::Flag> { + match self.char() { + 'i' => Ok(ast::Flag::CaseInsensitive), + 'm' => Ok(ast::Flag::MultiLine), + 's' => Ok(ast::Flag::DotMatchesNewLine), + 'U' => Ok(ast::Flag::SwapGreed), + 'u' => Ok(ast::Flag::Unicode), + 'x' => Ok(ast::Flag::IgnoreWhitespace), + _ => { + Err(self + .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) + } + } + } + + /// Parse a primitive AST. e.g., A literal, non-set character class or + /// assertion. + /// + /// This assumes that the parser expects a primitive at the current + /// location. i.e., All other non-primitive cases have been handled. 
+ /// For example, if the parser's position is at `|`, then `|` will be + /// treated as a literal (e.g., inside a character class). + /// + /// This advances the parser to the first character immediately following + /// the primitive. + fn parse_primitive(&self) -> Result<Primitive> { + match self.char() { + '\\' => self.parse_escape(), + '.' => { + let ast = Primitive::Dot(self.span_char()); + self.bump(); + Ok(ast) + } + '^' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::StartLine, + }); + self.bump(); + Ok(ast) + } + '$' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::EndLine, + }); + self.bump(); + Ok(ast) + } + c => { + let ast = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: c, + }); + self.bump(); + Ok(ast) + } + } + } + + /// Parse an escape sequence as a primitive AST. + /// + /// This assumes the parser is positioned at the start of the escape + /// sequence, i.e., `\`. It advances the parser to the first position + /// immediately following the escape sequence. + #[inline(never)] + fn parse_escape(&self) -> Result<Primitive> { + assert_eq!(self.char(), '\\'); + let start = self.pos(); + if !self.bump() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let c = self.char(); + // Put some of the more complicated routines into helpers. 
+ match c { + '0'..='7' => { + if !self.parser().octal { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + let mut lit = self.parse_octal(); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + '8'..='9' if !self.parser().octal => { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + 'x' | 'u' | 'U' => { + let mut lit = self.parse_hex()?; + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + 'p' | 'P' => { + let mut cls = self.parse_unicode_class()?; + cls.span.start = start; + return Ok(Primitive::Unicode(cls)); + } + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + let mut cls = self.parse_perl_class(); + cls.span.start = start; + return Ok(Primitive::Perl(cls)); + } + _ => {} + } + + // Handle all of the one letter sequences inline. + self.bump(); + let span = Span::new(start, self.pos()); + if is_meta_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Punctuation, + c: c, + })); + } + let special = |kind, c| { + Ok(Primitive::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Special(kind), + c: c, + })) + }; + match c { + 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), + 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), + 't' => special(ast::SpecialLiteralKind::Tab, '\t'), + 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), + 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), + 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), + ' ' if self.ignore_whitespace() => { + special(ast::SpecialLiteralKind::Space, ' ') + } + 'A' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::StartText, + })), + 'z' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::EndText, + })), + 'b' => Ok(Primitive::Assertion(ast::Assertion { + span: 
span, + kind: ast::AssertionKind::WordBoundary, + })), + 'B' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::NotWordBoundary, + })), + _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), + } + } + + /// Parse an octal representation of a Unicode codepoint up to 3 digits + /// long. This expects the parser to be positioned at the first octal + /// digit and advances the parser to the first character immediately + /// following the octal number. This also assumes that parsing octal + /// escapes is enabled. + /// + /// Assuming the preconditions are met, this routine can never fail. + #[inline(never)] + fn parse_octal(&self) -> ast::Literal { + use std::char; + use std::u32; + + assert!(self.parser().octal); + assert!('0' <= self.char() && self.char() <= '7'); + let start = self.pos(); + // Parse up to two more digits. + while self.bump() + && '0' <= self.char() + && self.char() <= '7' + && self.pos().offset - start.offset <= 2 + {} + let end = self.pos(); + let octal = &self.pattern()[start.offset..end.offset]; + // Parsing the octal should never fail since the above guarantees a + // valid number. + let codepoint = + u32::from_str_radix(octal, 8).expect("valid octal number"); + // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no + // invalid Unicode scalar values. + let c = char::from_u32(codepoint).expect("Unicode scalar value"); + ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::Octal, + c: c, + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. 
+ #[inline(never)] + fn parse_hex(&self) -> Result<ast::Literal> { + assert!( + self.char() == 'x' || self.char() == 'u' || self.char() == 'U' + ); + + let hex_kind = match self.char() { + 'x' => ast::HexLiteralKind::X, + 'u' => ast::HexLiteralKind::UnicodeShort, + _ => ast::HexLiteralKind::UnicodeLong, + }; + if !self.bump_and_bump_space() { + return Err( + self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) + ); + } + if self.char() == '{' { + self.parse_hex_brace(hex_kind) + } else { + self.parse_hex_digits(hex_kind) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). + #[inline(never)] + fn parse_hex_digits( + &self, + kind: ast::HexLiteralKind, + ) -> Result<ast::Literal> { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let start = self.pos(); + for i in 0..kind.digits() { + if i > 0 && !self.bump_and_bump_space() { + return Err(self + .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); + } + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + let end = self.pos(); + let hex = scratch.as_str(); + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::HexFixed(kind), + c: c, + }), + } + } + + /// Parse a hex representation of any Unicode scalar value. 
This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. + #[inline(never)] + fn parse_hex_brace( + &self, + kind: ast::HexLiteralKind, + ) -> Result<ast::Literal> { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let brace_pos = self.pos(); + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let end = self.pos(); + let hex = scratch.as_str(); + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if hex.is_empty() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeHexEmpty, + )); + } + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, self.pos()), + kind: ast::LiteralKind::HexBrace(kind), + c: c, + }), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. 
+ fn parse_decimal(&self) -> Result<u32> { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + while !self.is_eof() && self.char().is_whitespace() { + self.bump(); + } + let start = self.pos(); + while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + let span = Span::new(start, self.pos()); + while !self.is_eof() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); + } + match u32::from_str_radix(digits, 10).ok() { + Some(n) => Ok(n), + None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), + } + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges, but can also contain nested character classes of + /// any type (sans `.`). + /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. + #[inline(never)] + fn parse_set_class(&self) -> Result<ast::Class> { + assert_eq!(self.char(), '['); + + let mut union = + ast::ClassSetUnion { span: self.span(), items: vec![] }; + loop { + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + match self.char() { + '[' => { + // If we've already parsed the opening bracket, then + // attempt to treat this as the beginning of an ASCII + // class. If ASCII class parsing fails, then the parser + // backs up to `[`. + if !self.parser().stack_class.borrow().is_empty() { + if let Some(cls) = self.maybe_parse_ascii_class() { + union.push(ast::ClassSetItem::Ascii(cls)); + continue; + } + } + union = self.push_class_open(union)?; + } + ']' => match self.pop_class(union)? 
{ + Either::Left(nested_union) => { + union = nested_union; + } + Either::Right(class) => return Ok(class), + }, + '&' if self.peek() == Some('&') => { + assert!(self.bump_if("&&")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Intersection, + union, + ); + } + '-' if self.peek() == Some('-') => { + assert!(self.bump_if("--")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Difference, + union, + ); + } + '~' if self.peek() == Some('~') => { + assert!(self.bump_if("~~")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::SymmetricDifference, + union, + ); + } + _ => { + union.push(self.parse_set_class_range()?); + } + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like \w or \p{Greek}. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + #[inline(never)] + fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> { + let prim1 = self.parse_set_class_item()?; + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. + if self.char() != '-' + || self.peek_space() == Some(']') + || self.peek_space() == Some('-') + { + return prim1.into_class_set_item(self); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. 
+ if !self.bump_and_bump_space() { + return Err(self.unclosed_class_error()); + } + let prim2 = self.parse_set_class_item()?; + let range = ast::ClassSetRange { + span: Span::new(prim1.span().start, prim2.span().end), + start: prim1.into_class_literal(self)?, + end: prim2.into_class_literal(self)?, + }; + if !range.is_valid() { + return Err( + self.error(range.span, ast::ErrorKind::ClassRangeInvalid) + ); + } + Ok(ast::ClassSetItem::Range(range)) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + #[inline(never)] + fn parse_set_class_item(&self) -> Result<Primitive> { + if self.char() == '\\' { + self.parse_escape() + } else { + let x = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: self.char(), + }); + self.bump(); + Ok(x) + } + } + + /// Parses the opening of a character class set. This includes the opening + /// bracket along with `^` if present to indicate negation. This also + /// starts parsing the opening set of unioned items if applicable, since + /// there are special rules applied to certain characters in the opening + /// of a character class. For example, `[^]]` is the class of all + /// characters not equal to `]`. (`]` would need to be escaped in any other + /// position.) Similarly for `-`. + /// + /// In all cases, the op inside the returned `ast::ClassBracketed` is an + /// empty union. This empty union should be replaced with the actual item + /// when it is popped from the parser's stack. 
+ /// + /// This assumes the parser is positioned at the opening `[` and advances + /// the parser to the first non-special byte of the character class. + /// + /// An error is returned if EOF is found. + #[inline(never)] + fn parse_set_class_open( + &self, + ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { + assert_eq!(self.char(), '['); + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + + let negated = if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + true + }; + // Accept any number of `-` as literal `-`. + let mut union = + ast::ClassSetUnion { span: self.span(), items: vec![] }; + while self.char() == '-' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: '-', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. + if union.items.is_empty() && self.char() == ']' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: ']', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + let set = ast::ClassBracketed { + span: Span::new(start, self.pos()), + negated: negated, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: Span::new(union.span.start, union.span.start), + items: vec![], + }), + }; + Ok((set, union)) + } + + /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. 
+ /// + /// If no valid ASCII character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding ASCII class is returned. + #[inline(never)] + fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> { + // ASCII character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an ASCII character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "ASCII character characters have the syntax `[:NAME:]` + // which can only appear within character brackets." This means that + // things like `[[:lower:]A]` are legal constructs. + // + // However, if one types an incorrect ASCII character class, e.g., + // `[[:loower:]]`, then we treat that as a normal nested character + // class containing the characters `:elorw`. One might argue that we + // should return an error instead since the repeated colons give away + // the intent to write an ASCII class. But what if the user typed + // `[[:lower]]` instead? How can we tell that was intended to be an + // ASCII class and not just a normal nested class? + // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense + // of better failure modes. + assert_eq!(self.char(), '['); + // If parsing fails, then we back up the parser to this starting point. 
+ let start = self.pos(); + let mut negated = false; + if !self.bump() || self.char() != ':' { + self.parser().pos.set(start); + return None; + } + if !self.bump() { + self.parser().pos.set(start); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + self.parser().pos.set(start); + return None; + } + } + let name_start = self.offset(); + while self.char() != ':' && self.bump() {} + if self.is_eof() { + self.parser().pos.set(start); + return None; + } + let name = &self.pattern()[name_start..self.offset()]; + if !self.bump_if(":]") { + self.parser().pos.set(start); + return None; + } + let kind = match ast::ClassAsciiKind::from_name(name) { + Some(kind) => kind, + None => { + self.parser().pos.set(start); + return None; + } + }; + Some(ast::ClassAscii { + span: Span::new(start, self.pos()), + kind: kind, + negated: negated, + }) + } + + /// Parse a Unicode class in either the single character notation, `\pN` + /// or the multi-character bracketed notation, `\p{Greek}`. This assumes + /// the parser is positioned at the `p` (or `P` for negation) and will + /// advance the parser to the character immediately following the class. + /// + /// Note that this does not check whether the class name is valid or not. 
+ #[inline(never)] + fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> { + assert!(self.char() == 'p' || self.char() == 'P'); + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let negated = self.char() == 'P'; + if !self.bump_and_bump_space() { + return Err( + self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) + ); + } + let (start, kind) = if self.char() == '{' { + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); + } + assert_eq!(self.char(), '}'); + self.bump(); + + let name = scratch.as_str(); + if let Some(i) = name.find("!=") { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: name[..i].to_string(), + value: name[i + 2..].to_string(), + }, + ) + } else if let Some(i) = name.find(':') { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: name[..i].to_string(), + value: name[i + 1..].to_string(), + }, + ) + } else if let Some(i) = name.find('=') { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: name[..i].to_string(), + value: name[i + 1..].to_string(), + }, + ) + } else { + (start, ast::ClassUnicodeKind::Named(name.to_string())) + } + } else { + let start = self.pos(); + let c = self.char(); + if c == '\\' { + return Err(self.error( + self.span_char(), + ast::ErrorKind::UnicodeClassInvalid, + )); + } + self.bump_and_bump_space(); + let kind = ast::ClassUnicodeKind::OneLetter(c); + (start, kind) + }; + Ok(ast::ClassUnicode { + span: Span::new(start, self.pos()), + negated: negated, + kind: kind, + }) + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. 
/// The parser must currently sit on one of `d`, `D`, `s`, `S`, `w` or `W`
/// (any other character causes a panic) and is advanced past that
/// character.
#[inline(never)]
fn parse_perl_class(&self) -> ast::ClassPerl {
    let letter = self.char();
    let span = self.span_char();
    self.bump();
    let kind = match letter {
        'd' | 'D' => ast::ClassPerlKind::Digit,
        's' | 'S' => ast::ClassPerlKind::Space,
        'w' | 'W' => ast::ClassPerlKind::Word,
        c => panic!("expected valid Perl class but got '{}'", c),
    };
    // Only the six letters above get this far, and among those the upper
    // case ones are the negated classes.
    let negated = letter.is_ascii_uppercase();
    ast::ClassPerl { span, kind, negated }
}
}

/// Walks a fully parsed Ast and verifies that its depth stays within the
/// configured nesting limit, producing an error otherwise.
#[derive(Debug)]
struct NestLimiter<'p, 's, P> {
    /// The parser on whose behalf the nest limit is checked.
    p: &'p ParserI<'s, P>,
    /// How deep the walk currently is.
    depth: u32,
}

impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
    /// Creates a limiter for the given parser, starting at depth zero.
    fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
        NestLimiter { p, depth: 0 }
    }

    /// Visits the given Ast with this limiter and returns the visitor's
    /// result.
    #[inline(never)]
    fn check(self, ast: &Ast) -> Result<()> {
        ast::visit(ast, self)
    }

    /// Descends one level. Fails with `NestLimitExceeded` if the new depth
    /// is above the parser's `nest_limit`, or if the counter itself would
    /// overflow. The error points at `span`.
    fn increment_depth(&mut self, span: &Span) -> Result<()> {
        let deeper = match self.depth.checked_add(1) {
            Some(deeper) => deeper,
            None => {
                return Err(self.p.error(
                    span.clone(),
                    ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
                ));
            }
        };
        let limit = self.p.parser().nest_limit;
        if deeper > limit {
            return Err(self.p.error(
                span.clone(),
                ast::ErrorKind::NestLimitExceeded(limit),
            ));
        }
        self.depth = deeper;
        Ok(())
    }

    /// Ascends one level.
    fn decrement_depth(&mut self) {
        // Provided the visitor pairs its pre/post calls correctly, the
        // depth never drops below 0.
        let shallower = self.depth.checked_sub(1);
        self.depth = shallower.unwrap();
    }
}

impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
    type Output = ();
    type Err = ast::Error;

    fn finish(self) -> Result<()> {
        Ok(())
    }

    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
        let span = match *ast {
            Ast::Class(ast::Class::Bracketed(ref class)) => &class.span,
            Ast::Repetition(ref rep) => &rep.span,
            Ast::Group(ref group) => &group.span,
            Ast::Alternation(ref alt) => &alt.span,
            Ast::Concat(ref concat) => &concat.span,
            // Base cases do not add a level of nesting.
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => return Ok(()),
        };
        self.increment_depth(span)
    }

    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
        match *ast {
            Ast::Class(ast::Class::Bracketed(_))
            | Ast::Repetition(_)
            | Ast::Group(_)
            | Ast::Alternation(_)
            | Ast::Concat(_) => self.decrement_depth(),
            // Base cases never incremented the depth, so there is nothing
            // to undo.
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {}
        }
        Ok(())
    }

    fn visit_class_set_item_pre(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        let span = match *ast {
            ast::ClassSetItem::Bracketed(ref class) => &class.span,
            ast::ClassSetItem::Union(ref union) => &union.span,
            // Base cases do not add a level of nesting.
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Literal(_)
            | ast::ClassSetItem::Range(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => return Ok(()),
        };
        self.increment_depth(span)
    }

    fn visit_class_set_item_post(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        match *ast {
            ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
                self.decrement_depth()
            }
            // Base cases never incremented the depth, so there is nothing
            // to undo.
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Literal(_)
            | ast::ClassSetItem::Range(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {}
        }
        Ok(())
    }

    fn visit_class_set_binary_op_pre(
        &mut self,
        ast: &ast::ClassSetBinaryOp,
    ) -> Result<()> {
        // Every binary operation counts as one level of nesting.
        self.increment_depth(&ast.span)
    }

    fn visit_class_set_binary_op_post(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<()> {
        self.decrement_depth();
        Ok(())
    }
}

/// Rewrites the kind of an error result: an `Err` whose kind equals `from`
/// is rebuilt with kind `to`, keeping its pattern and span. Any other
/// result is passed through untouched. This exists so that clearer error
/// messages can be returned when possible.
fn specialize_err<T>(
    result: Result<T>,
    from: ast::ErrorKind,
    to: ast::ErrorKind,
) -> Result<T> {
    result.map_err(|e| {
        if e.kind == from {
            ast::Error { kind: to, pattern: e.pattern, span: e.span }
        } else {
            e
        }
    })
}

+#[cfg(test)] +mod tests { + use std::ops::Range; + + use super::{Parser, ParserBuilder, ParserI, Primitive}; + use crate::ast::{self, Ast, Position, Span}; + + // Our own assert_eq, which has slightly better formatting (but honestly + // still kind of crappy). + macro_rules!
assert_eq { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + if !(*left_val == *right_val) { + panic!( + "assertion failed: `(left == right)`\n\n\ + left: `{:?}`\nright: `{:?}`\n\n", + left_val, right_val + ) + } + } + } + }}; + } + + // We create these errors to compare with real ast::Errors in the tests. + // We define equality between TestError and ast::Error to disregard the + // pattern string in ast::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: ast::ErrorKind, + } + + impl PartialEq<ast::Error> for TestError { + fn eq(&self, other: &ast::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq<TestError> for ast::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn s(str: &str) -> String { + str.to_string() + } + + fn parser(pattern: &str) -> ParserI<'_, Parser> { + ParserI::new(Parser::new(), pattern) + } + + fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { + let parser = ParserBuilder::new().octal(true).build(); + ParserI::new(parser, pattern) + } + + fn parser_nest_limit( + pattern: &str, + nest_limit: u32, + ) -> ParserI<'_, Parser> { + let p = ParserBuilder::new().nest_limit(nest_limit).build(); + ParserI::new(p, pattern) + } + + fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { + let p = ParserBuilder::new().ignore_whitespace(true).build(); + ParserI::new(p, pattern) + } + + /// Short alias for creating a new span. + fn nspan(start: Position, end: Position) -> Span { + Span::new(start, end) + } + + /// Short alias for creating a new position. + fn npos(offset: usize, line: usize, column: usize) -> Position { + Position::new(offset, line, column) + } + + /// Create a new span from the given offset range. This assumes a single + /// line and sets the columns based on the offsets. 
i.e., This only works + /// out of the box for ASCII, which is fine for most tests. + fn span(range: Range<usize>) -> Span { + let start = Position::new(range.start, 1, range.start + 1); + let end = Position::new(range.end, 1, range.end + 1); + Span::new(start, end) + } + + /// Create a new span for the corresponding byte range in the given string. + fn span_range(subject: &str, range: Range<usize>) -> Span { + let start = Position { + offset: range.start, + line: 1 + subject[..range.start].matches('\n').count(), + column: 1 + subject[..range.start] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.start].chars().count()), + }; + let end = Position { + offset: range.end, + line: 1 + subject[..range.end].matches('\n').count(), + column: 1 + subject[..range.end] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.end].chars().count()), + }; + Span::new(start, end) + } + + /// Create a verbatim literal starting at the given position. + fn lit(c: char, start: usize) -> Ast { + lit_with(c, span(start..start + c.len_utf8())) + } + + /// Create a punctuation literal starting at the given position. + fn punct_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Punctuation, + c: c, + }) + } + + /// Create a verbatim literal with the given span. + fn lit_with(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Verbatim, + c: c, + }) + } + + /// Create a concatenation with the given range. + fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast { + concat_with(span(range), asts) + } + + /// Create a concatenation with the given span. + fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { + Ast::Concat(ast::Concat { span: span, asts: asts }) + } + + /// Create an alternation with the given span. 
+ fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { + Ast::Alternation(ast::Alternation { span: span(range), asts: asts }) + } + + /// Create a capturing group with the given span. + fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { + Ast::Group(ast::Group { + span: span(range), + kind: ast::GroupKind::CaptureIndex(index), + ast: Box::new(ast), + }) + } + + /// Create an ast::SetFlags. + /// + /// The given pattern should be the full pattern string. The range given + /// should correspond to the byte offsets where the flag set occurs. + /// + /// If negated is true, then the set is interpreted as beginning with a + /// negation. + fn flag_set( + pat: &str, + range: Range<usize>, + flag: ast::Flag, + negated: bool, + ) -> Ast { + let mut items = vec![ast::FlagsItem { + span: span_range(pat, (range.end - 2)..(range.end - 1)), + kind: ast::FlagsItemKind::Flag(flag), + }]; + if negated { + items.insert( + 0, + ast::FlagsItem { + span: span_range(pat, (range.start + 2)..(range.end - 2)), + kind: ast::FlagsItemKind::Negation, + }, + ); + } + Ast::Flags(ast::SetFlags { + span: span_range(pat, range.clone()), + flags: ast::Flags { + span: span_range(pat, (range.start + 2)..(range.end - 1)), + items: items, + }, + }) + } + + #[test] + fn parse_nest_limit() { + // A nest limit of 0 still allows some types of regexes. + assert_eq!( + parser_nest_limit("", 0).parse(), + Ok(Ast::Empty(span(0..0))) + ); + assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); + + // Test repetition operations, which require one level of nesting. 
+ assert_eq!( + parser_nest_limit("a+", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("a+", 1).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser_nest_limit("(a)+", 1).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("a+*", 1).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("a+*", 2).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })), + })) + ); + + // Test concatenations. A concatenation requires one level of nesting. + assert_eq!( + parser_nest_limit("ab", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("ab", 1).parse(), + Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])) + ); + assert_eq!( + parser_nest_limit("abc", 1).parse(), + Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])) + ); + + // Test alternations. An alternation requires one level of nesting. 
+ assert_eq!( + parser_nest_limit("a|b", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("a|b", 1).parse(), + Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])) + ); + assert_eq!( + parser_nest_limit("a|b|c", 1).parse(), + Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])) + ); + + // Test character classes. Classes form their own mini-recursive + // syntax! + assert_eq!( + parser_nest_limit("[a]", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("[a]", 1).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( + ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: 'a', + } + )), + }))) + ); + assert_eq!( + parser_nest_limit("[ab]", 1).parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::NestLimitExceeded(2), + } + ); + assert_eq!( + parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(3), + } + ); + assert_eq!( + parser_nest_limit("[a--b]", 1).parse().unwrap_err(), + TestError { + span: span(1..5), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(2), + } + ); + } + + #[test] + fn parse_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. 
+bar +# This is comment 4."; + let astc = parser(pat).parse_with_comments().unwrap(); + assert_eq!( + astc.ast, + concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('f', span_range(pat, 26..27)), + lit_with('o', span_range(pat, 27..28)), + lit_with('o', span_range(pat, 28..29)), + lit_with('b', span_range(pat, 74..75)), + lit_with('a', span_range(pat, 75..76)), + lit_with('r', span_range(pat, 76..77)), + ] + ) + ); + assert_eq!( + astc.comments, + vec![ + ast::Comment { + span: span_range(pat, 5..26), + comment: s(" This is comment 1."), + }, + ast::Comment { + span: span_range(pat, 30..51), + comment: s(" This is comment 2."), + }, + ast::Comment { + span: span_range(pat, 53..74), + comment: s(" This is comment 3."), + }, + ast::Comment { + span: span_range(pat, 78..98), + comment: s(" This is comment 4."), + }, + ] + ); + } + + #[test] + fn parse_holistic() { + assert_eq!(parser("]").parse(), Ok(lit(']', 0))); + assert_eq!( + parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), + Ok(concat( + 0..36, + vec![ + punct_lit('\\', span(0..2)), + punct_lit('.', span(2..4)), + punct_lit('+', span(4..6)), + punct_lit('*', span(6..8)), + punct_lit('?', span(8..10)), + punct_lit('(', span(10..12)), + punct_lit(')', span(12..14)), + punct_lit('|', span(14..16)), + punct_lit('[', span(16..18)), + punct_lit(']', span(18..20)), + punct_lit('{', span(20..22)), + punct_lit('}', span(22..24)), + punct_lit('^', span(24..26)), + punct_lit('$', span(26..28)), + punct_lit('#', span(28..30)), + punct_lit('&', span(30..32)), + punct_lit('-', span(32..34)), + punct_lit('~', span(34..36)), + ] + )) + ); + } + + #[test] + fn parse_ignore_whitespace() { + // Test that basic whitespace insensitivity works. 
+ let pat = "(?x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + nspan(npos(0, 1, 1), npos(7, 1, 8)), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + ] + )) + ); + + // Test that we can toggle whitespace insensitivity. + let pat = "(?x)a b(?-x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + nspan(npos(0, 1, 1), npos(15, 1, 16)), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), + lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), + lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), + lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), + ] + )) + ); + + // Test that nesting whitespace insensitive flags works. + let pat = "a (?x:a )a "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..11), + vec![ + lit_with('a', span_range(pat, 0..1)), + lit_with(' ', span_range(pat, 1..2)), + Ast::Group(ast::Group { + span: span_range(pat, 2..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 4..5), + items: vec![ast::FlagsItem { + span: span_range(pat, 4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::IgnoreWhitespace + ), + },], + }), + ast: Box::new(lit_with('a', span_range(pat, 6..7))), + }), + lit_with('a', span_range(pat, 9..10)), + lit_with(' ', span_range(pat, 10..11)), + ] + )) + ); + + // Test that whitespace after an opening paren is insignificant. 
+ let pat = "(?x)( ?P<foo> a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + }), + ast: Box::new(lit_with('a', span_range(pat, 14..15))), + }), + ] + )) + ); + let pat = "(?x)( a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(lit_with('a', span_range(pat, 7..8))), + }), + ] + )) + ); + let pat = "(?x)( ?: a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 8..8), + items: vec![], + }), + ast: Box::new(lit_with('a', span_range(pat, 11..12))), + }), + ] + )) + ); + let pat = r"(?x)\x { 53 }"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span(4..13), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::X + ), + c: 'S', + }), + ] + )) + ); + + // Test that whitespace after an escape is OK. + let pat = r"(?x)\ "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span_range(pat, 4..6), + kind: ast::LiteralKind::Special( + ast::SpecialLiteralKind::Space + ), + c: ' ', + }), + ] + )) + ); + // ... 
but only when `x` mode is enabled. + let pat = r"\ "; + assert_eq!( + parser(pat).parse().unwrap_err(), + TestError { + span: span_range(pat, 0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_newlines() { + let pat = ".\n."; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..3), + vec![ + Ast::Dot(span_range(pat, 0..1)), + lit_with('\n', span_range(pat, 1..2)), + Ast::Dot(span_range(pat, 2..3)), + ] + )) + ); + + let pat = "foobar\nbaz\nquux\n"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), + lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), + lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), + lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), + lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), + lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), + lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), + lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), + lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), + lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), + lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), + lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), + lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), + lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), + ] + )) + ); + } + + #[test] + fn parse_uncounted_repetition() { + assert_eq!( + parser(r"a*").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a+").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: 
true, + ast: Box::new(lit('a', 0)), + })) + ); + + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a??").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a?b").parse(), + Ok(concat( + 0..3, + vec![ + Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }), + lit('b', 2), + ] + )) + ); + assert_eq!( + parser(r"a??b").parse(), + Ok(concat( + 0..4, + vec![ + Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }), + lit('b', 3), + ] + )) + ); + assert_eq!( + parser(r"ab?").parse(), + Ok(concat( + 0..3, + vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ] + )) + ); + assert_eq!( + parser(r"(ab)?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(4..5), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(group( + 0..4, + 1, + concat(1..3, vec![lit('a', 1), lit('b', 2),]) + )), + })) + ); + 
assert_eq!( + parser(r"|a?").parse(), + Ok(alt( + 0..3, + vec![ + Ast::Empty(span(0..0)), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 1)), + }), + ] + )) + ); + + assert_eq!( + parser(r"*").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?i)*").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(*)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?:?)").parse().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"+").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"?").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|*").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|+").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|?").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + } + + #[test] + fn parse_counted_repetition() { + assert_eq!( + parser(r"a{5}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..4), + op: ast::RepetitionOp { + span: span(1..4), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: 
Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5,}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::AtLeast(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5,9}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5}?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"ab{5}").parse(), + Ok(concat( + 0..5, + vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ] + )) + ); + assert_eq!( + parser(r"ab{5}c").parse(), + Ok(concat( + 0..6, + vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + lit('c', 5), + ] + )) + ); + + assert_eq!( + parser(r"a{ 5 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{ 5 , 9 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..10), + op: ast::RepetitionOp 
{ + span: span(1..10), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser_ignore_whitespace(r"a{5,9} ?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..8), + op: ast::RepetitionOp { + span: span(1..8), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + + assert_eq!( + parser(r"(?i){0}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?m){1,1}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"a{]}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{1,]}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{a").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{9999999999}").parse().unwrap_err(), + TestError { + span: span(2..12), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + assert_eq!( + parser(r"a{9").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{9,a").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + 
parser(r"a{9,9999999999}").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + assert_eq!( + parser(r"a{9,").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{9,11").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{2,1}").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountInvalid, + } + ); + assert_eq!( + parser(r"{5}").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|{5}").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + } + + #[test] + fn parse_alternate() { + assert_eq!( + parser(r"a|b").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..3), + asts: vec![lit('a', 0), lit('b', 2)], + })) + ); + assert_eq!( + parser(r"(a|b)").parse(), + Ok(group( + 0..5, + 1, + Ast::Alternation(ast::Alternation { + span: span(1..4), + asts: vec![lit('a', 1), lit('b', 3)], + }) + )) + ); + + assert_eq!( + parser(r"a|b|c").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..5), + asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], + })) + ); + assert_eq!( + parser(r"ax|by|cz").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..8), + asts: vec![ + concat(0..2, vec![lit('a', 0), lit('x', 1)]), + concat(3..5, vec![lit('b', 3), lit('y', 4)]), + concat(6..8, vec![lit('c', 6), lit('z', 7)]), + ], + })) + ); + assert_eq!( + parser(r"(ax|by|cz)").parse(), + Ok(group( + 0..10, + 1, + Ast::Alternation(ast::Alternation { + span: span(1..9), + asts: vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + concat(4..6, vec![lit('b', 4), lit('y', 5)]), + concat(7..9, vec![lit('c', 7), lit('z', 8)]), + ], + }) + )) + ); + assert_eq!( + 
parser(r"(ax|(by|(cz)))").parse(), + Ok(group( + 0..14, + 1, + alt( + 1..13, + vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + group( + 4..13, + 2, + alt( + 5..12, + vec![ + concat( + 5..7, + vec![lit('b', 5), lit('y', 6)] + ), + group( + 8..12, + 3, + concat( + 9..11, + vec![lit('c', 9), lit('z', 10),] + ) + ), + ] + ) + ), + ] + ) + )) + ); + + assert_eq!( + parser(r"|").parse(), + Ok(alt( + 0..1, + vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + )) + ); + assert_eq!( + parser(r"||").parse(), + Ok(alt( + 0..2, + vec![ + Ast::Empty(span(0..0)), + Ast::Empty(span(1..1)), + Ast::Empty(span(2..2)), + ] + )) + ); + assert_eq!( + parser(r"a|").parse(), + Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + ); + assert_eq!( + parser(r"|a").parse(), + Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + ); + + assert_eq!( + parser(r"(|)").parse(), + Ok(group( + 0..3, + 1, + alt( + 1..2, + vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + ) + )) + ); + assert_eq!( + parser(r"(a|)").parse(), + Ok(group( + 0..4, + 1, + alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + )) + ); + assert_eq!( + parser(r"(|a)").parse(), + Ok(group( + 0..4, + 1, + alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + )) + ); + + assert_eq!( + parser(r"a|b)").parse().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::GroupUnopened, + } + ); + assert_eq!( + parser(r"(a|b").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + } + + #[test] + fn parse_unsupported_lookaround() { + assert_eq!( + parser(r"(?=a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?!a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?<=a)").parse().unwrap_err(), + TestError { + span: span(0..4), + kind: 
ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?<!a)").parse().unwrap_err(), + TestError { + span: span(0..4), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + } + + #[test] + fn parse_group() { + assert_eq!( + parser("(?i)").parse(), + Ok(Ast::Flags(ast::SetFlags { + span: span(0..4), + flags: ast::Flags { + span: span(2..3), + items: vec![ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }], + }, + })) + ); + assert_eq!( + parser("(?iU)").parse(), + Ok(Ast::Flags(ast::SetFlags { + span: span(0..5), + flags: ast::Flags { + span: span(2..4), + items: vec![ + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag( + ast::Flag::SwapGreed + ), + }, + ], + }, + })) + ); + assert_eq!( + parser("(?i-U)").parse(), + Ok(Ast::Flags(ast::SetFlags { + span: span(0..6), + flags: ast::Flags { + span: span(2..5), + items: vec![ + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::SwapGreed + ), + }, + ], + }, + })) + ); + + assert_eq!( + parser("()").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..2), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(Ast::Empty(span(1..1))), + })) + ); + assert_eq!( + parser("(a)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..3), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(lit('a', 1)), + })) + ); + assert_eq!( + parser("(())").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..4), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(Ast::Group(ast::Group { + span: span(1..3), + kind: ast::GroupKind::CaptureIndex(2), + ast: Box::new(Ast::Empty(span(2..2))), + })), + 
})) + ); + + assert_eq!( + parser("(?:a)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..5), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(2..2), + items: vec![], + }), + ast: Box::new(lit('a', 3)), + })) + ); + + assert_eq!( + parser("(?i:a)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..6), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(2..3), + items: vec![ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + },], + }), + ast: Box::new(lit('a', 4)), + })) + ); + assert_eq!( + parser("(?i-U:a)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..8), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(2..5), + items: vec![ + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::SwapGreed + ), + }, + ], + }), + ast: Box::new(lit('a', 6)), + })) + ); + + assert_eq!( + parser("(").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + assert_eq!( + parser("(?").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + assert_eq!( + parser("(?P").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("(?P<").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + } + ); + assert_eq!( + parser("(a").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + assert_eq!( + parser("(()").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + assert_eq!( + parser(")").parse().unwrap_err(), + TestError { + span: span(0..1), 
+ kind: ast::ErrorKind::GroupUnopened, + } + ); + assert_eq!( + parser("a)").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::GroupUnopened, + } + ); + } + + #[test] + fn parse_capture_name() { + assert_eq!( + parser("(?P<a>z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..8), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + }), + ast: Box::new(lit('z', 6)), + })) + ); + assert_eq!( + parser("(?P<abc>z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?P<a_1>z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?P<a.1>z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?P<a[1]>z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..11), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + }), + ast: Box::new(lit('z', 9)), + })) + ); + + assert_eq!( + parser("(?P<").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + } + ); + assert_eq!( + parser("(?P<>z)").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameEmpty, + } + ); + assert_eq!( + parser("(?P<a").parse().unwrap_err(), + TestError { + span: span(5..5), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + } + ); + assert_eq!( + parser("(?P<ab").parse().unwrap_err(), + TestError { + span: span(6..6), + kind: 
ast::ErrorKind::GroupNameUnexpectedEof, + } + ); + assert_eq!( + parser("(?P<0a").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<~").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<abc~").parse().unwrap_err(), + TestError { + span: span(7..8), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(), + TestError { + span: span(12..13), + kind: ast::ErrorKind::GroupNameDuplicate { + original: span(4..5), + }, + } + ); + } + + #[test] + fn parse_flags() { + assert_eq!( + parser("i:").parse_flags(), + Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + }) + ); + assert_eq!( + parser("i)").parse_flags(), + Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + }) + ); + + assert_eq!( + parser("isU:").parse_flags(), + Ok(ast::Flags { + span: span(0..3), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + + assert_eq!( + parser("-isU:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: 
span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + assert_eq!( + parser("i-sU:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + + assert_eq!( + parser("isU").parse_flags().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::FlagUnexpectedEof, + } + ); + assert_eq!( + parser("isUa:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("isUi:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, + } + ); + assert_eq!( + parser("i-sU-i:").parse_flags().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::FlagRepeatedNegation { + original: span(1..2), + }, + } + ); + assert_eq!( + parser("-)").parse_flags().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + assert_eq!( + parser("i-)").parse_flags().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + assert_eq!( + parser("iU-)").parse_flags().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + } + + #[test] + fn parse_flag() { + assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); + assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); + assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); + assert_eq!(parser("U").parse_flag(), 
Ok(ast::Flag::SwapGreed)); + assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); + + assert_eq!( + parser("a").parse_flag().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("☃").parse_flag().unwrap_err(), + TestError { + span: span_range("☃", 0..3), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + } + + #[test] + fn parse_primitive_non_escape() { + assert_eq!( + parser(r".").parse_primitive(), + Ok(Primitive::Dot(span(0..1))) + ); + assert_eq!( + parser(r"^").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::StartLine, + })) + ); + assert_eq!( + parser(r"$").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::EndLine, + })) + ); + + assert_eq!( + parser(r"a").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: 'a', + })) + ); + assert_eq!( + parser(r"|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: '|', + })) + ); + assert_eq!( + parser(r"☃").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span_range("☃", 0..3), + kind: ast::LiteralKind::Verbatim, + c: '☃', + })) + ); + } + + #[test] + fn parse_escape() { + assert_eq!( + parser(r"\|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Punctuation, + c: '|', + })) + ); + let specials = &[ + (r"\a", '\x07', ast::SpecialLiteralKind::Bell), + (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), + (r"\t", '\t', ast::SpecialLiteralKind::Tab), + (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), + (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), + (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), + ]; + for &(pat, c, ref kind) in 
specials { + assert_eq!( + parser(pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Special(kind.clone()), + c: c, + })) + ); + } + assert_eq!( + parser(r"\A").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::StartText, + })) + ); + assert_eq!( + parser(r"\z").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::EndText, + })) + ); + assert_eq!( + parser(r"\b").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })) + ); + assert_eq!( + parser(r"\B").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::NotWordBoundary, + })) + ); + + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\y").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_unsupported_backreference() { + assert_eq!( + parser(r"\0").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + } + ); + assert_eq!( + parser(r"\9").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + } + ); + } + + #[test] + fn parse_octal() { + for i in 0..511 { + let pat = format!(r"\{:o}", i); + assert_eq!( + parser_octal(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::Octal, + c: ::std::char::from_u32(i).unwrap(), + })) + ); + } + assert_eq!( + parser_octal(r"\778").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + })) + ); + assert_eq!( + 
parser_octal(r"\7777").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + })) + ); + assert_eq!( + parser_octal(r"\778").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: '8', + }), + ], + })) + ); + assert_eq!( + parser_octal(r"\7777").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..5), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: '7', + }), + ], + })) + ); + + assert_eq!( + parser_octal(r"\8").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_hex_two() { + for i in 0..256 { + let pat = format!(r"\x{:02x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), + c: ::std::char::from_u32(i).unwrap(), + })) + ); + } + + assert_eq!( + parser(r"\xF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\xG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\xFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + } + + #[test] + fn parse_hex_four() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\u{:04x}", i); + assert_eq!( + parser(&pat).parse_escape(), + 
Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeShort + ), + c: c, + })) + ); + } + + assert_eq!( + parser(r"\uF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\uG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uD800").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + } + + #[test] + fn parse_hex_eight() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\U{:08x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeLong + ), + c: c, + })) + ); + } + + assert_eq!( + parser(r"\UF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\UG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFG").parse_escape().unwrap_err(), + TestError { + span: 
span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(6..7), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(7..8), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(8..9), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(9..10), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + } + + #[test] + fn parse_hex_brace() { + assert_eq!( + parser(r"\u{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeShort + ), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\U{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeLong + ), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{26C4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{10fFfF}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..10), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '\u{10FFFF}', + })) + ); + + assert_eq!( + parser(r"\x").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: 
ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{FF").parse_escape().unwrap_err(), + TestError { + span: span(2..5), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{}").parse_escape().unwrap_err(), + TestError { + span: span(2..4), + kind: ast::ErrorKind::EscapeHexEmpty, + } + ); + assert_eq!( + parser(r"\x{FGF}").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..9), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + assert_eq!( + parser(r"\x{D800}").parse_escape().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + assert_eq!( + parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..12), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + } + + #[test] + fn parse_decimal() { + assert_eq!(parser("123").parse_decimal(), Ok(123)); + assert_eq!(parser("0").parse_decimal(), Ok(0)); + assert_eq!(parser("01").parse_decimal(), Ok(1)); + + assert_eq!( + parser("-1").parse_decimal().unwrap_err(), + TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } + ); + assert_eq!( + parser("").parse_decimal().unwrap_err(), + TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } + ); + assert_eq!( + parser("9999999999").parse_decimal().unwrap_err(), + TestError { + span: span(0..10), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + } + + #[test] + fn parse_set_class() { + fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet { + ast::ClassSet::union(ast::ClassSetUnion { + span: span, + items: items, + }) + } + + fn intersection( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> 
ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::Intersection, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn difference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::Difference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn symdifference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::SymmetricDifference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { + ast::ClassSet::Item(item) + } + + fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { + ast::ClassSetItem::Ascii(cls) + } + + fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { + ast::ClassSetItem::Unicode(cls) + } + + fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { + ast::ClassSetItem::Perl(cls) + } + + fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { + ast::ClassSetItem::Bracketed(Box::new(cls)) + } + + fn lit(span: Span, c: char) -> ast::ClassSetItem { + ast::ClassSetItem::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Verbatim, + c: c, + }) + } + + fn empty(span: Span) -> ast::ClassSetItem { + ast::ClassSetItem::Empty(span) + } + + fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { + let pos1 = Position { + offset: span.start.offset + start.len_utf8(), + column: span.start.column + 1, + ..span.start + }; + let pos2 = Position { + offset: span.end.offset - end.len_utf8(), + column: span.end.column - 1, + ..span.end + }; + ast::ClassSetItem::Range(ast::ClassSetRange { + span: span, + start: ast::Literal { + span: Span { end: pos1, ..span }, + kind: ast::LiteralKind::Verbatim, + c: start, + }, + end: ast::Literal { + 
span: Span { start: pos2, ..span }, + kind: ast::LiteralKind::Verbatim, + c: end, + }, + }) + } + + fn alnum(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { + span: span, + kind: ast::ClassAsciiKind::Alnum, + negated: negated, + } + } + + fn lower(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { + span: span, + kind: ast::ClassAsciiKind::Lower, + negated: negated, + } + } + + assert_eq!( + parser("[[:alnum:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..11), + negated: false, + kind: itemset(item_ascii(alnum(span(1..10), false))), + }))) + ); + assert_eq!( + parser("[[[:alnum:]]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..13), + negated: false, + kind: itemset(item_bracket(ast::ClassBracketed { + span: span(1..12), + negated: false, + kind: itemset(item_ascii(alnum(span(2..11), false))), + })), + }))) + ); + assert_eq!( + parser("[[:alnum:]&&[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: intersection( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + }))) + ); + assert_eq!( + parser("[[:alnum:]--[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: difference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + }))) + ); + assert_eq!( + parser("[[:alnum:]~~[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: symdifference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + }))) + ); + + assert_eq!( + parser("[a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), 
+ negated: false, + kind: itemset(lit(span(1..2), 'a')), + }))) + ); + assert_eq!( + parser(r"[a\]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union( + span(1..4), + vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: ']', + }), + ] + ), + }))) + ); + assert_eq!( + parser(r"[a\-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union( + span(1..5), + vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '-', + }), + lit(span(4..5), 'z'), + ] + ), + }))) + ); + assert_eq!( + parser("[ab]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] + ), + }))) + ); + assert_eq!( + parser("[a-]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] + ), + }))) + ); + assert_eq!( + parser("[-a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] + ), + }))) + ); + assert_eq!( + parser(r"[\pL]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(item_unicode(ast::ClassUnicode { + span: span(1..4), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('L'), + })), + }))) + ); + assert_eq!( + parser(r"[\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: 
ast::ClassPerlKind::Word, + negated: false, + })), + }))) + ); + assert_eq!( + parser(r"[a\wz]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union( + span(1..5), + vec![ + lit(span(1..2), 'a'), + item_perl(ast::ClassPerl { + span: span(2..4), + kind: ast::ClassPerlKind::Word, + negated: false, + }), + lit(span(4..5), 'z'), + ] + ), + }))) + ); + + assert_eq!( + parser("[a-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(range(span(1..4), 'a', 'z')), + }))) + ); + assert_eq!( + parser("[a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..8), + negated: false, + kind: union( + span(1..7), + vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ] + ), + }))) + ); + assert_eq!( + parser(r"[\w&&a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + union( + span(5..11), + vec![ + range(span(5..8), 'a', 'c'), + range(span(8..11), 'x', 'z'), + ] + ), + ), + }))) + ); + assert_eq!( + parser(r"[a-cx-z&&\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + union( + span(1..7), + vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ] + ), + itemset(item_perl(ast::ClassPerl { + span: span(9..11), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + ), + }))) + ); + assert_eq!( + parser(r"[a--b--c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: difference( + span(1..8), + difference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + 
itemset(lit(span(7..8), 'c')), + ), + }))) + ); + assert_eq!( + parser(r"[a~~b~~c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: symdifference( + span(1..8), + symdifference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + }))) + ); + assert_eq!( + parser(r"[\^&&^]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '^', + })), + itemset(lit(span(5..6), '^')), + ), + }))) + ); + assert_eq!( + parser(r"[\&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '&', + })), + itemset(lit(span(5..6), '&')), + ), + }))) + ); + assert_eq!( + parser(r"[&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: intersection( + span(1..5), + intersection( + span(1..3), + itemset(empty(span(1..1))), + itemset(empty(span(3..3))), + ), + itemset(empty(span(5..5))), + ), + }))) + ); + + let pat = "[â-â]"; + assert_eq!( + parser(pat).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span_range(pat, 0..9), + negated: false, + kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { + span: span_range(pat, 1..8), + start: ast::Literal { + span: span_range(pat, 1..4), + kind: ast::LiteralKind::Verbatim, + c: 'â', + }, + end: ast::Literal { + span: span_range(pat, 5..8), + kind: ast::LiteralKind::Verbatim, + c: 'â', + }, + })), + }))) + ); + + assert_eq!( + parser(r"[]]").parse(), + 
Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), ']')), + }))) + ); + assert_eq!( + parser(r"[]\[]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union( + span(1..4), + vec![ + lit(span(1..2), ']'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '[', + }), + ] + ), + }))) + ); + assert_eq!( + parser(r"[\[]]").parse(), + Ok(concat( + 0..5, + vec![ + Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(ast::ClassSetItem::Literal( + ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '[', + } + )), + })), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ] + )) + ); + + assert_eq!( + parser("[").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[-]").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[[:alnum:]").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser(r"[\b]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + } + ); + assert_eq!( + parser(r"[\w-a]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassRangeLiteral, + } + ); + assert_eq!( + parser(r"[a-\w]").parse().unwrap_err(), + TestError { + span: span(3..5), + kind: ast::ErrorKind::ClassRangeLiteral, + } + ); + assert_eq!( + parser(r"[z-a]").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: 
ast::ErrorKind::ClassRangeInvalid, + } + ); + + assert_eq!( + parser_ignore_whitespace("[a ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser_ignore_whitespace("[a- ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + } + + #[test] + fn parse_set_class_open() { + assert_eq!(parser("[a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..1), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = + ast::ClassSetUnion { span: span(4..4), items: vec![] }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = + ast::ClassSetUnion { span: span(4..4), items: vec![] }; + Ok((set, union)) + } + ); + assert_eq!(parser("[-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: 
span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ - a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + assert_eq!(parser("[--a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!(parser("[]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: 
ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ] a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + }); + assert_eq!(parser("[-]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + + assert_eq!( + parser("[").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser_ignore_whitespace("[ ") + .parse_set_class_open() + .unwrap_err(), + TestError { + span: span(0..5), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[^").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[]").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: 
ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[-").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[--").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + } + + #[test] + fn maybe_parse_ascii_class() { + assert_eq!( + parser(r"[:alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + }) + ); + assert_eq!( + parser(r"[:alnum:]A").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + }) + ); + assert_eq!( + parser(r"[:^alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..10), + kind: ast::ClassAsciiKind::Alnum, + negated: true, + }) + ); + + let p = parser(r"[:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:^"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[^:alnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + } + + #[test] + fn parse_unicode_class() { + assert_eq!( + parser(r"\pN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })) + ); + assert_eq!( + parser(r"\PN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: true, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })) + ); + assert_eq!( + 
parser(r"\p{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("N")), + })) + ); + assert_eq!( + parser(r"\P{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: true, + kind: ast::ClassUnicodeKind::Named(s("N")), + })) + ); + assert_eq!( + parser(r"\p{Greek}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })) + ); + + assert_eq!( + parser(r"\p{scx:Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + assert_eq!( + parser(r"\p{scx=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + assert_eq!( + parser(r"\p{scx!=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..17), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + + assert_eq!( + parser(r"\p{:}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s(""), + value: s(""), + }, + })) + ); + assert_eq!( + parser(r"\p{=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s(""), + value: s(""), + }, + })) + ); + assert_eq!( + parser(r"\p{!=}").parse_escape(), + 
Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..6), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s(""), + value: s(""), + }, + })) + ); + + assert_eq!( + parser(r"\p").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{N").parse_escape().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{Greek").parse_escape().unwrap_err(), + TestError { + span: span(8..8), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + + assert_eq!( + parser(r"\pNz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + assert_eq!( + parser(r"\p{Greek}z").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..10), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })), + Ast::Literal(ast::Literal { + span: span(9..10), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + assert_eq!( + parser(r"\p\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + assert_eq!( + parser(r"\P\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + } + + #[test] + fn parse_perl_class() { + assert_eq!( + parser(r"\d").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: 
ast::ClassPerlKind::Digit, + negated: false, + })) + ); + assert_eq!( + parser(r"\D").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: true, + })) + ); + assert_eq!( + parser(r"\s").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: false, + })) + ); + assert_eq!( + parser(r"\S").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: true, + })) + ); + assert_eq!( + parser(r"\w").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: false, + })) + ); + assert_eq!( + parser(r"\W").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: true, + })) + ); + + assert_eq!( + parser(r"\d").parse(), + Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + }))) + ); + assert_eq!( + parser(r"\dz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..3), + asts: vec![ + Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })), + Ast::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + } + + // This tests a bug fix where the nest limit checker wasn't decrementing + // its depth during post-traversal, which causes long regexes to trip + // the default limit too aggressively. 
+ #[test] + fn regression_454_nest_too_big() { + let pattern = r#" + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4} + "#; + assert!(parser_nest_limit(pattern, 50).parse().is_ok()); + } + + // This tests that we treat a trailing `-` in a character class as a + // literal `-` even when whitespace mode is enabled and there is whitespace + // after the trailing `-`. + #[test] + fn regression_455_trailing_dash_ignore_whitespace() { + assert!(parser("(?x)[ / - ]").parse().is_ok()); + assert!(parser("(?x)[ a - ]").parse().is_ok()); + assert!(parser( + "(?x)[ + a + - ] + " + ) + .parse() + .is_ok()); + assert!(parser( + "(?x)[ + a # wat + - ] + " + ) + .parse() + .is_ok()); + + assert!(parser("(?x)[ / -").parse().is_err()); + assert!(parser("(?x)[ / - ").parse().is_err()); + assert!(parser( + "(?x)[ + / - + " + ) + .parse() + .is_err()); + assert!(parser( + "(?x)[ + / - # wat + " + ) + .parse() + .is_err()); + } +} diff --git a/vendor/regex-syntax/src/ast/print.rs b/vendor/regex-syntax/src/ast/print.rs new file mode 100644 index 000000000..045de2eaf --- /dev/null +++ b/vendor/regex-syntax/src/ast/print.rs @@ -0,0 +1,568 @@ +/*! +This module provides a regular expression printer for `Ast`. +*/ + +use std::fmt; + +use crate::ast::visitor::{self, Visitor}; +use crate::ast::{self, Ast}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. 
+#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression abstract syntax tree. +/// +/// A printer converts an abstract syntax tree (AST) to a regular expression +/// pattern string. This particular printer uses constant stack space and heap +/// space proportional to the size of the AST. +/// +/// This printer will not necessarily preserve the original formatting of the +/// regular expression pattern string. For example, all whitespace and comments +/// are ignored. +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. 
+ pub fn print<W: fmt::Write>(&mut self, ast: &Ast, wtr: W) -> fmt::Result { + visitor::visit(ast, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer<W> { + wtr: W, +} + +impl<W: fmt::Write> Visitor for Writer<W> { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_post(&mut self, ast: &Ast) -> fmt::Result { + use crate::ast::Class; + + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + Ast::Class(Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_post(x) + } + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), + } + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + match *ast { + ast::ClassSetItem::Bracketed(ref x) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + use crate::ast::ClassSetItem::*; + + match *ast { + Empty(_) => Ok(()), + Literal(ref x) => self.fmt_literal(x), + Range(ref x) => { + self.fmt_literal(&x.start)?; + self.wtr.write_str("-")?; + self.fmt_literal(&x.end)?; + Ok(()) + } + Ascii(ref x) => self.fmt_class_ascii(x), + Unicode(ref x) => self.fmt_class_unicode(x), + Perl(ref x) => 
self.fmt_class_perl(x), + Bracketed(ref x) => self.fmt_class_bracketed_post(x), + Union(_) => Ok(()), + } + } + + fn visit_class_set_binary_op_in( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + self.fmt_class_set_binary_op_kind(&ast.kind) + } +} + +impl<W: fmt::Write> Writer<W> { + fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { + use crate::ast::GroupKind::*; + match ast.kind { + CaptureIndex(_) => self.wtr.write_str("("), + CaptureName(ref x) => { + self.wtr.write_str("(?P<")?; + self.wtr.write_str(&x.name)?; + self.wtr.write_str(">")?; + Ok(()) + } + NonCapturing(ref flags) => { + self.wtr.write_str("(?")?; + self.fmt_flags(flags)?; + self.wtr.write_str(":")?; + Ok(()) + } + } + } + + fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result { + self.wtr.write_str(")") + } + + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { + use crate::ast::RepetitionKind::*; + match ast.op.kind { + ZeroOrOne if ast.greedy => self.wtr.write_str("?"), + ZeroOrOne => self.wtr.write_str("??"), + ZeroOrMore if ast.greedy => self.wtr.write_str("*"), + ZeroOrMore => self.wtr.write_str("*?"), + OneOrMore if ast.greedy => self.wtr.write_str("+"), + OneOrMore => self.wtr.write_str("+?"), + Range(ref x) => { + self.fmt_repetition_range(x)?; + if !ast.greedy { + self.wtr.write_str("?")?; + } + Ok(()) + } + } + } + + fn fmt_repetition_range( + &mut self, + ast: &ast::RepetitionRange, + ) -> fmt::Result { + use crate::ast::RepetitionRange::*; + match *ast { + Exactly(x) => write!(self.wtr, "{{{}}}", x), + AtLeast(x) => write!(self.wtr, "{{{},}}", x), + Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y), + } + } + + fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result { + use crate::ast::LiteralKind::*; + + match ast.kind { + Verbatim => self.wtr.write_char(ast.c), + Punctuation => write!(self.wtr, r"\{}", ast.c), + Octal => write!(self.wtr, r"\{:o}", ast.c as u32), + HexFixed(ast::HexLiteralKind::X) => { + 
write!(self.wtr, r"\x{:02X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{:04X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{:08X}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + } + Special(ast::SpecialLiteralKind::Bell) => { + self.wtr.write_str(r"\a") + } + Special(ast::SpecialLiteralKind::FormFeed) => { + self.wtr.write_str(r"\f") + } + Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"), + Special(ast::SpecialLiteralKind::LineFeed) => { + self.wtr.write_str(r"\n") + } + Special(ast::SpecialLiteralKind::CarriageReturn) => { + self.wtr.write_str(r"\r") + } + Special(ast::SpecialLiteralKind::VerticalTab) => { + self.wtr.write_str(r"\v") + } + Special(ast::SpecialLiteralKind::Space) => { + self.wtr.write_str(r"\ ") + } + } + } + + fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result { + use crate::ast::AssertionKind::*; + match ast.kind { + StartLine => self.wtr.write_str("^"), + EndLine => self.wtr.write_str("$"), + StartText => self.wtr.write_str(r"\A"), + EndText => self.wtr.write_str(r"\z"), + WordBoundary => self.wtr.write_str(r"\b"), + NotWordBoundary => self.wtr.write_str(r"\B"), + } + } + + fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result { + self.wtr.write_str("(?")?; + self.fmt_flags(&ast.flags)?; + self.wtr.write_str(")")?; + Ok(()) + } + + fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result { + use crate::ast::{Flag, FlagsItemKind}; + + for item in &ast.items { + match item.kind { + FlagsItemKind::Negation => self.wtr.write_str("-"), + FlagsItemKind::Flag(ref flag) => match *flag { + Flag::CaseInsensitive => self.wtr.write_str("i"), + 
Flag::MultiLine => self.wtr.write_str("m"), + Flag::DotMatchesNewLine => self.wtr.write_str("s"), + Flag::SwapGreed => self.wtr.write_str("U"), + Flag::Unicode => self.wtr.write_str("u"), + Flag::IgnoreWhitespace => self.wtr.write_str("x"), + }, + }?; + } + Ok(()) + } + + fn fmt_class_bracketed_pre( + &mut self, + ast: &ast::ClassBracketed, + ) -> fmt::Result { + if ast.negated { + self.wtr.write_str("[^") + } else { + self.wtr.write_str("[") + } + } + + fn fmt_class_bracketed_post( + &mut self, + _ast: &ast::ClassBracketed, + ) -> fmt::Result { + self.wtr.write_str("]") + } + + fn fmt_class_set_binary_op_kind( + &mut self, + ast: &ast::ClassSetBinaryOpKind, + ) -> fmt::Result { + use crate::ast::ClassSetBinaryOpKind::*; + match *ast { + Intersection => self.wtr.write_str("&&"), + Difference => self.wtr.write_str("--"), + SymmetricDifference => self.wtr.write_str("~~"), + } + } + + fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result { + use crate::ast::ClassPerlKind::*; + match ast.kind { + Digit if ast.negated => self.wtr.write_str(r"\D"), + Digit => self.wtr.write_str(r"\d"), + Space if ast.negated => self.wtr.write_str(r"\S"), + Space => self.wtr.write_str(r"\s"), + Word if ast.negated => self.wtr.write_str(r"\W"), + Word => self.wtr.write_str(r"\w"), + } + } + + fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result { + use crate::ast::ClassAsciiKind::*; + match ast.kind { + Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"), + Alnum => self.wtr.write_str("[:alnum:]"), + Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"), + Alpha => self.wtr.write_str("[:alpha:]"), + Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"), + Ascii => self.wtr.write_str("[:ascii:]"), + Blank if ast.negated => self.wtr.write_str("[:^blank:]"), + Blank => self.wtr.write_str("[:blank:]"), + Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"), + Cntrl => self.wtr.write_str("[:cntrl:]"), + Digit if ast.negated => 
self.wtr.write_str("[:^digit:]"), + Digit => self.wtr.write_str("[:digit:]"), + Graph if ast.negated => self.wtr.write_str("[:^graph:]"), + Graph => self.wtr.write_str("[:graph:]"), + Lower if ast.negated => self.wtr.write_str("[:^lower:]"), + Lower => self.wtr.write_str("[:lower:]"), + Print if ast.negated => self.wtr.write_str("[:^print:]"), + Print => self.wtr.write_str("[:print:]"), + Punct if ast.negated => self.wtr.write_str("[:^punct:]"), + Punct => self.wtr.write_str("[:punct:]"), + Space if ast.negated => self.wtr.write_str("[:^space:]"), + Space => self.wtr.write_str("[:space:]"), + Upper if ast.negated => self.wtr.write_str("[:^upper:]"), + Upper => self.wtr.write_str("[:upper:]"), + Word if ast.negated => self.wtr.write_str("[:^word:]"), + Word => self.wtr.write_str("[:word:]"), + Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"), + Xdigit => self.wtr.write_str("[:xdigit:]"), + } + } + + fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result { + use crate::ast::ClassUnicodeKind::*; + use crate::ast::ClassUnicodeOpKind::*; + + if ast.negated { + self.wtr.write_str(r"\P")?; + } else { + self.wtr.write_str(r"\p")?; + } + match ast.kind { + OneLetter(c) => self.wtr.write_char(c), + Named(ref x) => write!(self.wtr, "{{{}}}", x), + NamedValue { op: Equal, ref name, ref value } => { + write!(self.wtr, "{{{}={}}}", name, value) + } + NamedValue { op: Colon, ref name, ref value } => { + write!(self.wtr, "{{{}:{}}}", name, value) + } + NamedValue { op: NotEqual, ref name, ref value } => { + write!(self.wtr, "{{{}!={}}}", name, value) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::Printer; + use crate::ast::parse::ParserBuilder; + + fn roundtrip(given: &str) { + roundtrip_with(|b| b, given); + } + + fn roundtrip_with<F>(mut f: F, given: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let ast = builder.build().parse(given).unwrap(); + + let mut 
printer = Printer::new(); + let mut dst = String::new(); + printer.print(&ast, &mut dst).unwrap(); + assert_eq!(given, dst); + } + + #[test] + fn print_literal() { + roundtrip("a"); + roundtrip(r"\["); + roundtrip_with(|b| b.octal(true), r"\141"); + roundtrip(r"\x61"); + roundtrip(r"\x7F"); + roundtrip(r"\u0061"); + roundtrip(r"\U00000061"); + roundtrip(r"\x{61}"); + roundtrip(r"\x{7F}"); + roundtrip(r"\u{61}"); + roundtrip(r"\U{61}"); + + roundtrip(r"\a"); + roundtrip(r"\f"); + roundtrip(r"\t"); + roundtrip(r"\n"); + roundtrip(r"\r"); + roundtrip(r"\v"); + roundtrip(r"(?x)\ "); + } + + #[test] + fn print_dot() { + roundtrip("."); + } + + #[test] + fn print_concat() { + roundtrip("ab"); + roundtrip("abcde"); + roundtrip("a(bcd)ef"); + } + + #[test] + fn print_alternation() { + roundtrip("a|b"); + roundtrip("a|b|c|d|e"); + roundtrip("|a|b|c|d|e"); + roundtrip("|a|b|c|d|e|"); + roundtrip("a(b|c|d)|e|f"); + } + + #[test] + fn print_assertion() { + roundtrip(r"^"); + roundtrip(r"$"); + roundtrip(r"\A"); + roundtrip(r"\z"); + roundtrip(r"\b"); + roundtrip(r"\B"); + } + + #[test] + fn print_repetition() { + roundtrip("a?"); + roundtrip("a??"); + roundtrip("a*"); + roundtrip("a*?"); + roundtrip("a+"); + roundtrip("a+?"); + roundtrip("a{5}"); + roundtrip("a{5}?"); + roundtrip("a{5,}"); + roundtrip("a{5,}?"); + roundtrip("a{5,10}"); + roundtrip("a{5,10}?"); + } + + #[test] + fn print_flags() { + roundtrip("(?i)"); + roundtrip("(?-i)"); + roundtrip("(?s-i)"); + roundtrip("(?-si)"); + roundtrip("(?siUmux)"); + } + + #[test] + fn print_group() { + roundtrip("(?i:a)"); + roundtrip("(?P<foo>a)"); + roundtrip("(a)"); + } + + #[test] + fn print_class() { + roundtrip(r"[abc]"); + roundtrip(r"[a-z]"); + roundtrip(r"[^a-z]"); + roundtrip(r"[a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[a-z0-9---]"); + roundtrip(r"[a-z&&m-n]"); + roundtrip(r"[[a-z&&m-n]]"); + roundtrip(r"[a-z--m-n]"); + roundtrip(r"[a-z~~m-n]"); + roundtrip(r"[a-z[0-9]]"); + 
roundtrip(r"[a-z[^0-9]]"); + + roundtrip(r"\d"); + roundtrip(r"\D"); + roundtrip(r"\s"); + roundtrip(r"\S"); + roundtrip(r"\w"); + roundtrip(r"\W"); + + roundtrip(r"[[:alnum:]]"); + roundtrip(r"[[:^alnum:]]"); + roundtrip(r"[[:alpha:]]"); + roundtrip(r"[[:^alpha:]]"); + roundtrip(r"[[:ascii:]]"); + roundtrip(r"[[:^ascii:]]"); + roundtrip(r"[[:blank:]]"); + roundtrip(r"[[:^blank:]]"); + roundtrip(r"[[:cntrl:]]"); + roundtrip(r"[[:^cntrl:]]"); + roundtrip(r"[[:digit:]]"); + roundtrip(r"[[:^digit:]]"); + roundtrip(r"[[:graph:]]"); + roundtrip(r"[[:^graph:]]"); + roundtrip(r"[[:lower:]]"); + roundtrip(r"[[:^lower:]]"); + roundtrip(r"[[:print:]]"); + roundtrip(r"[[:^print:]]"); + roundtrip(r"[[:punct:]]"); + roundtrip(r"[[:^punct:]]"); + roundtrip(r"[[:space:]]"); + roundtrip(r"[[:^space:]]"); + roundtrip(r"[[:upper:]]"); + roundtrip(r"[[:^upper:]]"); + roundtrip(r"[[:word:]]"); + roundtrip(r"[[:^word:]]"); + roundtrip(r"[[:xdigit:]]"); + roundtrip(r"[[:^xdigit:]]"); + + roundtrip(r"\pL"); + roundtrip(r"\PL"); + roundtrip(r"\p{L}"); + roundtrip(r"\P{L}"); + roundtrip(r"\p{X=Y}"); + roundtrip(r"\P{X=Y}"); + roundtrip(r"\p{X:Y}"); + roundtrip(r"\P{X:Y}"); + roundtrip(r"\p{X!=Y}"); + roundtrip(r"\P{X!=Y}"); + } +} diff --git a/vendor/regex-syntax/src/ast/visitor.rs b/vendor/regex-syntax/src/ast/visitor.rs new file mode 100644 index 000000000..a0d1e7dd5 --- /dev/null +++ b/vendor/regex-syntax/src/ast/visitor.rs @@ -0,0 +1,519 @@ +use std::fmt; + +use crate::ast::{self, Ast}; + +/// A trait for visiting an abstract syntax tree (AST) in depth first order. +/// +/// The principal aim of this trait is to enable callers to perform case +/// analysis on an abstract syntax tree without necessarily using recursion. +/// In particular, this permits callers to do case analysis with constant stack +/// usage, which can be important since the size of an abstract syntax tree +/// may be proportional to end user input. 
+/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +/// +/// Note that the abstract syntax tree for a regular expression is quite +/// complex. Unless you specifically need it, you might be able to use the +/// much simpler +/// [high-level intermediate representation](../hir/struct.Hir.html) +/// and its +/// [corresponding `Visitor` trait](../hir/trait.Visitor.html) +/// instead. +pub trait Visitor { + /// The result of visiting an AST. + type Output; + /// An error that visiting an AST might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the AST or an error. + fn finish(self) -> Result<Self::Output, Self::Err>; + + /// This method is called before beginning traversal of the AST. + fn start(&mut self) {} + + /// This method is called on an `Ast` before descending into child `Ast` + /// nodes. + fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Ast` after descending all of its child + /// `Ast` nodes. + fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an + /// [`Alternation`](struct.Alternation.html). + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// before descending into child nodes. + fn visit_class_set_item_pre( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// after descending into child nodes. 
+ fn visit_class_set_item_post( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// before descending into child nodes. + fn visit_class_set_binary_op_pre( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// after descending into child nodes. + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between the left hand and right hand child nodes + /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html). + fn visit_class_set_binary_op_in( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Ast` while calling the +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Ast` without using a stack size proportional to the depth +/// of the `Ast`. Namely, this method will instead use constant stack size, but +/// will use heap space proportional to the size of the `Ast`. This may be +/// desirable in cases where the size of `Ast` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit<V: Visitor>(ast: &Ast, visitor: V) -> Result<V::Output, V::Err> { + HeapVisitor::new().visit(ast, visitor) +} + +/// HeapVisitor visits every item in an `Ast` recursively using constant stack +/// size and a heap size proportional to the size of the `Ast`. +struct HeapVisitor<'a> { + /// A stack of `Ast` nodes. 
This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Ast, Frame<'a>)>, + /// Similar to the `Ast` stack above, but is used only for character + /// classes. In particular, character classes embed their own mini + /// recursive syntax. + stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Ast`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a ast::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a ast::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, +} + +/// Represents a single stack frame while performing structural induction over +/// a character class. +enum ClassFrame<'a> { + /// The stack frame used while visiting every child node of a union of + /// character class items. + Union { + /// The child node we are currently visiting. + head: &'a ast::ClassSetItem, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [ast::ClassSetItem], + }, + /// The stack frame used while visiting a binary class operation. + Binary { op: &'a ast::ClassSetBinaryOp }, + /// A stack frame allocated just before descending into a binary operator's + /// left hand child node. 
+ BinaryLHS { + op: &'a ast::ClassSetBinaryOp, + lhs: &'a ast::ClassSet, + rhs: &'a ast::ClassSet, + }, + /// A stack frame allocated just before descending into a binary operator's + /// right hand child node. + BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet }, +} + +/// A representation of the inductive step when performing structural induction +/// over a character class. +/// +/// Note that there is no analogous explicit type for the inductive step for +/// `Ast` nodes because the inductive step is just an `Ast`. For character +/// classes, the inductive step can produce one of two possible child nodes: +/// an item or a binary operation. (An item cannot be a binary operation +/// because that would imply binary operations can be unioned in the concrete +/// syntax, which is not possible.) +enum ClassInduct<'a> { + Item(&'a ast::ClassSetItem), + BinaryOp(&'a ast::ClassSetBinaryOp), +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![], stack_class: vec![] } + } + + fn visit<V: Visitor>( + &mut self, + mut ast: &'a Ast, + mut visitor: V, + ) -> Result<V::Output, V::Err> { + self.stack.clear(); + self.stack_class.clear(); + + visitor.start(); + loop { + visitor.visit_pre(ast)?; + if let Some(x) = self.induct(ast, &mut visitor)? { + let child = x.child(); + self.stack.push((ast, x)); + ast = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(ast)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation { .. 
} = x { + visitor.visit_alternation_in()?; + } + ast = x.child(); + self.stack.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this AST, so we can post visit it now. + visitor.visit_post(post_ast)?; + } + } + } + + /// Build a stack frame for the given AST if one is needed (which occurs if + /// and only if there are child nodes in the AST). Otherwise, return None. + /// + /// If this visits a class, then the underlying visitor implementation may + /// return an error which will be passed on here. + fn induct<V: Visitor>( + &mut self, + ast: &'a Ast, + visitor: &mut V, + ) -> Result<Option<Frame<'a>>, V::Err> { + Ok(match *ast { + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.visit_class(x, visitor)?; + None + } + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { + Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) + } + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => Some(Frame::Alternation { + head: &x.asts[0], + tail: &x.asts[1..], + }), + _ => None, + }) + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. 
} => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } + + fn visit_class<V: Visitor>( + &mut self, + ast: &'a ast::ClassBracketed, + visitor: &mut V, + ) -> Result<(), V::Err> { + let mut ast = ClassInduct::from_bracketed(ast); + loop { + self.visit_class_pre(&ast, visitor)?; + if let Some(x) = self.induct_class(&ast) { + let child = x.child(); + self.stack_class.push((ast, x)); + ast = child; + continue; + } + self.visit_class_post(&ast, visitor)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack_class.pop() { + None => return Ok(()), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a union or a binary op, then we might have + // additional inductive steps to process. + if let Some(x) = self.pop_class(frame) { + if let ClassFrame::BinaryRHS { ref op, .. } = x { + visitor.visit_class_set_binary_op_in(op)?; + } + ast = x.child(); + self.stack_class.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this class node, so we can post visit it now. + self.visit_class_post(&post_ast, visitor)?; + } + } + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_pre<V: Visitor>( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + visitor.visit_class_set_item_pre(item)?; + } + ClassInduct::BinaryOp(op) => { + visitor.visit_class_set_binary_op_pre(op)?; + } + } + Ok(()) + } + + /// Call the appropriate `Visitor` methods given an inductive step. 
+ fn visit_class_post<V: Visitor>( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + visitor.visit_class_set_item_post(item)?; + } + ClassInduct::BinaryOp(op) => { + visitor.visit_class_set_binary_op_post(op)?; + } + } + Ok(()) + } + + /// Build a stack frame for the given class node if one is needed (which + /// occurs if and only if there are child nodes). Otherwise, return None. + fn induct_class(&self, ast: &ClassInduct<'a>) -> Option<ClassFrame<'a>> { + match *ast { + ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => { + match x.kind { + ast::ClassSet::Item(ref item) => { + Some(ClassFrame::Union { head: item, tail: &[] }) + } + ast::ClassSet::BinaryOp(ref op) => { + Some(ClassFrame::Binary { op: op }) + } + } + } + ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => { + if x.items.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &x.items[0], + tail: &x.items[1..], + }) + } + } + ClassInduct::BinaryOp(op) => Some(ClassFrame::BinaryLHS { + op: op, + lhs: &op.lhs, + rhs: &op.rhs, + }), + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop_class(&self, induct: ClassFrame<'a>) -> Option<ClassFrame<'a>> { + match induct { + ClassFrame::Union { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &tail[0], + tail: &tail[1..], + }) + } + } + ClassFrame::Binary { .. } => None, + ClassFrame::BinaryLHS { op, rhs, .. } => { + Some(ClassFrame::BinaryRHS { op: op, rhs: rhs }) + } + ClassFrame::BinaryRHS { .. } => None, + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child AST node to visit. + fn child(&self) -> &'a Ast { + match *self { + Frame::Repetition(rep) => &rep.ast, + Frame::Group(group) => &group.ast, + Frame::Concat { head, .. 
} => head, + Frame::Alternation { head, .. } => head, + } + } +} + +impl<'a> ClassFrame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child class node to visit. + fn child(&self) -> ClassInduct<'a> { + match *self { + ClassFrame::Union { head, .. } => ClassInduct::Item(head), + ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op), + ClassFrame::BinaryLHS { ref lhs, .. } => { + ClassInduct::from_set(lhs) + } + ClassFrame::BinaryRHS { ref rhs, .. } => { + ClassInduct::from_set(rhs) + } + } + } +} + +impl<'a> ClassInduct<'a> { + fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> { + ClassInduct::from_set(&ast.kind) + } + + fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> { + match *ast { + ast::ClassSet::Item(ref item) => ClassInduct::Item(item), + ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op), + } + } +} + +impl<'a> fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let x = match *self { + ClassFrame::Union { .. } => "Union", + ClassFrame::Binary { .. } => "Binary", + ClassFrame::BinaryLHS { .. } => "BinaryLHS", + ClassFrame::BinaryRHS { .. 
} => "BinaryRHS", + }; + write!(f, "{}", x) + } +} + +impl<'a> fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let x = match *self { + ClassInduct::Item(it) => match *it { + ast::ClassSetItem::Empty(_) => "Item(Empty)", + ast::ClassSetItem::Literal(_) => "Item(Literal)", + ast::ClassSetItem::Range(_) => "Item(Range)", + ast::ClassSetItem::Ascii(_) => "Item(Ascii)", + ast::ClassSetItem::Perl(_) => "Item(Perl)", + ast::ClassSetItem::Unicode(_) => "Item(Unicode)", + ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)", + ast::ClassSetItem::Union(_) => "Item(Union)", + }, + ClassInduct::BinaryOp(it) => match it.kind { + ast::ClassSetBinaryOpKind::Intersection => { + "BinaryOp(Intersection)" + } + ast::ClassSetBinaryOpKind::Difference => { + "BinaryOp(Difference)" + } + ast::ClassSetBinaryOpKind::SymmetricDifference => { + "BinaryOp(SymmetricDifference)" + } + }, + }; + write!(f, "{}", x) + } +} diff --git a/vendor/regex-syntax/src/either.rs b/vendor/regex-syntax/src/either.rs new file mode 100644 index 000000000..7ae41e4ce --- /dev/null +++ b/vendor/regex-syntax/src/either.rs @@ -0,0 +1,8 @@ +/// A simple binary sum type. +/// +/// This is occasionally useful in an ad hoc fashion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Either<Left, Right> { + Left(Left), + Right(Right), +} diff --git a/vendor/regex-syntax/src/error.rs b/vendor/regex-syntax/src/error.rs new file mode 100644 index 000000000..71cfa426a --- /dev/null +++ b/vendor/regex-syntax/src/error.rs @@ -0,0 +1,324 @@ +use std::cmp; +use std::error; +use std::fmt; +use std::result; + +use crate::ast; +use crate::hir; + +/// A type alias for dealing with errors returned by this crate. +pub type Result<T> = result::Result<T, Error>; + +/// This error type encompasses any error that can be returned by this crate. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Error { + /// An error that occurred while translating concrete syntax into abstract + /// syntax (AST). + Parse(ast::Error), + /// An error that occurred while translating abstract syntax into a high + /// level intermediate representation (HIR). + Translate(hir::Error), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl From<ast::Error> for Error { + fn from(err: ast::Error) -> Error { + Error::Parse(err) + } +} + +impl From<hir::Error> for Error { + fn from(err: hir::Error) -> Error { + Error::Translate(err) + } +} + +impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + match *self { + Error::Parse(ref x) => x.description(), + Error::Translate(ref x) => x.description(), + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Parse(ref x) => x.fmt(f), + Error::Translate(ref x) => x.fmt(f), + _ => unreachable!(), + } + } +} + +/// A helper type for formatting nice error messages. +/// +/// This type is responsible for reporting regex parse errors in a nice human +/// readable format. Most of its complexity is from interspersing notational +/// markers pointing out the position where an error occurred. +#[derive(Debug)] +pub struct Formatter<'e, E> { + /// The original regex pattern in which the error occurred. + pattern: &'e str, + /// The error kind. It must impl fmt::Display. + err: &'e E, + /// The primary span of the error. 
+ span: &'e ast::Span, + /// An auxiliary and optional span, in case the error needs to point to + /// two locations (e.g., when reporting a duplicate capture group name). + aux_span: Option<&'e ast::Span>, +} + +impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> { + fn from(err: &'e ast::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: err.auxiliary_span(), + } + } +} + +impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { + fn from(err: &'e hir::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: None, + } + } +} + +impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let spans = Spans::from_formatter(self); + if self.pattern.contains('\n') { + let divider = repeat_char('~', 79); + + writeln!(f, "regex parse error:")?; + writeln!(f, "{}", divider)?; + let notated = spans.notate(); + write!(f, "{}", notated)?; + writeln!(f, "{}", divider)?; + // If we have error spans that cover multiple lines, then we just + // note the line numbers. + if !spans.multi_line.is_empty() { + let mut notes = vec![]; + for span in &spans.multi_line { + notes.push(format!( + "on line {} (column {}) through line {} (column {})", + span.start.line, + span.start.column, + span.end.line, + span.end.column - 1 + )); + } + writeln!(f, "{}", notes.join("\n"))?; + } + write!(f, "error: {}", self.err)?; + } else { + writeln!(f, "regex parse error:")?; + let notated = Spans::from_formatter(self).notate(); + write!(f, "{}", notated)?; + write!(f, "error: {}", self.err)?; + } + Ok(()) + } +} + +/// This type represents an arbitrary number of error spans in a way that makes +/// it convenient to notate the regex pattern. 
("Notate" means "point out +/// exactly where the error occurred in the regex pattern.") +/// +/// Technically, we can only ever have two spans given our current error +/// structure. However, after toiling with a specific algorithm for handling +/// two spans, it became obvious that an algorithm to handle an arbitrary +/// number of spans was actually much simpler. +struct Spans<'p> { + /// The original regex pattern string. + pattern: &'p str, + /// The total width that should be used for line numbers. The width is + /// used for left padding the line numbers for alignment. + /// + /// A value of `0` means line numbers should not be displayed. That is, + /// the pattern is itself only one line. + line_number_width: usize, + /// All error spans that occur on a single line. This sequence always has + /// length equivalent to the number of lines in `pattern`, where the index + /// of the sequence represents a line number, starting at `0`. The spans + /// in each line are sorted in ascending order. + by_line: Vec<Vec<ast::Span>>, + /// All error spans that occur over one or more lines. That is, the start + /// and end position of the span have different line numbers. The spans are + /// sorted in ascending order. + multi_line: Vec<ast::Span>, +} + +impl<'p> Spans<'p> { + /// Build a sequence of spans from a formatter. + fn from_formatter<'e, E: fmt::Display>( + fmter: &'p Formatter<'e, E>, + ) -> Spans<'p> { + let mut line_count = fmter.pattern.lines().count(); + // If the pattern ends with a `\n` literal, then our line count is + // off by one, since a span can occur immediately after the last `\n`, + // which is consider to be an additional line. 
+ if fmter.pattern.ends_with('\n') { + line_count += 1; + } + let line_number_width = + if line_count <= 1 { 0 } else { line_count.to_string().len() }; + let mut spans = Spans { + pattern: &fmter.pattern, + line_number_width: line_number_width, + by_line: vec![vec![]; line_count], + multi_line: vec![], + }; + spans.add(fmter.span.clone()); + if let Some(span) = fmter.aux_span { + spans.add(span.clone()); + } + spans + } + + /// Add the given span to this sequence, putting it in the right place. + fn add(&mut self, span: ast::Span) { + // This is grossly inefficient since we sort after each add, but right + // now, we only ever add two spans at most. + if span.is_one_line() { + let i = span.start.line - 1; // because lines are 1-indexed + self.by_line[i].push(span); + self.by_line[i].sort(); + } else { + self.multi_line.push(span); + self.multi_line.sort(); + } + } + + /// Notate the pattern string with carets (`^`) pointing at each span + /// location. This only applies to spans that occur within a single line. + fn notate(&self) -> String { + let mut notated = String::new(); + for (i, line) in self.pattern.lines().enumerate() { + if self.line_number_width > 0 { + notated.push_str(&self.left_pad_line_number(i + 1)); + notated.push_str(": "); + } else { + notated.push_str(" "); + } + notated.push_str(line); + notated.push('\n'); + if let Some(notes) = self.notate_line(i) { + notated.push_str(&notes); + notated.push('\n'); + } + } + notated + } + + /// Return notes for the line indexed at `i` (zero-based). If there are no + /// spans for the given line, then `None` is returned. Otherwise, an + /// appropriately space padded string with correctly positioned `^` is + /// returned, accounting for line numbers. 
+ fn notate_line(&self, i: usize) -> Option<String> { + let spans = &self.by_line[i]; + if spans.is_empty() { + return None; + } + let mut notes = String::new(); + for _ in 0..self.line_number_padding() { + notes.push(' '); + } + let mut pos = 0; + for span in spans { + for _ in pos..(span.start.column - 1) { + notes.push(' '); + pos += 1; + } + let note_len = span.end.column.saturating_sub(span.start.column); + for _ in 0..cmp::max(1, note_len) { + notes.push('^'); + pos += 1; + } + } + Some(notes) + } + + /// Left pad the given line number with spaces such that it is aligned with + /// other line numbers. + fn left_pad_line_number(&self, n: usize) -> String { + let n = n.to_string(); + let pad = self.line_number_width.checked_sub(n.len()).unwrap(); + let mut result = repeat_char(' ', pad); + result.push_str(&n); + result + } + + /// Return the line number padding beginning at the start of each line of + /// the pattern. + /// + /// If the pattern is only one line, then this returns a fixed padding + /// for visual indentation. + fn line_number_padding(&self) -> usize { + if self.line_number_width == 0 { + 4 + } else { + 2 + self.line_number_width + } + } +} + +fn repeat_char(c: char, count: usize) -> String { + ::std::iter::repeat(c).take(count).collect() +} + +#[cfg(test)] +mod tests { + use crate::ast::parse::Parser; + + fn assert_panic_message(pattern: &str, expected_msg: &str) -> () { + let result = Parser::new().parse(pattern); + match result { + Ok(_) => { + panic!("regex should not have parsed"); + } + Err(err) => { + assert_eq!(err.to_string(), expected_msg.trim()); + } + } + } + + // See: https://github.com/rust-lang/regex/issues/464 + #[test] + fn regression_464() { + let err = Parser::new().parse("a{\n").unwrap_err(); + // This test checks that the error formatter doesn't panic. 
+ assert!(!err.to_string().is_empty()); + } + + // See: https://github.com/rust-lang/regex/issues/545 + #[test] + fn repetition_quantifier_expects_a_valid_decimal() { + assert_panic_message( + r"\\u{[^}]*}", + r#" +regex parse error: + \\u{[^}]*} + ^ +error: repetition quantifier expects a valid decimal +"#, + ); + } +} diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs new file mode 100644 index 000000000..cfaa2cb45 --- /dev/null +++ b/vendor/regex-syntax/src/hir/interval.rs @@ -0,0 +1,520 @@ +use std::char; +use std::cmp; +use std::fmt::Debug; +use std::slice; +use std::u8; + +use crate::unicode; + +// This module contains an *internal* implementation of interval sets. +// +// The primary invariant that interval sets guards is canonical ordering. That +// is, every interval set contains an ordered sequence of intervals where +// no two intervals are overlapping or adjacent. While this invariant is +// occasionally broken within the implementation, it should be impossible for +// callers to observe it. +// +// Since case folding (as implemented below) breaks that invariant, we roll +// that into this API even though it is a little out of place in an otherwise +// generic interval set. (Hence the reason why the `unicode` module is imported +// here.) +// +// Some of the implementation complexity here is a result of me wanting to +// preserve the sequential representation without using additional memory. +// In many cases, we do use linear extra memory, but it is at most 2x and it +// is amortized. If we relaxed the memory requirements, this implementation +// could become much simpler. The extra memory is honestly probably OK, but +// character classes (especially of the Unicode variety) can become quite +// large, and it would be nice to keep regex compilation snappy even in debug +// builds. 
(In the past, I have been careless with this area of code and it has +// caused slow regex compilations in debug mode, so this isn't entirely +// unwarranted.) +// +// Tests on this are relegated to the public API of HIR in src/hir.rs. + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct IntervalSet<I> { + ranges: Vec<I>, +} + +impl<I: Interval> IntervalSet<I> { + /// Create a new set from a sequence of intervals. Each interval is + /// specified as a pair of bounds, where both bounds are inclusive. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> { + let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + set.canonicalize(); + set + } + + /// Add a new interval to this set. + pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); + } + + /// Return an iterator over all intervals in this set. + /// + /// The iterator yields intervals in ascending order. + pub fn iter(&self) -> IntervalSetIter<'_, I> { + IntervalSetIter(self.ranges.iter()) + } + + /// Return an immutable slice of intervals in this set. + /// + /// The sequence returned is in canonical ordering. + pub fn intervals(&self) -> &[I] { + &self.ranges + } + + /// Expand this interval set such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// This returns an error if the necessary case mapping data is not + /// available. 
+ pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + if let Err(err) = range.case_fold_simple(&mut self.ranges) { + self.canonicalize(); + return Err(err); + } + } + self.canonicalize(); + Ok(()) + } + + /// Union this set with the given set, in place. + pub fn union(&mut self, other: &IntervalSet<I>) { + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + } + + /// Intersect this set with the given set, in place. + pub fn intersect(&mut self, other: &IntervalSet<I>) { + if self.ranges.is_empty() { + return; + } + if other.ranges.is_empty() { + self.ranges.clear(); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the intersection to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + let mut ita = (0..drain_end).into_iter(); + let mut itb = (0..other.ranges.len()).into_iter(); + let mut a = ita.next().unwrap(); + let mut b = itb.next().unwrap(); + loop { + if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { + self.ranges.push(ab); + } + let (it, aorb) = + if self.ranges[a].upper() < other.ranges[b].upper() { + (&mut ita, &mut a) + } else { + (&mut itb, &mut b) + }; + match it.next() { + Some(v) => *aorb = v, + None => break, + } + } + self.ranges.drain(..drain_end); + } + + /// Subtract the given set from this set, in place. + pub fn difference(&mut self, other: &IntervalSet<I>) { + if self.ranges.is_empty() || other.ranges.is_empty() { + return; + } + + // This algorithm is (to me) surprisingly complex. A search of the + // interwebs indicate that this is a potentially interesting problem. + // Folks seem to suggest interval or segment trees, but I'd like to + // avoid the overhead (both runtime and conceptual) of that. 
+ // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // + // Remember, we can assume the canonical format invariant here, which + // says that all ranges are sorted, not overlapping and not adjacent in + // each class. + let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + + // This part is tricky and was non-obvious to me without looking + // at explicit examples (see the tests). The trickiness stems from + // two things: 1) subtracting a range from another range could + // yield two ranges and 2) after subtracting a range, it's possible + // that future ranges can have an impact. The loop below advances + // the `b` ranges until they can't possible impact the current + // range. + // + // For example, if our `a` range is `a-t` and our next three `b` + // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply + // subtraction three times before moving on to the next `a` range. 
+ let mut range = self.ranges[a]; + while b < other.ranges.len() + && !range.is_intersection_empty(&other.ranges[b]) + { + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; + } + (Some(range1), None) | (None, Some(range1)) => range1, + (Some(range1), Some(range2)) => { + self.ranges.push(range1); + range2 + } + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; + } + // Otherwise, the next `b` range might apply to the current + // `a` range. + b += 1; + } + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + } + self.ranges.drain(..drain_end); + } + + /// Compute the symmetric difference of the two sets, in place. + /// + /// This computes the symmetric difference of two interval sets. This + /// removes all elements in this set that are also in the given set, + /// but also adds all elements from the given set that aren't in this + /// set. That is, the set will contain all elements in either set, + /// but will not contain any elements that are in both sets. + pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) { + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); + } + + /// Negate this interval set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. 
+ pub fn negate(&mut self) { + if self.ranges.is_empty() { + let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); + self.ranges.push(I::create(min, max)); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // We do checked arithmetic below because of the canonical ordering + // invariant. + if self.ranges[0].lower() > I::Bound::min_value() { + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); + } + self.ranges.drain(..drain_end); + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. 
+ if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + let range = self.ranges[oldi]; + self.ranges.push(range); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// An iterator over intervals. +#[derive(Debug)] +pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>); + +impl<'a, I> Iterator for IntervalSetIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option<&'a I> { + self.0.next() + } +} + +pub trait Interval: + Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord +{ + type Bound: Bound; + + fn lower(&self) -> Self::Bound; + fn upper(&self) -> Self::Bound; + fn set_lower(&mut self, bound: Self::Bound); + fn set_upper(&mut self, bound: Self::Bound); + fn case_fold_simple( + &self, + intervals: &mut Vec<Self>, + ) -> Result<(), unicode::CaseFoldError>; + + /// Create a new interval. + fn create(lower: Self::Bound, upper: Self::Bound) -> Self { + let mut int = Self::default(); + if lower <= upper { + int.set_lower(lower); + int.set_upper(upper); + } else { + int.set_lower(upper); + int.set_upper(lower); + } + int + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &Self) -> Option<Self> { + if !self.is_contiguous(other) { + return None; + } + let lower = cmp::min(self.lower(), other.lower()); + let upper = cmp::max(self.upper(), other.upper()); + Some(Self::create(lower, upper)) + } + + /// Intersect this range with the given range and return the result. + /// + /// If the intersection is empty, then this returns `None`. 
+ fn intersect(&self, other: &Self) -> Option<Self> { + let lower = cmp::max(self.lower(), other.lower()); + let upper = cmp::min(self.upper(), other.upper()); + if lower <= upper { + Some(Self::create(lower, upper)) + } else { + None + } + } + + /// Subtract the given range from this range and return the resulting + /// ranges. + /// + /// If subtraction would result in an empty range, then no ranges are + /// returned. + fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) { + if self.is_subset(other) { + return (None, None); + } + if self.is_intersection_empty(other) { + return (Some(self.clone()), None); + } + let add_lower = other.lower() > self.lower(); + let add_upper = other.upper() < self.upper(); + // We know this because !self.is_subset(other) and the ranges have + // a non-empty intersection. + assert!(add_lower || add_upper); + let mut ret = (None, None); + if add_lower { + let upper = other.lower().decrement(); + ret.0 = Some(Self::create(self.lower(), upper)); + } + if add_upper { + let lower = other.upper().increment(); + let range = Self::create(lower, self.upper()); + if ret.0.is_none() { + ret.0 = Some(range); + } else { + ret.1 = Some(range); + } + } + ret + } + + /// Compute the symmetric difference the given range from this range. This + /// returns the union of the two ranges minus its intersection. + fn symmetric_difference( + &self, + other: &Self, + ) -> (Option<Self>, Option<Self>) { + let union = match self.union(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(union) => union, + }; + let intersection = match self.intersect(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(intersection) => intersection, + }; + union.difference(&intersection) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. 
+ fn is_contiguous(&self, other: &Self) -> bool { + let lower1 = self.lower().as_u32(); + let upper1 = self.upper().as_u32(); + let lower2 = other.lower().as_u32(); + let upper2 = other.upper().as_u32(); + cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + cmp::max(lower1, lower2) > cmp::min(upper1, upper2) + } + + /// Returns true if and only if this range is a subset of the other range. + fn is_subset(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + (lower2 <= lower1 && lower1 <= upper2) + && (lower2 <= upper1 && upper1 <= upper2) + } +} + +pub trait Bound: + Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord +{ + fn min_value() -> Self; + fn max_value() -> Self; + fn as_u32(self) -> u32; + fn increment(self) -> Self; + fn decrement(self) -> Self; +} + +impl Bound for u8 { + fn min_value() -> Self { + u8::MIN + } + fn max_value() -> Self { + u8::MAX + } + fn as_u32(self) -> u32 { + self as u32 + } + fn increment(self) -> Self { + self.checked_add(1).unwrap() + } + fn decrement(self) -> Self { + self.checked_sub(1).unwrap() + } +} + +impl Bound for char { + fn min_value() -> Self { + '\x00' + } + fn max_value() -> Self { + '\u{10FFFF}' + } + fn as_u32(self) -> u32 { + self as u32 + } + + fn increment(self) -> Self { + match self { + '\u{D7FF}' => '\u{E000}', + c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + } + } + + fn decrement(self) -> Self { + match self { + '\u{E000}' => '\u{D7FF}', + c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + } + } +} + +// Tests for interval sets are written in src/hir.rs against the public API. 
// ---------------------------------------------------------------------------
// File boundary (diff header preserved from the original listing):
//
// diff --git a/vendor/regex-syntax/src/hir/literal/mod.rs b/vendor/regex-syntax/src/hir/literal/mod.rs
// new file mode 100644
// index 000000000..1e66d2cc3
// --- /dev/null
// +++ b/vendor/regex-syntax/src/hir/literal/mod.rs
// @@ -0,0 +1,1690 @@
// ---------------------------------------------------------------------------
/*!
Provides routines for extracting literal prefixes and suffixes from an `Hir`.
*/

use std::cmp;
use std::fmt;
use std::iter;
use std::mem;
use std::ops;

use crate::hir::{self, Hir, HirKind};

/// A set of literal byte strings extracted from a regular expression.
///
/// Every member of the set is a `Literal`, which is represented by a
/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
/// said to be either *complete* or *cut*. A complete literal means that
/// it extends until the beginning (or end) of the regular expression. In
/// some circumstances, this can be used to indicate a match in the regular
/// expression.
///
/// A key aspect of literal extraction is knowing when to stop. It is not
/// feasible to blindly extract all literals from a regular expression, even if
/// there are finitely many. For example, the regular expression `[0-9]{10}`
/// has `10^10` distinct literals. For this reason, literal extraction is
/// bounded to some low number by default using heuristics, but the limits can
/// be tweaked.
///
/// **WARNING**: Literal extraction uses stack space proportional to the size
/// of the `Hir` expression. At some point, this drawback will be eliminated.
/// To protect yourself, set a reasonable
/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
/// This is done for you by default.
#[derive(Clone, Eq, PartialEq)]
pub struct Literals {
    // The members of the set.
    lits: Vec<Literal>,
    // Approximate size limit of the set, in bytes.
    limit_size: usize,
    // Size limit for character (or byte) classes.
    limit_class: usize,
}

/// A single member of a set of literals extracted from a regular expression.
///
/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
/// and `Vec` operations are available.
#[derive(Clone, Eq, Ord)]
pub struct Literal {
    v: Vec<u8>,
    cut: bool,
}

impl Literals {
    /// Returns a new empty set of literals using default limits.
    pub fn empty() -> Literals {
        Literals { limit_class: 10, limit_size: 250, lits: Vec::new() }
    }

    /// Returns a set of literal prefixes extracted from the given `Hir`.
    pub fn prefixes(expr: &Hir) -> Literals {
        let mut set = Literals::empty();
        set.union_prefixes(expr);
        set
    }

    /// Returns a set of literal suffixes extracted from the given `Hir`.
    pub fn suffixes(expr: &Hir) -> Literals {
        let mut set = Literals::empty();
        set.union_suffixes(expr);
        set
    }

    /// Get the approximate size limit (in bytes) of this set.
    pub fn limit_size(&self) -> usize {
        self.limit_size
    }

    /// Set the approximate size limit (in bytes) of this set.
    ///
    /// If extracting a literal would put the set over this limit, then
    /// extraction stops.
    ///
    /// The new limits will only apply to additions to this set. Existing
    /// members remain unchanged, even if the set exceeds the new limit.
    pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
        self.limit_size = size;
        self
    }

    /// Get the character class size limit for this set.
    pub fn limit_class(&self) -> usize {
        self.limit_class
    }

    /// Limits the size of character (or byte) classes considered.
    ///
    /// A value of `0` prevents all character classes from being considered.
    ///
    /// This limit also applies to case insensitive literals, since each
    /// character in the case insensitive literal is converted to a class, and
    /// then case folded.
    ///
    /// The new limits will only apply to additions to this set. Existing
    /// members remain unchanged, even if the set exceeds the new limit.
    pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
        self.limit_class = size;
        self
    }

    /// Returns the set of literals as a slice. Its order is unspecified.
+ pub fn literals(&self) -> &[Literal] { + &self.lits + } + + /// Returns the length of the smallest literal. + /// + /// Returns None is there are no literals in the set. + pub fn min_len(&self) -> Option<usize> { + let mut min = None; + for lit in &self.lits { + match min { + None => min = Some(lit.len()), + Some(m) if lit.len() < m => min = Some(lit.len()), + _ => {} + } + } + min + } + + /// Returns true if all members in this set are complete. + pub fn all_complete(&self) -> bool { + !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) + } + + /// Returns true if any member in this set is complete. + pub fn any_complete(&self) -> bool { + self.lits.iter().any(|lit| !lit.is_cut()) + } + + /// Returns true if this set contains an empty literal. + pub fn contains_empty(&self) -> bool { + self.lits.iter().any(|lit| lit.is_empty()) + } + + /// Returns true if this set is empty or if all of its members is empty. + pub fn is_empty(&self) -> bool { + self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) + } + + /// Returns a new empty set of literals using this set's limits. + pub fn to_empty(&self) -> Literals { + let mut lits = Literals::empty(); + lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); + lits + } + + /// Returns the longest common prefix of all members in this set. + pub fn longest_common_prefix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), + ); + } + &self.lits[0][..len] + } + + /// Returns the longest common suffix of all members in this set. + pub fn longest_common_suffix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] 
{ + len = cmp::min( + len, + lit.iter() + .rev() + .zip(lit0.iter().rev()) + .take_while(|&(a, b)| a == b) + .count(), + ); + } + &self.lits[0][self.lits[0].len() - len..] + } + + /// Returns a new set of literals with the given number of bytes trimmed + /// from the suffix of each literal. + /// + /// If any literal would be cut out completely by trimming, then None is + /// returned. + /// + /// Any duplicates that are created as a result of this transformation are + /// removed. + pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { + if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { + return None; + } + let mut new = self.to_empty(); + for mut lit in self.lits.iter().cloned() { + let new_len = lit.len() - num_bytes; + lit.truncate(new_len); + lit.cut(); + new.lits.push(lit); + } + new.lits.sort(); + new.lits.dedup(); + Some(new) + } + + /// Returns a new set of prefixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same starting position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_prefixes(&self) -> Literals { + if self.lits.is_empty() { + return self.to_empty(); + } + let mut old: Vec<Literal> = self.lits.iter().cloned().collect(); + let mut new = self.to_empty(); + 'OUTER: while let Some(mut candidate) = old.pop() { + if candidate.is_empty() { + continue; + } + if new.lits.is_empty() { + new.lits.push(candidate); + continue; + } + for lit2 in &mut new.lits { + if lit2.is_empty() { + continue; + } + if &candidate == lit2 { + // If the literal is already in the set, then we can + // just drop it. But make sure that cut literals are + // infectious! 
+ candidate.cut = candidate.cut || lit2.cut; + lit2.cut = candidate.cut; + continue 'OUTER; + } + if candidate.len() < lit2.len() { + if let Some(i) = position(&candidate, &lit2) { + candidate.cut(); + let mut lit3 = lit2.clone(); + lit3.truncate(i); + lit3.cut(); + old.push(lit3); + lit2.clear(); + } + } else { + if let Some(i) = position(&lit2, &candidate) { + lit2.cut(); + let mut new_candidate = candidate.clone(); + new_candidate.truncate(i); + new_candidate.cut(); + old.push(new_candidate); + candidate.clear(); + } + } + // Oops, the candidate is already represented in the set. + if candidate.is_empty() { + continue 'OUTER; + } + } + new.lits.push(candidate); + } + new.lits.retain(|lit| !lit.is_empty()); + new.lits.sort(); + new.lits.dedup(); + new + } + + /// Returns a new set of suffixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same ending position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_suffixes(&self) -> Literals { + // This is a touch wasteful... + let mut lits = self.clone(); + lits.reverse(); + let mut unamb = lits.unambiguous_prefixes(); + unamb.reverse(); + unamb + } + + /// Unions the prefixes from the given expression to this set. + /// + /// If prefixes could not be added (for example, this set would exceed its + /// size limits or the set of prefixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the beginning of `expr` to the + /// end of `expr`. 
+ pub fn union_prefixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + prefixes(expr, &mut lits); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions the suffixes from the given expression to this set. + /// + /// If suffixes could not be added (for example, this set would exceed its + /// size limits or the set of suffixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the end of `expr` to the + /// beginning of `expr`. + pub fn union_suffixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + suffixes(expr, &mut lits); + lits.reverse(); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions this set with another set. + /// + /// If the union would cause the set to exceed its limits, then the union + /// is skipped and it returns false. Otherwise, if the union succeeds, it + /// returns true. + pub fn union(&mut self, lits: Literals) -> bool { + if self.num_bytes() + lits.num_bytes() > self.limit_size { + return false; + } + if lits.is_empty() { + self.lits.push(Literal::empty()); + } else { + self.lits.extend(lits.lits); + } + true + } + + /// Extends this set with another set. + /// + /// The set of literals is extended via a cross product. + /// + /// If a cross product would cause this set to exceed its limits, then the + /// cross product is skipped and it returns false. Otherwise, if the cross + /// product succeeds, it returns true. + pub fn cross_product(&mut self, lits: &Literals) -> bool { + if lits.is_empty() { + return true; + } + // Check that we make sure we stay in our limits. 
+ let mut size_after; + if self.is_empty() || !self.any_complete() { + size_after = self.num_bytes(); + for lits_lit in lits.literals() { + size_after += lits_lit.len(); + } + } else { + size_after = self.lits.iter().fold(0, |accum, lit| { + accum + if lit.is_cut() { lit.len() } else { 0 } + }); + for lits_lit in lits.literals() { + for self_lit in self.literals() { + if !self_lit.is_cut() { + size_after += self_lit.len() + lits_lit.len(); + } + } + } + } + if size_after > self.limit_size { + return false; + } + + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for lits_lit in lits.literals() { + for mut self_lit in base.clone() { + self_lit.extend(&**lits_lit); + self_lit.cut = lits_lit.cut; + self.lits.push(self_lit); + } + } + true + } + + /// Extends each literal in this set with the bytes given. + /// + /// If the set is empty, then the given literal is added to the set. + /// + /// If adding any number of bytes to all members of this set causes a limit + /// to be exceeded, then no bytes are added and false is returned. If a + /// prefix of `bytes` can be fit into this set, then it is used and all + /// resulting literals are cut. + pub fn cross_add(&mut self, bytes: &[u8]) -> bool { + // N.B. This could be implemented by simply calling cross_product with + // a literal set containing just `bytes`, but we can be smarter about + // taking shorter prefixes of `bytes` if they'll fit. 
+ if bytes.is_empty() { + return true; + } + if self.lits.is_empty() { + let i = cmp::min(self.limit_size, bytes.len()); + self.lits.push(Literal::new(bytes[..i].to_owned())); + self.lits[0].cut = i < bytes.len(); + return !self.lits[0].is_cut(); + } + let size = self.num_bytes(); + if size + self.lits.len() >= self.limit_size { + return false; + } + let mut i = 1; + while size + (i * self.lits.len()) <= self.limit_size + && i < bytes.len() + { + i += 1; + } + for lit in &mut self.lits { + if !lit.is_cut() { + lit.extend(&bytes[..i]); + if i < bytes.len() { + lit.cut(); + } + } + } + true + } + + /// Adds the given literal to this set. + /// + /// Returns false if adding this literal would cause the class to be too + /// big. + pub fn add(&mut self, lit: Literal) -> bool { + if self.num_bytes() + lit.len() > self.limit_size { + return false; + } + self.lits.push(lit); + true + } + + /// Extends each literal in this set with the character class given. + /// + /// Returns false if the character class was too big to add. + pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, false) + } + + /// Extends each literal in this set with the character class given, + /// writing the bytes of each character in reverse. + /// + /// Returns false if the character class was too big to add. 
+ fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, true) + } + + fn _add_char_class( + &mut self, + cls: &hir::ClassUnicode, + reverse: bool, + ) -> bool { + use std::char; + + if self.class_exceeds_limits(cls_char_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for c in (s..e).filter_map(char::from_u32) { + for mut lit in base.clone() { + let mut bytes = c.to_string().into_bytes(); + if reverse { + bytes.reverse(); + } + lit.extend(&bytes); + self.lits.push(lit); + } + } + } + true + } + + /// Extends each literal in this set with the byte class given. + /// + /// Returns false if the byte class was too big to add. + pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { + if self.class_exceeds_limits(cls_byte_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for b in (s..e).map(|b| b as u8) { + for mut lit in base.clone() { + lit.push(b); + self.lits.push(lit); + } + } + } + true + } + + /// Cuts every member of this set. When a member is cut, it can never + /// be extended. + pub fn cut(&mut self) { + for lit in &mut self.lits { + lit.cut(); + } + } + + /// Reverses all members in place. + pub fn reverse(&mut self) { + for lit in &mut self.lits { + lit.reverse(); + } + } + + /// Clears this set of all members. + pub fn clear(&mut self) { + self.lits.clear(); + } + + /// Pops all complete literals out of this set. + fn remove_complete(&mut self) -> Vec<Literal> { + let mut base = vec![]; + for lit in mem::replace(&mut self.lits, vec![]) { + if lit.is_cut() { + self.lits.push(lit); + } else { + base.push(lit); + } + } + base + } + + /// Returns the total number of bytes in this set. 
fn num_bytes(&self) -> usize {
    // Every member counts, whether it is cut or complete.
    self.lits.iter().map(|lit| lit.len()).sum()
}

/// Reports whether crossing this set with a class of `size` items would
/// push it past one of its limits.
///
/// `size` is the number of items in the class.
fn class_exceeds_limits(&self, size: usize) -> bool {
    if size > self.limit_class {
        return true;
    }
    // This is only an estimate: a codepoint in a char class can encode to
    // 1-4 bytes, but each item is counted as a single byte here.
    let projected = if self.lits.is_empty() {
        size
    } else {
        // A cut literal is never extended again, so it contributes nothing
        // to the projected size.
        self.lits
            .iter()
            .filter(|lit| !lit.is_cut())
            .map(|lit| (lit.len() + 1) * size)
            .sum::<usize>()
    };
    projected > self.limit_size
}
}

/// Folds the literals that `expr` contributes, read front to back, into
/// `lits`.
///
/// Literals and classes are crossed into the set, groups are looked
/// through, repetitions and alternations are handed to the shared helpers,
/// and concatenations are walked left to right until a sub-expression
/// cannot be extended any further. Any other kind of expression cuts every
/// member of `lits`.
fn prefixes(expr: &Hir, lits: &mut Literals) {
    match expr.kind() {
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            let mut utf8 = [0; 4];
            lits.cross_add(c.encode_utf8(&mut utf8).as_bytes());
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            lits.cross_add(&[*b]);
        }
        HirKind::Class(hir::Class::Unicode(cls)) => {
            if !lits.add_char_class(cls) {
                lits.cut();
            }
        }
        HirKind::Class(hir::Class::Bytes(cls)) => {
            if !lits.add_byte_class(cls) {
                lits.cut();
            }
        }
        HirKind::Group(group) => {
            prefixes(&group.hir, lits);
        }
        HirKind::Repetition(rep) => match &rep.kind {
            hir::RepetitionKind::ZeroOrOne => {
                repeat_zero_or_one_literals(&rep.hir, lits, prefixes);
            }
            hir::RepetitionKind::ZeroOrMore => {
                repeat_zero_or_more_literals(&rep.hir, lits, prefixes);
            }
            hir::RepetitionKind::OneOrMore => {
                repeat_one_or_more_literals(&rep.hir, lits, prefixes);
            }
            hir::RepetitionKind::Range(range) => {
                let (min, max) = match *range {
                    hir::RepetitionRange::Exactly(n) => (n, Some(n)),
                    hir::RepetitionRange::AtLeast(n) => (n, None),
                    hir::RepetitionRange::Bounded(lo, hi) => (lo, Some(hi)),
                };
                repeat_range_literals(
                    &rep.hir, min, max, rep.greedy, lits, prefixes,
                )
            }
        },
        HirKind::Concat(es) => match es.len() {
            0 => {}
            1 => prefixes(&es[0], lits),
            _ => {
                for e in es {
                    if let HirKind::Anchor(hir::Anchor::StartText) = e.kind() {
                        // A start-of-text anchor is only transparent when
                        // nothing has been collected yet; otherwise stop.
                        if !lits.is_empty() {
                            lits.cut();
                            break;
                        }
                        lits.add(Literal::empty());
                        continue;
                    }
                    let mut piece = lits.to_empty();
                    prefixes(e, &mut piece);
                    if !lits.cross_product(&piece) || !piece.any_complete() {
                        // This expression yielded no literal that could
                        // still be extended, so stop here. Because the walk
                        // is short-circuited, every member must be frozen.
                        lits.cut();
                        break;
                    }
                }
            }
        },
        HirKind::Alternation(es) => {
            alternate_literals(es, lits, prefixes);
        }
        _ => lits.cut(),
    }
}

/// Folds the literals that `expr` contributes, read back to front, into
/// `lits`.
///
/// This mirrors `prefixes`: the UTF-8 bytes of each character are added in
/// reverse order, and concatenations are walked right to left.
fn suffixes(expr: &Hir, lits: &mut Literals) {
    match expr.kind() {
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            let mut utf8 = [0u8; 4];
            let len = c.encode_utf8(&mut utf8).len();
            utf8[..len].reverse();
            lits.cross_add(&utf8[..len]);
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            lits.cross_add(&[*b]);
        }
        HirKind::Class(hir::Class::Unicode(cls)) => {
            if !lits.add_char_class_reverse(cls) {
                lits.cut();
            }
        }
        HirKind::Class(hir::Class::Bytes(cls)) => {
            if !lits.add_byte_class(cls) {
                lits.cut();
            }
        }
        HirKind::Group(group) => {
            suffixes(&group.hir, lits);
        }
        HirKind::Repetition(rep) => match &rep.kind {
            hir::RepetitionKind::ZeroOrOne => {
                repeat_zero_or_one_literals(&rep.hir, lits, suffixes);
            }
            hir::RepetitionKind::ZeroOrMore => {
                repeat_zero_or_more_literals(&rep.hir, lits, suffixes);
            }
            hir::RepetitionKind::OneOrMore => {
                repeat_one_or_more_literals(&rep.hir, lits, suffixes);
            }
            hir::RepetitionKind::Range(range) => {
                let (min, max) = match *range {
                    hir::RepetitionRange::Exactly(n) => (n, Some(n)),
                    hir::RepetitionRange::AtLeast(n) => (n, None),
                    hir::RepetitionRange::Bounded(lo, hi) => (lo, Some(hi)),
                };
                repeat_range_literals(
                    &rep.hir, min, max, rep.greedy, lits, suffixes,
                )
            }
        },
        HirKind::Concat(es) => match es.len() {
            0 => {}
            1 => suffixes(&es[0], lits),
            _ => {
                for e in es.iter().rev() {
                    if let HirKind::Anchor(hir::Anchor::EndText) = e.kind() {
                        // An end-of-text anchor is only transparent when
                        // nothing has been collected yet; otherwise stop.
                        if !lits.is_empty() {
                            lits.cut();
                            break;
                        }
                        lits.add(Literal::empty());
                        continue;
                    }
                    let mut piece = lits.to_empty();
                    suffixes(e, &mut piece);
                    if !lits.cross_product(&piece) || !piece.any_complete() {
                        // This expression yielded no literal that could
                        // still be extended, so stop here. Because the walk
                        // is short-circuited, every member must be frozen.
                        lits.cut();
                        break;
                    }
                }
            }
        },
        HirKind::Alternation(es) => {
            alternate_literals(es, lits, suffixes);
        }
        _ => lits.cut(),
    }
}

/// Handles `e?` by running `f` on a freshly built greedy `e*` instead.
fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    lits: &mut Literals,
    mut f: F,
) {
    let star = Hir::repetition(hir::Repetition {
        kind: hir::RepetitionKind::ZeroOrMore,
        // FIXME: Our literal extraction doesn't care about greediness.
        // Which is partially why we're treating 'e?' as 'e*'. Namely,
        // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
        // [Complete(a), Complete(ab)] because of the non-greediness.
        greedy: true,
        hir: Box::new(e.clone()),
    });
    f(&star, lits);
}

/// Handles `e*`.
///
/// The literals of one occurrence of `e` are collected under half of the
/// size limit of `lits` and crossed onto a copy of `lits`. That copy is then
/// cut, given an empty literal, and unioned back into `lits`. If `e` yields
/// nothing, or the cross product or the union fails, `lits` is cut instead.
fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    lits: &mut Literals,
    mut f: F,
) {
    let mut with_one = lits.clone();
    let mut once = lits.to_empty();
    once.set_limit_size(lits.limit_size() / 2);
    f(e, &mut once);

    if once.is_empty() || !with_one.cross_product(&once) {
        lits.cut();
        return;
    }
    with_one.cut();
    with_one.add(Literal::empty());
    if !lits.union(with_one) {
        lits.cut();
    }
}

/// Handles `e+`: a single occurrence of `e` is collected and then every
/// member of `lits` is cut.
fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    lits: &mut Literals,
    mut f: F,
) {
    f(e, lits);
    lits.cut();
}

/// Handles a counted repetition of `e` with at least `min` and, when `max`
/// is set, at most `max` occurrences.
fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    min: u32,
    max: Option<u32>,
    greedy: bool,
    lits: &mut Literals,
    mut f: F,
) {
    if min == 0 {
        // This is a bit conservative. If `max` is set, then we could
        // treat this as a finite set of alternations. For now, we
        // just treat it as `e*`.
        let star = Hir::repetition(hir::Repetition {
            kind: hir::RepetitionKind::ZeroOrMore,
            greedy,
            hir: Box::new(e.clone()),
        });
        f(&star, lits);
        return;
    }

    // From here on `min` is at least one. Unroll `e` `min` times, capped by
    // the size limit so the concatenation cannot grow without bound.
    let wanted = min as usize;
    let n = cmp::min(lits.limit_size, wanted);
    f(&Hir::concat(iter::repeat(e.clone()).take(n).collect()), lits);
    if n < wanted || lits.contains_empty() {
        lits.cut();
    }
    // More occurrences than `min` may follow, so nothing is complete.
    if max.map_or(true, |max| min < max) {
        lits.cut();
    }
}

/// Handles an alternation: the literals of every branch are collected, each
/// under a fifth of the size limit of `lits`, unioned together, and then
/// crossed onto `lits`.
fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
    es: &[Hir],
    lits: &mut Literals,
    mut f: F,
) {
    let mut merged = lits.to_empty();
    for e in es {
        let mut branch = lits.to_empty();
        branch.set_limit_size(lits.limit_size() / 5);
        f(e, &mut branch);
        if branch.is_empty() || !merged.union(branch) {
            // If we couldn't find literals for *any* of the alternates,
            // then the entire alternation has to be thrown away and any
            // existing members must be frozen. Similarly, if the union
            // couldn't complete, stop and freeze.
            lits.cut();
            return;
        }
    }
    if !lits.cross_product(&merged) {
        lits.cut();
    }
}

impl fmt::Debug for Literals {
    /// Prints the members of the set together with both limits.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut out = f.debug_struct("Literals");
        out.field("lits", &self.lits);
        out.field("limit_size", &self.limit_size);
        out.field("limit_class", &self.limit_class);
        out.finish()
    }
}

impl Literal {
    /// Returns a new complete literal with the bytes given.
    pub fn new(bytes: Vec<u8>) -> Literal {
        Literal { v: bytes, cut: false }
    }

    /// Returns a new complete empty literal.
    pub fn empty() -> Literal {
        Literal::new(Vec::new())
    }

    /// Returns true if this literal was "cut."
    pub fn is_cut(&self) -> bool {
        self.cut
    }

    /// Cuts this literal.
+ pub fn cut(&mut self) { + self.cut = true; + } +} + +impl PartialEq for Literal { + fn eq(&self, other: &Literal) -> bool { + self.v == other.v + } +} + +impl PartialOrd for Literal { + fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> { + self.v.partial_cmp(&other.v) + } +} + +impl fmt::Debug for Literal { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", escape_unicode(&self.v)) + } else { + write!(f, "Complete({})", escape_unicode(&self.v)) + } + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { + &self.v + } +} + +impl ops::Deref for Literal { + type Target = Vec<u8>; + fn deref(&self) -> &Vec<u8> { + &self.v + } +} + +impl ops::DerefMut for Literal { + fn deref_mut(&mut self) -> &mut Vec<u8> { + &mut self.v + } +} + +fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { + let mut i = 0; + while haystack.len() >= needle.len() { + if needle == &haystack[..needle.len()] { + return Some(i); + } + i += 1; + haystack = &haystack[1..]; + } + None +} + +fn escape_unicode(bytes: &[u8]) -> String { + let show = match ::std::str::from_utf8(bytes) { + Ok(v) => v.to_string(), + Err(_) => escape_bytes(bytes), + }; + let mut space_escaped = String::new(); + for c in show.chars() { + if c.is_whitespace() { + let escaped = if c as u32 <= 0x7F { + escape_byte(c as u8) + } else { + if c as u32 <= 0xFFFF { + format!(r"\u{{{:04x}}}", c as u32) + } else { + format!(r"\U{{{:08x}}}", c as u32) + } + }; + space_escaped.push_str(&escaped); + } else { + space_escaped.push(c); + } + } + space_escaped +} + +fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s +} + +fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec<u8> = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() +} + +fn cls_char_count(cls: &hir::ClassUnicode) -> usize { + 
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() + as usize +} + +fn cls_byte_count(cls: &hir::ClassBytes) -> usize { + cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() + as usize +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use super::{escape_bytes, Literal, Literals}; + use crate::hir::Hir; + use crate::ParserBuilder; + + // To make test failures easier to read. + #[derive(Debug, Eq, PartialEq)] + struct Bytes(Vec<ULiteral>); + #[derive(Debug, Eq, PartialEq)] + struct Unicode(Vec<ULiteral>); + + fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> { + let mut ulits = vec![]; + for blit in blits { + ulits + .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); + } + ulits + } + + fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals { + Literals { + lits: it.into_iter().collect(), + limit_size: 0, + limit_class: 0, + } + } + + // Needs to be pub for 1.3? + #[derive(Clone, Eq, PartialEq)] + pub struct ULiteral { + v: String, + cut: bool, + } + + impl ULiteral { + fn is_cut(&self) -> bool { + self.cut + } + } + + impl fmt::Debug for ULiteral { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", self.v) + } else { + write!(f, "Complete({})", self.v) + } + } + } + + impl PartialEq<Literal> for ULiteral { + fn eq(&self, other: &Literal) -> bool { + self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() + } + } + + impl PartialEq<ULiteral> for Literal { + fn eq(&self, other: &ULiteral) -> bool { + &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() + } + } + + #[allow(non_snake_case)] + fn C(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: true } + } + #[allow(non_snake_case)] + fn M(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: false } + } + + fn prefixes(lits: &mut Literals, expr: &Hir) { + lits.union_prefixes(expr); + } + + fn suffixes(lits: &mut Literals, expr: &Hir) { + 
lits.union_suffixes(expr); + } + + macro_rules! assert_lit_eq { + ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ + let expected: Vec<ULiteral> = vec![$($expected_lit),*]; + let lits = $got_lits; + assert_eq!( + $which(expected.clone()), + $which(escape_lits(lits.literals()))); + assert_eq!( + !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), + lits.all_complete()); + assert_eq!( + expected.iter().any(|l| !l.is_cut()), + lits.any_complete()); + }}; + } + + macro_rules! test_lit { + ($name:ident, $which:ident, $re:expr) => { + test_lit!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // ************************************************************************ + // Tests for prefix literal extraction. + // ************************************************************************ + + // Elementary tests. + test_lit!(pfx_one_lit1, prefixes, "a", M("a")); + test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); + test_lit!(pfx_one_lit3, prefixes, "(?u)â", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] + test_lit!(pfx_one_lit4, prefixes, "(?ui)â", M("\\xe2\\x98\\x83")); + test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); + test_lit!( + pfx_class2, + prefixes, + "(?u)[ââ
]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x98\\x83") + ); + #[cfg(feature = "unicode-case")] + test_lit!( + pfx_class3, + prefixes, + "(?ui)[ââ
]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83") + ); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); + test_lit!( + pfx_one_lit_casei2, + prefixes, + "(?i-u)abc", + M("ABC"), + M("aBC"), + M("AbC"), + M("abC"), + M("ABc"), + M("aBc"), + M("Abc"), + M("abc") + ); + test_lit!(pfx_group1, prefixes, "(a)", M("a")); + test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); + test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); + // FIXME: This should return [M("a"), M("ab")] because of the non-greedy + // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get + // a cut literal. + test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); + test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); + test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); + test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); + test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); + test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); + test_lit!(pfx_rep_range1, prefixes, "a{0}"); + test_lit!(pfx_rep_range2, prefixes, "a{0,}"); + test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); + test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); + test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); + test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); + test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. 
+ test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); + test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); + test_lit!( + pfx_cat3, + prefixes, + "(?i-u)[ab]z", + M("AZ"), + M("BZ"), + M("aZ"), + M("bZ"), + M("Az"), + M("Bz"), + M("az"), + M("bz") + ); + test_lit!( + pfx_cat4, + prefixes, + "[ab][yz]", + M("ay"), + M("by"), + M("az"), + M("bz") + ); + test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); + test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); + test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); + test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); + test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); + test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); + test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); + test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); + test_lit!(pfx_cat14, prefixes, "a^", C("a")); + test_lit!(pfx_cat15, prefixes, "$a"); + test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); + test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); + test_lit!(pfx_cat19, prefixes, "a.z", C("a")); + + // Test regexes with alternations. + test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); + test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(pfx_alt4, prefixes, "a|b*"); + test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); + test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); + test_lit!( + pfx_alt7, + prefixes, + "(a|b)*c|(a|ab)*c", + C("a"), + C("b"), + M("c"), + C("a"), + C("ab"), + M("c") + ); + test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); + + // Test regexes with empty assertions. 
+ test_lit!(pfx_empty1, prefixes, "^a", M("a")); + test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); + test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); + test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); + + // Make sure some curious regexes have no prefixes. + test_lit!(pfx_nothing1, prefixes, "."); + test_lit!(pfx_nothing2, prefixes, "(?s)."); + test_lit!(pfx_nothing3, prefixes, "^"); + test_lit!(pfx_nothing4, prefixes, "$"); + test_lit!(pfx_nothing6, prefixes, "(?m)$"); + test_lit!(pfx_nothing7, prefixes, r"\b"); + test_lit!(pfx_nothing8, prefixes, r"\B"); + + // Test a few regexes that defeat any prefix literal detection. + test_lit!(pfx_defeated1, prefixes, ".a"); + test_lit!(pfx_defeated2, prefixes, "(?s).a"); + test_lit!(pfx_defeated3, prefixes, "a*b*c*"); + test_lit!(pfx_defeated4, prefixes, "a|."); + test_lit!(pfx_defeated5, prefixes, ".|a"); + test_lit!(pfx_defeated6, prefixes, "a|^"); + test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); + test_lit!(pfx_defeated8, prefixes, "$a"); + test_lit!(pfx_defeated9, prefixes, "(?m)$a"); + test_lit!(pfx_defeated10, prefixes, r"\ba"); + test_lit!(pfx_defeated11, prefixes, r"\Ba"); + test_lit!(pfx_defeated12, prefixes, "^*a"); + test_lit!(pfx_defeated13, prefixes, "^+a"); + + test_lit!( + pfx_crazy1, + prefixes, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + C("Mo\\'"), + C("Mu\\'"), + C("Moam"), + C("Muam") + ); + + // ************************************************************************ + // Tests for quiting prefix literal search. + // ************************************************************************ + + macro_rules! 
test_exhausted { + ($name:ident, $which:ident, $re:expr) => { + test_exhausted!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); + test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); + test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); + test_exhausted!( + pfx_exhausted4, + prefixes, + "(?i-u)foobar", + C("FO"), + C("fO"), + C("Fo"), + C("fo") + ); + test_exhausted!( + pfx_exhausted5, + prefixes, + "(?:ab){100}", + C("abababababababababab") + ); + test_exhausted!( + pfx_exhausted6, + prefixes, + "(?:(?:ab){100})*cd", + C("ababababab"), + M("cd") + ); + test_exhausted!( + pfx_exhausted7, + prefixes, + "z(?:(?:ab){100})*cd", + C("zababababab"), + M("zcd") + ); + test_exhausted!( + pfx_exhausted8, + prefixes, + "aaaaaaaaaaaaaaaaaaaaz", + C("aaaaaaaaaaaaaaaaaaaa") + ); + + // ************************************************************************ + // Tests for suffix literal extraction. + // ************************************************************************ + + // Elementary tests. 
+ test_lit!(sfx_one_lit1, suffixes, "a", M("a")); + test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); + test_lit!(sfx_one_lit3, suffixes, "(?u)â", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] + test_lit!(sfx_one_lit4, suffixes, "(?ui)â", M("\\xe2\\x98\\x83")); + test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); + test_lit!( + sfx_class2, + suffixes, + "(?u)[ââ
]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x98\\x83") + ); + #[cfg(feature = "unicode-case")] + test_lit!( + sfx_class3, + suffixes, + "(?ui)[ââ
]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83") + ); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); + test_lit!( + sfx_one_lit_casei2, + suffixes, + "(?i-u)abc", + M("ABC"), + M("ABc"), + M("AbC"), + M("Abc"), + M("aBC"), + M("aBc"), + M("abC"), + M("abc") + ); + test_lit!(sfx_group1, suffixes, "(a)", M("a")); + test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); + test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); + test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); + test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); + test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); + test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); + test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); + test_lit!(sfx_rep_range1, suffixes, "a{0}"); + test_lit!(sfx_rep_range2, suffixes, "a{0,}"); + test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); + test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); + test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); + test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); + test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. 
+ test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); + test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); + test_lit!( + sfx_cat3, + suffixes, + "(?i-u)[ab]z", + M("AZ"), + M("Az"), + M("BZ"), + M("Bz"), + M("aZ"), + M("az"), + M("bZ"), + M("bz") + ); + test_lit!( + sfx_cat4, + suffixes, + "[ab][yz]", + M("ay"), + M("az"), + M("by"), + M("bz") + ); + test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); + test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); + test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); + test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); + test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); + test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); + test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat12, suffixes, "ab+", C("b")); + test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); + test_lit!(sfx_cat14, suffixes, "a^"); + test_lit!(sfx_cat15, suffixes, "$a", C("a")); + test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); + test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); + test_lit!(sfx_cat19, suffixes, "a.z", C("z")); + + // Test regexes with alternations. + test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); + test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(sfx_alt4, suffixes, "a|b*"); + test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); + test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); + test_lit!( + sfx_alt7, + suffixes, + "(a|b)*c|(a|ab)*c", + C("ac"), + C("bc"), + M("c"), + C("ac"), + C("abc"), + M("c") + ); + test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(sfx_empty1, suffixes, "a$", M("a")); + test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); + + // Make sure some curious regexes have no suffixes. 
+ test_lit!(sfx_nothing1, suffixes, "."); + test_lit!(sfx_nothing2, suffixes, "(?s)."); + test_lit!(sfx_nothing3, suffixes, "^"); + test_lit!(sfx_nothing4, suffixes, "$"); + test_lit!(sfx_nothing6, suffixes, "(?m)$"); + test_lit!(sfx_nothing7, suffixes, r"\b"); + test_lit!(sfx_nothing8, suffixes, r"\B"); + + // Test a few regexes that defeat any suffix literal detection. + test_lit!(sfx_defeated1, suffixes, "a."); + test_lit!(sfx_defeated2, suffixes, "(?s)a."); + test_lit!(sfx_defeated3, suffixes, "a*b*c*"); + test_lit!(sfx_defeated4, suffixes, "a|."); + test_lit!(sfx_defeated5, suffixes, ".|a"); + test_lit!(sfx_defeated6, suffixes, "a|^"); + test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); + test_lit!(sfx_defeated8, suffixes, "a^"); + test_lit!(sfx_defeated9, suffixes, "(?m)a$"); + test_lit!(sfx_defeated10, suffixes, r"a\b"); + test_lit!(sfx_defeated11, suffixes, r"a\B"); + test_lit!(sfx_defeated12, suffixes, "a^*"); + test_lit!(sfx_defeated13, suffixes, "a^+"); + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); + test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); + test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); + test_exhausted!( + sfx_exhausted4, + suffixes, + "(?i-u)foobar", + C("AR"), + C("Ar"), + C("aR"), + C("ar") + ); + test_exhausted!( + sfx_exhausted5, + suffixes, + "(?:ab){100}", + C("abababababababababab") + ); + test_exhausted!( + sfx_exhausted6, + suffixes, + "cd(?:(?:ab){100})*", + C("ababababab"), + M("cd") + ); + test_exhausted!( + sfx_exhausted7, + suffixes, + "cd(?:(?:ab){100})*z", + C("abababababz"), + M("cdz") + ); + test_exhausted!( + sfx_exhausted8, + suffixes, + "zaaaaaaaaaaaaaaaaaaaa", + C("aaaaaaaaaaaaaaaaaaaa") + ); + + // ************************************************************************ + // Tests for generating unambiguous literal sets. 
+ // ************************************************************************ + + macro_rules! test_unamb { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec<Literal> = $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.unambiguous_prefixes(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); + test_unamb!( + unambiguous2, + vec![M("zaaaaaa"), M("aa")], + vec![C("aa"), C("z")] + ); + test_unamb!( + unambiguous3, + vec![M("Sherlock"), M("Watson")], + vec![M("Sherlock"), M("Watson")] + ); + test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); + test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); + test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); + test_unamb!( + unambiguous9, + vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], + vec![C("a"), C("b"), C("c")] + ); + test_unamb!( + unambiguous10, + vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], + vec![C("Mo"), C("Mu")] + ); + test_unamb!( + unambiguous11, + vec![M("zazb"), M("azb")], + vec![C("a"), C("z")] + ); + test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); + test_unamb!( + unambiguous13, + vec![M("ABCX"), M("CDAX"), M("BCX")], + vec![C("A"), C("BCX"), C("CD")] + ); + test_unamb!( + unambiguous14, + vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], + vec![M("DSX"), C("I"), C("MGX"), C("MV")] + ); + test_unamb!( + unambiguous15, + vec![M("IMG_"), M("MG_"), M("CIMG")], + vec![C("C"), C("I"), C("MG_")] + ); + + // ************************************************************************ + // Tests for suffix trimming. 
+ // ************************************************************************ + macro_rules! test_trim { + ($name:ident, $trim:expr, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec<Literal> = $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.trim_suffix($trim).unwrap(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); + test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); + test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); + test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); + + // ************************************************************************ + // Tests for longest common prefix. + // ************************************************************************ + + macro_rules! test_lcp { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec<Literal> = $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_prefix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcp!(lcp1, vec!["a"], "a"); + test_lcp!(lcp2, vec![], ""); + test_lcp!(lcp3, vec!["a", "b"], ""); + test_lcp!(lcp4, vec!["ab", "ab"], "ab"); + test_lcp!(lcp5, vec!["ab", "a"], "a"); + test_lcp!(lcp6, vec!["a", "ab"], "a"); + test_lcp!(lcp7, vec!["ab", "b"], ""); + test_lcp!(lcp8, vec!["b", "ab"], ""); + test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); + test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); + test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); + test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); + + // ************************************************************************ + // Tests for longest common suffix. 
+ // ************************************************************************ + + macro_rules! test_lcs { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec<Literal> = $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_suffix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcs!(lcs1, vec!["a"], "a"); + test_lcs!(lcs2, vec![], ""); + test_lcs!(lcs3, vec!["a", "b"], ""); + test_lcs!(lcs4, vec!["ab", "ab"], "ab"); + test_lcs!(lcs5, vec!["ab", "a"], ""); + test_lcs!(lcs6, vec!["a", "ab"], ""); + test_lcs!(lcs7, vec!["ab", "b"], "b"); + test_lcs!(lcs8, vec!["b", "ab"], "b"); + test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); + test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); + test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); + test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); +} diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs new file mode 100644 index 000000000..f5cf992e5 --- /dev/null +++ b/vendor/regex-syntax/src/hir/mod.rs @@ -0,0 +1,2296 @@ +/*! +Defines a high-level intermediate representation for regular expressions. +*/ +use std::char; +use std::cmp; +use std::error; +use std::fmt; +use std::result; +use std::u8; + +use crate::ast::Span; +use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; +use crate::unicode; + +pub use crate::hir::visitor::{visit, Visitor}; +pub use crate::unicode::CaseFoldError; + +mod interval; +pub mod literal; +pub mod print; +pub mod translate; +mod visitor; + +/// An error that can occur while translating an `Ast` to a `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the translator's Ast was parsed from. Every + /// span in an error is a valid range into this string. 
+ pattern: String, + /// The span of this error, derived from the Ast given to the translator. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } +} + +/// The type of an error that occurred while building an `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// This error occurs when a Unicode feature is used when Unicode + /// support is disabled. For example `(?-u:\pL)` would trigger this error. + UnicodeNotAllowed, + /// This error occurs when translating a pattern that could match a byte + /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + InvalidUtf8, + /// This occurs when an unrecognized Unicode property name could not + /// be found. + UnicodePropertyNotFound, + /// This occurs when an unrecognized Unicode property value could not + /// be found. + UnicodePropertyValueNotFound, + /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or + /// `\d`) could not be found. This can occur when the `unicode-perl` + /// crate feature is not enabled. + UnicodePerlClassNotFound, + /// This occurs when the Unicode simple case mapping tables are not + /// available, and the regular expression required Unicode aware case + /// insensitivity. + UnicodeCaseUnavailable, + /// This occurs when the translator attempts to construct a character class + /// that is empty. + /// + /// Note that this restriction in the translator may be removed in the + /// future. + EmptyClassNotAllowed, + /// Hints that destructuring should not be exhaustive. 
+ /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ErrorKind { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + use self::ErrorKind::*; + match *self { + UnicodeNotAllowed => "Unicode not allowed here", + InvalidUtf8 => "pattern can match invalid UTF-8", + UnicodePropertyNotFound => "Unicode property not found", + UnicodePropertyValueNotFound => "Unicode property value not found", + UnicodePerlClassNotFound => { + "Unicode-aware Perl class not found \ + (make sure the unicode-perl feature is enabled)" + } + UnicodeCaseUnavailable => { + "Unicode-aware case insensitivity matching is not available \ + (make sure the unicode-case feature is enabled)" + } + EmptyClassNotAllowed => "empty character classes are not allowed", + __Nonexhaustive => unreachable!(), + } + } +} + +impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + self.kind.description() + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // TODO: Remove this on the next breaking semver release. + #[allow(deprecated)] + f.write_str(self.description()) + } +} + +/// A high-level intermediate representation (HIR) for a regular expression. +/// +/// The HIR of a regular expression represents an intermediate step between its +/// abstract syntax (a structured description of the concrete syntax) and +/// compiled byte codes. The purpose of HIR is to make regular expressions +/// easier to analyze. 
In particular, the AST is much more complex than the +/// HIR. For example, while an AST supports arbitrarily nested character +/// classes, the HIR will flatten all nested classes into a single set. The HIR +/// will also "compile away" every flag present in the concrete syntax. For +/// example, users of HIR expressions never need to worry about case folding; +/// it is handled automatically by the translator (e.g., by translating `(?i)A` +/// to `[aA]`). +/// +/// If the HIR was produced by a translator that disallows invalid UTF-8, then +/// the HIR is guaranteed to match UTF-8 exclusively. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. +/// +/// The specific type of an HIR expression can be accessed via its `kind` +/// or `into_kind` methods. This extra level of indirection exists for two +/// reasons: +/// +/// 1. Construction of an HIR expression *must* use the constructor methods +/// on this `Hir` type instead of building the `HirKind` values directly. +/// This permits construction to enforce invariants like "concatenations +/// always consist of two or more sub-expressions." +/// 2. Every HIR expression contains attributes that are defined inductively, +/// and can be computed cheaply during the construction process. For +/// example, one such attribute is whether the expression must match at the +/// beginning of the text. +/// +/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular +/// expression pattern string, and uses constant stack space and heap space +/// proportional to the size of the `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Hir { + /// The underlying HIR kind. + kind: HirKind, + /// Analysis info about this HIR, computed during construction. + info: HirInfo, +} + +/// The kind of an arbitrary `Hir` expression. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A single literal character that matches exactly this character. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. + Class(Class), + /// An anchor assertion. An anchor assertion match always has zero length. + Anchor(Anchor), + /// A word boundary assertion, which may or may not be Unicode aware. A + /// word boundary assertion match always has zero length. + WordBoundary(WordBoundary), + /// A repetition operation applied to a child expression. + Repetition(Repetition), + /// A possibly capturing group, which contains a child expression. + Group(Group), + /// A concatenation of expressions. A concatenation always has at least two + /// child expressions. + /// + /// A concatenation matches only if each of its child expression matches + /// one after the other. + Concat(Vec<Hir>), + /// An alternation of expressions. An alternation always has at least two + /// child expressions. + /// + /// An alternation matches only if at least one of its child expression + /// matches. If multiple expressions match, then the leftmost is preferred. + Alternation(Vec<Hir>), +} + +impl Hir { + /// Returns a reference to the underlying HIR kind. + pub fn kind(&self) -> &HirKind { + &self.kind + } + + /// Consumes ownership of this HIR expression and returns its underlying + /// `HirKind`. + pub fn into_kind(mut self) -> HirKind { + use std::mem; + mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns an empty HIR expression. + /// + /// An empty HIR expression always matches, including the empty string. 
+ pub fn empty() -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Empty, info: info } + } + + /// Creates a literal HIR expression. + /// + /// If the given literal has a `Byte` variant with an ASCII byte, then this + /// method panics. This enforces the invariant that `Byte` variants are + /// only used to express matching of invalid UTF-8. + pub fn literal(lit: Literal) -> Hir { + if let Literal::Byte(b) = lit { + assert!(b > 0x7F); + } + + let mut info = HirInfo::new(); + info.set_always_utf8(lit.is_unicode()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + info.set_literal(true); + info.set_alternation_literal(true); + Hir { kind: HirKind::Literal(lit), info: info } + } + + /// Creates a class HIR expression. + pub fn class(class: Class) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(class.is_always_utf8()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Class(class), info: info } + } + + /// Creates an anchor assertion HIR expression. 
+ pub fn anchor(anchor: Anchor) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + info.set_literal(false); + info.set_alternation_literal(false); + if let Anchor::StartText = anchor { + info.set_anchored_start(true); + info.set_line_anchored_start(true); + info.set_any_anchored_start(true); + } + if let Anchor::EndText = anchor { + info.set_anchored_end(true); + info.set_line_anchored_end(true); + info.set_any_anchored_end(true); + } + if let Anchor::StartLine = anchor { + info.set_line_anchored_start(true); + } + if let Anchor::EndLine = anchor { + info.set_line_anchored_end(true); + } + Hir { kind: HirKind::Anchor(anchor), info: info } + } + + /// Creates a word boundary assertion HIR expression. + pub fn word_boundary(word_boundary: WordBoundary) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_literal(false); + info.set_alternation_literal(false); + // A negated word boundary matches '', so that's fine. But \b does not + // match \b, so why do we say it can match the empty string? Well, + // because, if you search for \b against 'a', it will report [0, 0) and + // [1, 1) as matches, and both of those matches correspond to the empty + // string. Thus, only *certain* empty strings match \b, which similarly + // applies to \B. + info.set_match_empty(true); + // Negated ASCII word boundaries can match invalid UTF-8. 
+ if let WordBoundary::AsciiNegate = word_boundary { + info.set_always_utf8(false); + } + Hir { kind: HirKind::WordBoundary(word_boundary), info: info } + } + + /// Creates a repetition HIR expression. + pub fn repetition(rep: Repetition) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(rep.hir.is_always_utf8()); + info.set_all_assertions(rep.hir.is_all_assertions()); + // If this operator can match the empty string, then it can never + // be anchored. + info.set_anchored_start( + !rep.is_match_empty() && rep.hir.is_anchored_start(), + ); + info.set_anchored_end( + !rep.is_match_empty() && rep.hir.is_anchored_end(), + ); + info.set_line_anchored_start( + !rep.is_match_empty() && rep.hir.is_anchored_start(), + ); + info.set_line_anchored_end( + !rep.is_match_empty() && rep.hir.is_anchored_end(), + ); + info.set_any_anchored_start(rep.hir.is_any_anchored_start()); + info.set_any_anchored_end(rep.hir.is_any_anchored_end()); + info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Repetition(rep), info: info } + } + + /// Creates a group HIR expression. + pub fn group(group: Group) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(group.hir.is_always_utf8()); + info.set_all_assertions(group.hir.is_all_assertions()); + info.set_anchored_start(group.hir.is_anchored_start()); + info.set_anchored_end(group.hir.is_anchored_end()); + info.set_line_anchored_start(group.hir.is_line_anchored_start()); + info.set_line_anchored_end(group.hir.is_line_anchored_end()); + info.set_any_anchored_start(group.hir.is_any_anchored_start()); + info.set_any_anchored_end(group.hir.is_any_anchored_end()); + info.set_match_empty(group.hir.is_match_empty()); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Group(group), info: info } + } + + /// Returns the concatenation of the given expressions. 
+ /// + /// This flattens the concatenation as appropriate. + pub fn concat(mut exprs: Vec<Hir>) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + info.set_literal(true); + info.set_alternation_literal(true); + + // Some attributes require analyzing all sub-expressions. + for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() && e.is_match_empty(); + info.set_match_empty(x); + + let x = info.is_literal() && e.is_literal(); + info.set_literal(x); + + let x = info.is_alternation_literal() + && e.is_alternation_literal(); + info.set_alternation_literal(x); + } + // Anchored attributes require something slightly more + // sophisticated. Normally, WLOG, to determine whether an + // expression is anchored to the start, we'd only need to check + // the first expression of a concatenation. However, + // expressions like `$\b^` are still anchored to the start, + // but the first expression in the concatenation *isn't* + // anchored to the start. So the "first" expression to look at + // is actually one that is either not an assertion or is + // specifically the StartText assertion. + info.set_anchored_start( + exprs + .iter() + .take_while(|e| { + e.is_anchored_start() || e.is_all_assertions() + }) + .any(|e| e.is_anchored_start()), + ); + // Similarly for the end anchor, but in reverse. 
+ info.set_anchored_end( + exprs + .iter() + .rev() + .take_while(|e| { + e.is_anchored_end() || e.is_all_assertions() + }) + .any(|e| e.is_anchored_end()), + ); + // Repeat the process for line anchors. + info.set_line_anchored_start( + exprs + .iter() + .take_while(|e| { + e.is_line_anchored_start() || e.is_all_assertions() + }) + .any(|e| e.is_line_anchored_start()), + ); + info.set_line_anchored_end( + exprs + .iter() + .rev() + .take_while(|e| { + e.is_line_anchored_end() || e.is_all_assertions() + }) + .any(|e| e.is_line_anchored_end()), + ); + Hir { kind: HirKind::Concat(exprs), info: info } + } + } + } + + /// Returns the alternation of the given expressions. + /// + /// This flattens the alternation as appropriate. + pub fn alternation(mut exprs: Vec<Hir>) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(true); + info.set_anchored_end(true); + info.set_line_anchored_start(true); + info.set_line_anchored_end(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + info.set_literal(false); + info.set_alternation_literal(true); + + // Some attributes require analyzing all sub-expressions. 
+ for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = info.is_anchored_start() && e.is_anchored_start(); + info.set_anchored_start(x); + + let x = info.is_anchored_end() && e.is_anchored_end(); + info.set_anchored_end(x); + + let x = info.is_line_anchored_start() + && e.is_line_anchored_start(); + info.set_line_anchored_start(x); + + let x = info.is_line_anchored_end() + && e.is_line_anchored_end(); + info.set_line_anchored_end(x); + + let x = info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() || e.is_match_empty(); + info.set_match_empty(x); + + let x = info.is_alternation_literal() && e.is_literal(); + info.set_alternation_literal(x); + } + Hir { kind: HirKind::Alternation(exprs), info: info } + } + } + } + + /// Build an HIR expression for `.`. + /// + /// A `.` expression matches any character except for `\n`. To build an + /// expression that matches any character, including `\n`, use the `any` + /// method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn dot(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Build an HIR expression for `(?s).`. + /// + /// A `(?s).` expression matches any character, including `\n`. To build an + /// expression that matches any character except for `\n`, then use the + /// `dot` method. 
+ /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn any(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Return true if and only if this HIR will always match valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression + /// to match invalid UTF-8. + pub fn is_always_utf8(&self) -> bool { + self.info.is_always_utf8() + } + + /// Returns true if and only if this entire HIR expression is made up of + /// zero-width assertions. + /// + /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but + /// not `^a`. + pub fn is_all_assertions(&self) -> bool { + self.info.is_all_assertions() + } + + /// Return true if and only if this HIR is required to match from the + /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, + /// `^foo|^bar` but not `^foo|bar`. + pub fn is_anchored_start(&self) -> bool { + self.info.is_anchored_start() + } + + /// Return true if and only if this HIR is required to match at the end + /// of text. This includes expressions like `foo$`, `(foo|bar)$`, + /// `foo$|bar$` but not `foo$|bar`. + pub fn is_anchored_end(&self) -> bool { + self.info.is_anchored_end() + } + + /// Return true if and only if this HIR is required to match from the + /// beginning of text or the beginning of a line. This includes expressions + /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` + /// but not `^foo|bar` or `(?m)^foo|bar`. + /// + /// Note that if `is_anchored_start` is `true`, then + /// `is_line_anchored_start` will also be `true`. The reverse implication + /// is not true. For example, `(?m)^foo` is line anchored, but not + /// `is_anchored_start`. 
+ pub fn is_line_anchored_start(&self) -> bool { + self.info.is_line_anchored_start() + } + + /// Return true if and only if this HIR is required to match at the + /// end of text or the end of a line. This includes expressions like + /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, + /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. + /// + /// Note that if `is_anchored_end` is `true`, then + /// `is_line_anchored_end` will also be `true`. The reverse implication + /// is not true. For example, `(?m)foo$` is line anchored, but not + /// `is_anchored_end`. + pub fn is_line_anchored_end(&self) -> bool { + self.info.is_line_anchored_end() + } + + /// Return true if and only if this HIR contains any sub-expression that + /// is required to match at the beginning of text. Specifically, this + /// returns true if the `^` symbol (when multiline mode is disabled) or the + /// `\A` escape appear anywhere in the regex. + pub fn is_any_anchored_start(&self) -> bool { + self.info.is_any_anchored_start() + } + + /// Return true if and only if this HIR contains any sub-expression that is + /// required to match at the end of text. Specifically, this returns true + /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape + /// appear anywhere in the regex. + pub fn is_any_anchored_end(&self) -> bool { + self.info.is_any_anchored_end() + } + + /// Return true if and only if the empty string is part of the language + /// matched by this regular expression. + /// + /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` + /// and `\B`, but not `a` or `a+`. + pub fn is_match_empty(&self) -> bool { + self.info.is_match_empty() + } + + /// Return true if and only if this HIR is a simple literal. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. 
+ /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, + /// `` are not (even though that contain sub-expressions that are literals). + pub fn is_literal(&self) -> bool { + self.info.is_literal() + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, `` + /// are not (even though that contain sub-expressions that are literals). + pub fn is_alternation_literal(&self) -> bool { + self.info.is_alternation_literal() + } +} + +impl HirKind { + /// Return true if and only if this HIR is the empty regular expression. + /// + /// Note that this is not defined inductively. That is, it only tests if + /// this kind is the `Empty` variant. To get the inductive definition, + /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). + pub fn is_empty(&self) -> bool { + match *self { + HirKind::Empty => true, + _ => false, + } + } + + /// Returns true if and only if this kind has any (including possibly + /// empty) subexpressions. + pub fn has_subexprs(&self) -> bool { + match *self { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => false, + HirKind::Group(_) + | HirKind::Repetition(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => true, + } + } +} + +/// Print a display representation of this Hir. +/// +/// The result of this is a valid regular expression pattern string. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Hir`. 
+impl fmt::Display for Hir { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::hir::print::Printer; + Printer::new().print(self, f) + } +} + +/// The high-level intermediate representation of a literal. +/// +/// A literal corresponds to a single character, where a character is either +/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters +/// are preferred whenever possible. In particular, a `Byte` variant is only +/// ever produced when it could match invalid UTF-8. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Literal { + /// A single character represented by a Unicode scalar value. + Unicode(char), + /// A single character represented by an arbitrary byte. + Byte(u8), +} + +impl Literal { + /// Returns true if and only if this literal corresponds to a Unicode + /// scalar value. + pub fn is_unicode(&self) -> bool { + match *self { + Literal::Unicode(_) => true, + Literal::Byte(b) if b <= 0x7F => true, + Literal::Byte(_) => false, + } + } +} + +/// The high-level intermediate representation of a character class. +/// +/// A character class corresponds to a set of characters. A character is either +/// defined by a Unicode scalar value or a byte. Unicode characters are used +/// by default, while bytes are used when Unicode mode (via the `u` flag) is +/// disabled. +/// +/// A character class, regardless of its character type, is represented by a +/// sequence of non-overlapping non-adjacent ranges of characters. +/// +/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may +/// be produced even when it exclusively matches valid UTF-8. This is because +/// a `Bytes` variant represents an intention by the author of the regular +/// expression to disable Unicode mode, which in turn impacts the semantics of +/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not +/// match the same set of strings. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A set of characters represented by Unicode scalar values. + Unicode(ClassUnicode), + /// A set of characters represented by arbitrary bytes (one byte per + /// character). + Bytes(ClassBytes), +} + +impl Class { + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + pub fn case_fold_simple(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.case_fold_simple(), + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + } + + /// Negate this character class in place. + /// + /// After completion, this character class will contain precisely the + /// characters that weren't previously in the class. + pub fn negate(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.negate(), + Class::Bytes(ref mut x) => x.negate(), + } + } + + /// Returns true if and only if this character class will only ever match + /// valid UTF-8. + /// + /// A character class can match invalid UTF-8 only when the following + /// conditions are met: + /// + /// 1. The translator was configured to permit generating an expression + /// that can match invalid UTF-8. (By default, this is disabled.) + /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete + /// syntax or in the parser builder. By default, Unicode mode is + /// enabled. + pub fn is_always_utf8(&self) -> bool { + match *self { + Class::Unicode(_) => true, + Class::Bytes(ref x) => x.is_all_ascii(), + } + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet<ClassUnicodeRange>, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. 
+ /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new<I>(ranges: I) -> ClassUnicode + where + I: IntoIterator<Item = ClassUnicodeRange>, + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassUnicodeRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassUnicodeIter<'_> { + ClassUnicodeIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassUnicodeRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn case_fold_simple(&mut self) { + self.set + .case_fold_simple() + .expect("unicode-case feature must be enabled"); + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. 
+ /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled. + pub fn try_case_fold_simple( + &mut self, + ) -> result::Result<(), CaseFoldError> { + self.set.case_fold_simple() + } + + /// Negate this character class. + /// + /// For all `c` where `c` is a Unicode scalar value, if `c` was in this + /// set, then it will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this character class with the given character class, in place. + pub fn union(&mut self, other: &ClassUnicode) { + self.set.union(&other.set); + } + + /// Intersect this character class with the given character class, in + /// place. + pub fn intersect(&mut self, other: &ClassUnicode) { + self.set.intersect(&other.set); + } + + /// Subtract the given character class from this character class, in place. + pub fn difference(&mut self, other: &ClassUnicode) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given character classes, in + /// place. + /// + /// This computes the symmetric difference of two character classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassUnicode) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII codepoint. 
+ pub fn is_all_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') + } +} + +/// An iterator over all ranges in a Unicode character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); + +impl<'a> Iterator for ClassUnicodeIter<'a> { + type Item = &'a ClassUnicodeRange; + + fn next(&mut self) -> Option<&'a ClassUnicodeRange> { + self.0.next() + } +} + +/// A single range of characters represented by Unicode scalar values. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassUnicodeRange { + start: char, + end: char, +} + +impl fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let start = if !self.start.is_whitespace() && !self.start.is_control() + { + self.start.to_string() + } else { + format!("0x{:X}", self.start as u32) + }; + let end = if !self.end.is_whitespace() && !self.end.is_control() { + self.end.to_string() + } else { + format!("0x{:X}", self.end as u32) + }; + f.debug_struct("ClassUnicodeRange") + .field("start", &start) + .field("end", &end) + .finish() + } +} + +impl Interval for ClassUnicodeRange { + type Bound = char; + + #[inline] + fn lower(&self) -> char { + self.start + } + #[inline] + fn upper(&self) -> char { + self.end + } + #[inline] + fn set_lower(&mut self, bound: char) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: char) { + self.end = bound; + } + + /// Apply simple case folding to this Unicode scalar value range. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. 
+ fn case_fold_simple( + &self, + ranges: &mut Vec<ClassUnicodeRange>, + ) -> Result<(), unicode::CaseFoldError> { + if !unicode::contains_simple_case_mapping(self.start, self.end)? { + return Ok(()); + } + let start = self.start as u32; + let end = (self.end as u32).saturating_add(1); + let mut next_simple_cp = None; + for cp in (start..end).filter_map(char::from_u32) { + if next_simple_cp.map_or(false, |next| cp < next) { + continue; + } + let it = match unicode::simple_fold(cp)? { + Ok(it) => it, + Err(next) => { + next_simple_cp = next; + continue; + } + }; + for cp_folded in it { + ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); + } + } + Ok(()) + } +} + +impl ClassUnicodeRange { + /// Create a new Unicode scalar value range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: char, end: char) -> ClassUnicodeRange { + ClassUnicodeRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> char { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> char { + self.end + } +} + +/// A set of characters represented by arbitrary bytes (where one byte +/// corresponds to one character). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBytes { + set: IntervalSet<ClassBytesRange>, +} + +impl ClassBytes { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new<I>(ranges: I) -> ClassBytes + where + I: IntoIterator<Item = ClassBytesRange>, + { + ClassBytes { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. 
+ pub fn empty() -> ClassBytes { + ClassBytes::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassBytesRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassBytesIter<'_> { + ClassBytesIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassBytesRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple().expect("ASCII case folding never fails"); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is a any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. 
That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_all_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassBytesRange; + + fn next(&mut self) -> Option<&'a ClassBytesRange> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassBytesRange { + start: u8, + end: u8, +} + +impl Interval for ClassBytesRange { + type Bound = u8; + + #[inline] + fn lower(&self) -> u8 { + self.start + } + #[inline] + fn upper(&self) -> u8 { + self.end + } + #[inline] + fn set_lower(&mut self, bound: u8) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: u8) { + self.end = bound; + } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. 
+ fn case_fold_simple( + &self, + ranges: &mut Vec<ClassBytesRange>, + ) -> Result<(), unicode::CaseFoldError> { + if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'a'); + let upper = cmp::min(self.end, b'z'); + ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); + } + if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'A'); + let upper = cmp::min(self.end, b'Z'); + ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); + } + Ok(()) + } +} + +impl ClassBytesRange { + /// Create a new byte range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: u8, end: u8) -> ClassBytesRange { + ClassBytesRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> u8 { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> u8 { + self.end + } +} + +impl fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut debug = f.debug_struct("ClassBytesRange"); + if self.start <= 0x7F { + debug.field("start", &(self.start as char)); + } else { + debug.field("start", &self.start); + } + if self.end <= 0x7F { + debug.field("end", &(self.end as char)); + } else { + debug.field("end", &self.end); + } + debug.finish() + } +} + +/// The high-level intermediate representation for an anchor assertion. +/// +/// A matching anchor assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Anchor { + /// Match the beginning of a line or the beginning of text. 
Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLine, + /// Match the end of a line or the end of text. Specifically, + /// this matches at the end position of the input, or at the position + /// immediately preceding a `\n` character. + EndLine, + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + StartText, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + EndText, +} + +/// The high-level intermediate representation for a word-boundary assertion. +/// +/// A matching word boundary assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum WordBoundary { + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Unicode, + /// Match a Unicode-aware negation of a word boundary. + UnicodeNegate, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Ascii, + /// Match an ASCII-only negation of a word boundary. + AsciiNegate, +} + +impl WordBoundary { + /// Returns true if and only if this word boundary assertion is negated. + pub fn is_negated(&self) -> bool { + match *self { + WordBoundary::Unicode | WordBoundary::Ascii => false, + WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, + } + } +} + +/// The high-level intermediate representation for a group. +/// +/// This represents one of three possible group types: +/// +/// 1. A non-capturing group (e.g., `(?:expr)`). +/// 2. A capturing group (e.g., `(expr)`). +/// 3. A named capturing group (e.g., `(?P<name>expr)`). 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The kind of this group. If it is a capturing group, then the kind + /// contains the capture group index (and the name, if it is a named + /// group). + pub kind: GroupKind, + /// The expression inside the capturing group, which may be empty. + pub hir: Box<Hir>, +} + +/// The kind of group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// A normal unnamed capturing group. + /// + /// The value is the capture index of the group. + CaptureIndex(u32), + /// A named capturing group. + CaptureName { + /// The name of the group. + name: String, + /// The capture index of the group. + index: u32, + }, + /// A non-capturing group. + NonCapturing, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The kind of this repetition operator. + pub kind: RepetitionKind, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub hir: Box<Hir>, +} + +impl Repetition { + /// Returns true if and only if this repetition operator makes it possible + /// to match the empty string. + /// + /// Note that this is not defined inductively. For example, while `a*` + /// will report `true`, `()+` will not, even though `()` matches the empty + /// string and one or more occurrences of something that matches the empty + /// string will always match the empty string. 
In order to get the + /// inductive definition, see the corresponding method on + /// [`Hir`](struct.Hir.html). + pub fn is_match_empty(&self) -> bool { + match self.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => false, + RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, + RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, + RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, + } + } +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionKind { + /// Matches a sub-expression zero or one times. + ZeroOrOne, + /// Matches a sub-expression zero or more times. + ZeroOrMore, + /// Matches a sub-expression one or more times. + OneOrMore, + /// Matches a sub-expression within a bounded range of times. + Range(RepetitionRange), +} + +/// The kind of a counted repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionRange { + /// Matches a sub-expression exactly this many times. + Exactly(u32), + /// Matches a sub-expression at least this many times. + AtLeast(u32), + /// Matches a sub-expression at least `m` times and at most `n` times. + Bounded(u32, u32), +} + +/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack +/// space but heap space proportional to the depth of the total `Hir`. 
impl Drop for Hir {
    fn drop(&mut self) {
        use std::mem;

        // Expressions with nothing nested beneath their direct children are
        // left to the ordinary drop path; only the remaining ones need the
        // explicit worklist below.
        let needs_worklist = match *self.kind() {
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Anchor(_)
            | HirKind::WordBoundary(_) => false,
            HirKind::Group(ref group) => group.hir.kind.has_subexprs(),
            HirKind::Repetition(ref rep) => rep.hir.kind.has_subexprs(),
            HirKind::Concat(ref exprs) | HirKind::Alternation(ref exprs) => {
                !exprs.is_empty()
            }
        };
        if !needs_worklist {
            return;
        }

        // Move the tree onto a heap-allocated worklist, detaching the
        // children of each expression before that expression is dropped.
        let mut pending = vec![mem::replace(self, Hir::empty())];
        while let Some(mut node) = pending.pop() {
            match node.kind {
                HirKind::Empty
                | HirKind::Literal(_)
                | HirKind::Class(_)
                | HirKind::Anchor(_)
                | HirKind::WordBoundary(_) => {}
                HirKind::Group(ref mut group) => {
                    pending.push(mem::replace(&mut group.hir, Hir::empty()));
                }
                HirKind::Repetition(ref mut rep) => {
                    pending.push(mem::replace(&mut rep.hir, Hir::empty()));
                }
                HirKind::Concat(ref mut exprs)
                | HirKind::Alternation(ref mut exprs) => {
                    pending.extend(exprs.drain(..));
                }
            }
        }
    }
}

/// A type that documents various attributes of an HIR expression.
///
/// These attributes are typically defined inductively on the HIR.
#[derive(Clone, Debug, Eq, PartialEq)]
struct HirInfo {
    /// Yes/no attributes packed into a bitfield to conserve space, since
    /// this is included in every HIR expression.
    ///
    /// If more attributes need to be added, it is OK to increase the size of
    /// this as appropriate.
    bools: u16,
}

// Generates a getter/setter pair for one bit of `HirInfo::bools`.
macro_rules! define_bool {
    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
        fn $is_fn_name(&self) -> bool {
            self.bools & (1 << $bit) != 0
        }

        fn $set_fn_name(&mut self, yes: bool) {
            let mask: u16 = 1 << $bit;
            if yes {
                self.bools |= mask;
            } else {
                self.bools &= !mask;
            }
        }
    };
}

impl HirInfo {
    // Every attribute starts out as "no".
    fn new() -> HirInfo {
        HirInfo { bools: 0 }
    }

    define_bool!(0, is_always_utf8, set_always_utf8);
    define_bool!(1, is_all_assertions, set_all_assertions);
    define_bool!(2, is_anchored_start, set_anchored_start);
    define_bool!(3, is_anchored_end, set_anchored_end);
    define_bool!(4, is_line_anchored_start, set_line_anchored_start);
    define_bool!(5, is_line_anchored_end, set_line_anchored_end);
    define_bool!(6, is_any_anchored_start, set_any_anchored_start);
    define_bool!(7, is_any_anchored_end, set_any_anchored_end);
    define_bool!(8, is_match_empty, set_match_empty);
    define_bool!(9, is_literal, set_literal);
    define_bool!(10, is_alternation_literal, set_alternation_literal);
}

#[cfg(test)]
mod tests {
    use super::*;

    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
        let ranges: Vec<ClassUnicodeRange> = ranges
            .iter()
            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
            .collect();
        ClassUnicode::new(ranges)
    }

    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
        let ranges: Vec<ClassBytesRange> =
            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
        ClassBytes::new(ranges)
    }

    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
        cls.iter().map(|x| (x.start(), x.end())).collect()
    }

    #[cfg(feature = "unicode-case")]
    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls.clone();
        cls_.case_fold_simple();
        cls_
    }

    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls1.clone();
        cls_.union(cls2);
        cls_
    }

    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut cls_ = cls1.clone();
        cls_.intersect(cls2);
        cls_
    }

fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn usymdifference( + cls1: &ClassUnicode, + cls2: &ClassUnicode, + ) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn unegate(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn bcasefold(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn bnegate(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + #[test] + fn class_range_canonical_unicode() { + let range = ClassUnicodeRange::new('\u{00FF}', '\0'); + assert_eq!('\0', range.start()); + assert_eq!('\u{00FF}', range.end()); + } + + #[test] + fn class_range_canonical_bytes() { + let range = ClassBytesRange::new(b'\xFF', b'\0'); + assert_eq!(b'\0', range.start()); + assert_eq!(b'\xFF', range.end()); + } + + #[test] + fn class_canonicalize_unicode() { + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('a', 'c')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + 
let cls = uclass(&[('x', 'z'), ('w', 'y')]); + let expected = vec![('w', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[ + ('c', 'f'), + ('a', 'g'), + ('d', 'j'), + ('a', 'c'), + ('m', 'p'), + ('l', 's'), + ]); + let expected = vec![('a', 'j'), ('l', 's')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('u', 'w')]); + let expected = vec![('u', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + let expected = vec![('\x00', '\u{10FFFF}')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = vec![('a', 'b')]; + assert_eq!(expected, uranges(&cls)); + } + + #[test] + fn class_canonicalize_bytes() { + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); + let expected = vec![(b'w', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[ + (b'c', b'f'), + (b'a', b'g'), + (b'd', b'j'), + (b'a', b'c'), + (b'm', b'p'), + (b'l', b's'), + ]); + let expected = vec![(b'a', b'j'), (b'l', b's')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); + let expected = vec![(b'u', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); + let expected = vec![(b'\x00', b'\xFF')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = vec![(b'a', b'b')]; + assert_eq!(expected, branges(&cls)); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn class_case_fold_unicode() { + let cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + 
('c', 'f'), + ]); + let expected = uclass(&[ + ('A', 'J'), + ('L', 'S'), + ('a', 'j'), + ('l', 's'), + ('\u{17F}', '\u{17F}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'Z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('a', 'z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('_', '_')]); + let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('=', '=')]); + let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('\x00', '\x10')]); + assert_eq!(cls, ucasefold(&cls)); + + let cls = uclass(&[('k', 'k')]); + let expected = + uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('@', '@')]); + assert_eq!(cls, ucasefold(&cls)); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + assert!(cls.try_case_fold_simple().is_err()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled_panics() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + cls.case_fold_simple(); + } + + #[test] + fn class_case_fold_bytes() { + let cls = bclass(&[ + (b'C', b'F'), + (b'A', b'G'), + (b'D', b'J'), + (b'A', b'C'), + (b'M', b'P'), + (b'L', b'S'), + (b'c', b'f'), + ]); + let expected = + bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', 
b's')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'Z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls, bcasefold(&cls)); + + let cls = bclass(&[(b'k', b'k')]); + let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls, bcasefold(&cls)); + } + + #[test] + fn class_negate_unicode() { + let cls = uclass(&[('a', 'a')]); + let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = uclass(&[ + ('\x00', '\x60'), + ('\x64', '\x77'), + ('\x7B', '\u{10FFFF}'), + ]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', 'a')]); + let expected = uclass(&[('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\x60')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}')]); + let expected = uclass(&[]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[]); + let expected = uclass(&[('\x00', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = + 
uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); + let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FF}')]); + let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FE}')]); + let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{D7FF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{E000}')]); + assert_eq!(expected, unegate(&cls)); + } + + #[test] + fn class_negate_bytes() { + let cls = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = bclass(&[ + (b'\x00', b'\x60'), + (b'\x64', b'\x77'), + (b'\x7B', b'\xFF'), + ]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'a')]); + let expected = bclass(&[(b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'\xFF')]); + let expected = bclass(&[(b'\x00', b'\x60')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF')]); + let expected = bclass(&[]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[]); + let expected = bclass(&[(b'\x00', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); + let expected = bclass(&[(b'\xFE', b'\xFE')]); + assert_eq!(expected, bnegate(&cls)); + } + + #[test] + fn class_union_unicode() { + let cls1 = uclass(&[('a', 'g'), ('m', 't'), 
('A', 'C')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[('a', 'z'), ('A', 'C')]); + assert_eq!(expected, uunion(&cls1, &cls2)); + } + + #[test] + fn class_union_bytes() { + let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); + assert_eq!(expected, bunion(&cls1, &cls2)); + } + + #[test] + fn class_intersect_unicode() { + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('b', 'b')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('b', 'c')]); + let expected = uclass(&[('b', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('c', 'd')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('b', 'c')]); + let cls2 = uclass(&[('a', 'd')]); + let expected = uclass(&[('b', 'c')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let 
cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); + let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('h', 'h')]); + let expected = uclass(&[('h', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + let expected = uclass(&[('b', 'f')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + } + + #[test] + fn class_intersect_bytes() { + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'b', b'b')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'b', b'c')]); + let expected = bclass(&[(b'b', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = 
bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'c', b'd')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'b', b'c')]); + let cls2 = bclass(&[(b'a', b'd')]); + let expected = bclass(&[(b'b', b'c')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); + let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'h', b'h')]); + let expected = bclass(&[(b'h', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); + let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); + let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); + let expected = bclass(&[(b'b', b'f')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + } + + #[test] + fn class_difference_unicode() { + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = 
uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('b', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('z', 'z')]); + let expected = uclass(&[('a', 'y')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('m', 'm')]); + let expected = uclass(&[('a', 'l'), ('n', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('d', 'v')]); + let expected = uclass(&[('a', 'c')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('x', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('x', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + } + + #[test] + fn class_difference_bytes() { + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let 
expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'b', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'z', b'z')]); + let expected = bclass(&[(b'a', b'y')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'm', b'm')]); + let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'd', b'v')]); + let expected = bclass(&[(b'a', b'c')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'x', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'x', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let 
cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_unicode() { + let cls1 = uclass(&[('a', 'm')]); + let cls2 = uclass(&[('g', 't')]); + let expected = uclass(&[('a', 'f'), ('n', 't')]); + assert_eq!(expected, usymdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_bytes() { + let cls1 = bclass(&[(b'a', b'm')]); + let cls2 = bclass(&[(b'g', b't')]); + let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); + assert_eq!(expected, bsymdifference(&cls1, &cls2)); + } + + #[test] + #[should_panic] + fn hir_byte_literal_non_ascii() { + Hir::literal(Literal::Byte(b'a')); + } + + // We use a thread with an explicit stack size to test that our destructor + // for Hir can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let mut expr = Hir::empty(); + for _ in 0..100 { + expr = Hir::group(Group { + kind: GroupKind::NonCapturing, + hir: Box::new(expr), + }); + expr = Hir::repetition(Repetition { + kind: RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(expr), + }); + + expr = Hir { + kind: HirKind::Concat(vec![expr]), + info: HirInfo::new(), + }; + expr = Hir { + kind: HirKind::Alternation(vec![expr]), + info: HirInfo::new(), + }; + } + assert!(!expr.kind.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. 
+ thread::Builder::new() + .stack_size(1 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs new file mode 100644 index 000000000..b71f3897c --- /dev/null +++ b/vendor/regex-syntax/src/hir/print.rs @@ -0,0 +1,367 @@ +/*! +This module provides a regular expression printer for `Hir`. +*/ + +use std::fmt; + +use crate::hir::visitor::{self, Visitor}; +use crate::hir::{self, Hir, HirKind}; +use crate::is_meta_character; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression's high-level intermediate +/// representation. +/// +/// A printer converts a high-level intermediate representation (HIR) to a +/// regular expression pattern string. This particular printer uses constant +/// stack space and heap space proportional to the size of the HIR. +/// +/// Since this printer is only using the HIR, the pattern it prints will likely +/// not resemble the original pattern at all. For example, a pattern like +/// `\pL` will have its entire class written out. +/// +/// The purpose of this printer is to provide a means to mutate an HIR and then +/// build a regular expression from the result of that mutation. (A regex +/// library could provide a constructor from this HIR explicitly, but that +/// creates an unnecessary public coupling between the regex library and this +/// specific HIR representation.) +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. 
+ pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Hir` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. + pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { + visitor::visit(hir, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer<W> { + wtr: W, +} + +impl<W: fmt::Write> Visitor for Writer<W> { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + HirKind::Empty + | HirKind::Repetition(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => {} + HirKind::Literal(hir::Literal::Unicode(c)) => { + self.write_literal_char(c)?; + } + HirKind::Literal(hir::Literal::Byte(b)) => { + self.write_literal_byte(b)?; + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + self.wtr.write_str("[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_char(range.start())?; + } else { + self.write_literal_char(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_char(range.end())?; + } + } + self.wtr.write_str("]")?; + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + self.wtr.write_str("(?-u:[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_class_byte(range.start())?; + } else { + self.write_literal_class_byte(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_class_byte(range.end())?; + } + } + self.wtr.write_str("])")?; + } + HirKind::Anchor(hir::Anchor::StartLine) => { + self.wtr.write_str("(?m:^)")?; + } + HirKind::Anchor(hir::Anchor::EndLine) => { + self.wtr.write_str("(?m:$)")?; + } + HirKind::Anchor(hir::Anchor::StartText) => { + self.wtr.write_str(r"\A")?; + } + 
HirKind::Anchor(hir::Anchor::EndText) => { + self.wtr.write_str(r"\z")?; + } + HirKind::WordBoundary(hir::WordBoundary::Unicode) => { + self.wtr.write_str(r"\b")?; + } + HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { + self.wtr.write_str(r"\B")?; + } + HirKind::WordBoundary(hir::WordBoundary::Ascii) => { + self.wtr.write_str(r"(?-u:\b)")?; + } + HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { + self.wtr.write_str(r"(?-u:\B)")?; + } + HirKind::Group(ref x) => match x.kind { + hir::GroupKind::CaptureIndex(_) => { + self.wtr.write_str("(")?; + } + hir::GroupKind::CaptureName { ref name, .. } => { + write!(self.wtr, "(?P<{}>", name)?; + } + hir::GroupKind::NonCapturing => { + self.wtr.write_str("(?:")?; + } + }, + } + Ok(()) + } + + fn visit_post(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + // Handled during visit_pre + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => {} + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => { + self.wtr.write_str("?")?; + } + hir::RepetitionKind::ZeroOrMore => { + self.wtr.write_str("*")?; + } + hir::RepetitionKind::OneOrMore => { + self.wtr.write_str("+")?; + } + hir::RepetitionKind::Range(ref x) => match *x { + hir::RepetitionRange::Exactly(m) => { + write!(self.wtr, "{{{}}}", m)?; + } + hir::RepetitionRange::AtLeast(m) => { + write!(self.wtr, "{{{},}}", m)?; + } + hir::RepetitionRange::Bounded(m, n) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } + }, + } + if !x.greedy { + self.wtr.write_str("?")?; + } + } + HirKind::Group(_) => { + self.wtr.write_str(")")?; + } + } + Ok(()) + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } +} + +impl<W: fmt::Write> Writer<W> { + fn write_literal_char(&mut self, c: char) -> fmt::Result { + if is_meta_character(c) { + self.wtr.write_str("\\")?; + } + 
self.wtr.write_char(c) + } + + fn write_literal_byte(&mut self, b: u8) -> fmt::Result { + let c = b as char; + if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { + self.write_literal_char(c) + } else { + write!(self.wtr, "(?-u:\\x{:02X})", b) + } + } + + fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { + let c = b as char; + if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { + self.write_literal_char(c) + } else { + write!(self.wtr, "\\x{:02X}", b) + } + } +} + +#[cfg(test)] +mod tests { + use super::Printer; + use crate::ParserBuilder; + + fn roundtrip(given: &str, expected: &str) { + roundtrip_with(|b| b, given, expected); + } + + fn roundtrip_bytes(given: &str, expected: &str) { + roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected); + } + + fn roundtrip_with<F>(mut f: F, given: &str, expected: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let hir = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&hir, &mut dst).unwrap(); + + // Check that the result is actually valid. + builder.build().parse(&dst).unwrap(); + + assert_eq!(expected, dst); + } + + #[test] + fn print_literal() { + roundtrip("a", "a"); + roundtrip(r"\xff", "\u{FF}"); + roundtrip_bytes(r"\xff", "\u{FF}"); + roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)"); + roundtrip("â", "â"); + } + + #[test] + fn print_class() { + roundtrip(r"[a]", r"[a]"); + roundtrip(r"[a-z]", r"[a-z]"); + roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]"); + roundtrip(r"[-]", r"[\-]"); + roundtrip(r"[â-â]", r"[â-â]"); + + roundtrip(r"(?-u)[a]", r"(?-u:[a])"); + roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); + roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); + + // The following test that the printer escapes meta characters + // in character classes. 
+ roundtrip(r"[\[]", r"[\[]"); + roundtrip(r"[Z-_]", r"[Z-_]"); + roundtrip(r"[Z-_--Z]", r"[\[-_]"); + + // The following test that the printer escapes meta characters + // in byte oriented character classes. + roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])"); + roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); + roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + } + + #[test] + fn print_anchor() { + roundtrip(r"^", r"\A"); + roundtrip(r"$", r"\z"); + roundtrip(r"(?m)^", r"(?m:^)"); + roundtrip(r"(?m)$", r"(?m:$)"); + } + + #[test] + fn print_word_boundary() { + roundtrip(r"\b", r"\b"); + roundtrip(r"\B", r"\B"); + roundtrip(r"(?-u)\b", r"(?-u:\b)"); + roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)"); + } + + #[test] + fn print_repetition() { + roundtrip("a?", "a?"); + roundtrip("a??", "a??"); + roundtrip("(?U)a?", "a??"); + + roundtrip("a*", "a*"); + roundtrip("a*?", "a*?"); + roundtrip("(?U)a*", "a*?"); + + roundtrip("a+", "a+"); + roundtrip("a+?", "a+?"); + roundtrip("(?U)a+", "a+?"); + + roundtrip("a{1}", "a{1}"); + roundtrip("a{1,}", "a{1,}"); + roundtrip("a{1,5}", "a{1,5}"); + roundtrip("a{1}?", "a{1}?"); + roundtrip("a{1,}?", "a{1,}?"); + roundtrip("a{1,5}?", "a{1,5}?"); + roundtrip("(?U)a{1}", "a{1}?"); + roundtrip("(?U)a{1,}", "a{1,}?"); + roundtrip("(?U)a{1,5}", "a{1,5}?"); + } + + #[test] + fn print_group() { + roundtrip("()", "()"); + roundtrip("(?P<foo>)", "(?P<foo>)"); + roundtrip("(?:)", "(?:)"); + + roundtrip("(a)", "(a)"); + roundtrip("(?P<foo>a)", "(?P<foo>a)"); + roundtrip("(?:a)", "(?:a)"); + + roundtrip("((((a))))", "((((a))))"); + } + + #[test] + fn print_alternation() { + roundtrip("|", "|"); + roundtrip("||", "||"); + + roundtrip("a|b", "a|b"); + roundtrip("a|b|c", "a|b|c"); + roundtrip("foo|bar|quux", "foo|bar|quux"); + } +} diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs new file mode 100644 index 000000000..56afbbed8 --- /dev/null +++ b/vendor/regex-syntax/src/hir/translate.rs @@ -0,0 +1,3211 @@ +/*! 
+Defines a translator that converts an `Ast` to an `Hir`. +*/ + +use std::cell::{Cell, RefCell}; +use std::result; + +use crate::ast::{self, Ast, Span, Visitor}; +use crate::hir::{self, Error, ErrorKind, Hir}; +use crate::unicode::{self, ClassQuery}; + +type Result<T> = result::Result<T, Error>; + +/// A builder for constructing an AST->HIR translator. +#[derive(Clone, Debug)] +pub struct TranslatorBuilder { + allow_invalid_utf8: bool, + flags: Flags, +} + +impl Default for TranslatorBuilder { + fn default() -> TranslatorBuilder { + TranslatorBuilder::new() + } +} + +impl TranslatorBuilder { + /// Create a new translator builder with a default configuration. + pub fn new() -> TranslatorBuilder { + TranslatorBuilder { + allow_invalid_utf8: false, + flags: Flags::default(), + } + } + + /// Build a translator using the current configuration. + pub fn build(&self) -> Translator { + Translator { + stack: RefCell::new(vec![]), + flags: Cell::new(self.flags), + allow_invalid_utf8: self.allow_invalid_utf8, + } + } + + /// When enabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the translator is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// translator will return an error). + /// + /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII + /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause + /// the parser to return an error. Namely, a negated ASCII word boundary + /// can result in matching positions that aren't valid UTF-8 boundaries. + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.allow_invalid_utf8 = yes; + self + } + + /// Enable or disable the case insensitive flag (`i`) by default. 
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.case_insensitive = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the multi-line matching flag (`m`) by default. + pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.multi_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "dot matches any character" flag (`s`) by + /// default. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "swap greed" flag (`U`) by default. + pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.swap_greed = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.unicode = if yes { None } else { Some(false) }; + self + } +} + +/// A translator maps abstract syntax to a high level intermediate +/// representation. +/// +/// A translator may benefit from reuse. That is, a translator can translate +/// many abstract syntax trees. +/// +/// A `Translator` can be configured in more detail via a +/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +#[derive(Clone, Debug)] +pub struct Translator { + /// Our call stack, but on the heap. + stack: RefCell<Vec<HirFrame>>, + /// The current flag settings. + flags: Cell<Flags>, + /// Whether we're allowed to produce HIR that can match arbitrary bytes. + allow_invalid_utf8: bool, +} + +impl Translator { + /// Create a new translator using the default configuration. + pub fn new() -> Translator { + TranslatorBuilder::new().build() + } + + /// Translate the given abstract syntax tree (AST) into a high level + /// intermediate representation (HIR). 
+ /// + /// If there was a problem doing the translation, then an HIR-specific + /// error is returned. + /// + /// The original pattern string used to produce the `Ast` *must* also be + /// provided. The translator does not use the pattern string during any + /// correct translation, but is used for error reporting. + pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> { + ast::visit(ast, TranslatorI::new(self, pattern)) + } +} + +/// An HirFrame is a single stack frame, represented explicitly, which is +/// created for each item in the Ast that we traverse. +/// +/// Note that technically, this type doesn't represent our entire stack +/// frame. In particular, the Ast visitor represents any state associated with +/// traversing the Ast itself. +#[derive(Clone, Debug)] +enum HirFrame { + /// An arbitrary HIR expression. These get pushed whenever we hit a base + /// case in the Ast. They get popped after an inductive (i.e., recursive) + /// step is complete. + Expr(Hir), + /// A Unicode character class. This frame is mutated as we descend into + /// the Ast of a character class (which is itself its own mini recursive + /// structure). + ClassUnicode(hir::ClassUnicode), + /// A byte-oriented character class. This frame is mutated as we descend + /// into the Ast of a character class (which is itself its own mini + /// recursive structure). + /// + /// Byte character classes are created when Unicode mode (`u`) is disabled. + /// If `allow_invalid_utf8` is disabled (the default), then a byte + /// character is only permitted to match ASCII text. + ClassBytes(hir::ClassBytes), + /// This is pushed on to the stack upon first seeing any kind of group, + /// indicated by parentheses (including non-capturing groups). It is popped + /// upon leaving a group. + Group { + /// The old active flags when this group was opened. 
+ /// + /// If this group sets flags, then the new active flags are set to the + /// result of merging the old flags with the flags introduced by this + /// group. If the group doesn't set any flags, then this is simply + /// equivalent to whatever flags were set when the group was opened. + /// + /// When this group is popped, the active flags should be restored to + /// the flags set here. + /// + /// The "active" flags correspond to whatever flags are set in the + /// Translator. + old_flags: Flags, + }, + /// This is pushed whenever a concatenation is observed. After visiting + /// every sub-expression in the concatenation, the translator's stack is + /// popped until it sees a Concat frame. + Concat, + /// This is pushed whenever an alternation is observed. After visiting + /// every sub-expression in the alternation, the translator's stack is + /// popped until it sees an Alternation frame. + Alternation, +} + +impl HirFrame { + /// Assert that the current stack frame is an Hir expression and return it. + fn unwrap_expr(self) -> Hir { + match self { + HirFrame::Expr(expr) => expr, + _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), + } + } + + /// Assert that the current stack frame is a Unicode class expression and + /// return it. + fn unwrap_class_unicode(self) -> hir::ClassUnicode { + match self { + HirFrame::ClassUnicode(cls) => cls, + _ => panic!( + "tried to unwrap Unicode class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a byte class expression and + /// return it. + fn unwrap_class_bytes(self) -> hir::ClassBytes { + match self { + HirFrame::ClassBytes(cls) => cls, + _ => panic!( + "tried to unwrap byte class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a group indicator and return + /// its corresponding flags (the flags that were active at the time the + /// group was entered). 
+ fn unwrap_group(self) -> Flags { + match self { + HirFrame::Group { old_flags } => old_flags, + _ => { + panic!("tried to unwrap group from HirFrame, got: {:?}", self) + } + } + } +} + +impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { + type Output = Hir; + type Err = Error; + + fn finish(self) -> Result<Hir> { + // ... otherwise, we should have exactly one HIR on the stack. + assert_eq!(self.trans().stack.borrow().len(), 1); + Ok(self.pop().unwrap().unwrap_expr()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Class(ast::Class::Bracketed(_)) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + Ast::Group(ref x) => { + let old_flags = x + .flags() + .map(|ast| self.set_flags(ast)) + .unwrap_or_else(|| self.flags()); + self.push(HirFrame::Group { old_flags }); + } + Ast::Concat(ref x) if x.asts.is_empty() => {} + Ast::Concat(_) => { + self.push(HirFrame::Concat); + } + Ast::Alternation(ref x) if x.asts.is_empty() => {} + Ast::Alternation(_) => { + self.push(HirFrame::Alternation); + } + _ => {} + } + Ok(()) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) => { + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Flags(ref x) => { + self.set_flags(&x.flags); + // Flags in the AST are generally considered directives and + // not actual sub-expressions. However, they can be used in + // the concrete syntax like `((?i))`, and we need some kind of + // indication of an expression there, and Empty is the correct + // choice. + // + // There can also be things like `(?i)+`, but we rule those out + // in the parser. In the future, we might allow them for + // consistency sake. 
+ self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Literal(ref x) => { + self.push(HirFrame::Expr(self.hir_literal(x)?)); + } + Ast::Dot(span) => { + self.push(HirFrame::Expr(self.hir_dot(span)?)); + } + Ast::Assertion(ref x) => { + self.push(HirFrame::Expr(self.hir_assertion(x)?)); + } + Ast::Class(ast::Class::Perl(ref x)) => { + if self.flags().unicode() { + let cls = self.hir_perl_unicode_class(x)?; + let hcls = hir::Class::Unicode(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } else { + let cls = self.hir_perl_byte_class(x); + let hcls = hir::Class::Bytes(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } + } + Ast::Class(ast::Class::Unicode(ref x)) => { + let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); + self.push(HirFrame::Expr(Hir::class(cls))); + } + Ast::Class(ast::Class::Bracketed(ref ast)) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + if cls.ranges().is_empty() { + return Err(self.error( + ast.span, + ErrorKind::EmptyClassNotAllowed, + )); + } + let expr = Hir::class(hir::Class::Unicode(cls)); + self.push(HirFrame::Expr(expr)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + if cls.ranges().is_empty() { + return Err(self.error( + ast.span, + ErrorKind::EmptyClassNotAllowed, + )); + } + + let expr = Hir::class(hir::Class::Bytes(cls)); + self.push(HirFrame::Expr(expr)); + } + } + Ast::Repetition(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + self.push(HirFrame::Expr(self.hir_repetition(x, expr))); + } + Ast::Group(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + let old_flags = self.pop().unwrap().unwrap_group(); + self.trans().flags.set(old_flags); + self.push(HirFrame::Expr(self.hir_group(x, expr))); + } + Ast::Concat(_) => { + let mut exprs = vec![]; + while let 
Some(HirFrame::Expr(expr)) = self.pop() { + if !expr.kind().is_empty() { + exprs.push(expr); + } + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::concat(exprs))); + } + Ast::Alternation(_) => { + let mut exprs = vec![]; + while let Some(HirFrame::Expr(expr)) = self.pop() { + exprs.push(expr); + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::alternation(exprs))); + } + } + Ok(()) + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Bracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + // We needn't handle the Union case here since the visitor will + // do it for us. + _ => {} + } + Ok(()) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) => {} + ast::ClassSetItem::Literal(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let byte = self.class_literal_byte(x)?; + cls.push(hir::ClassBytesRange::new(byte, byte)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Range(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let start = self.class_literal_byte(&x.start)?; + let end = self.class_literal_byte(&x.end)?; + cls.push(hir::ClassBytesRange::new(start, end)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Ascii(ref x) => { + if 
self.flags().unicode() { + let xcls = self.hir_ascii_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_ascii_byte_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Unicode(ref x) => { + let xcls = self.hir_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } + ast::ClassSetItem::Perl(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_perl_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_perl_byte_class(x); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Bracketed(ref ast) => { + if self.flags().unicode() { + let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); + cls2.union(&cls1); + self.push(HirFrame::ClassUnicode(cls2)); + } else { + let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); + cls2.union(&cls1); + self.push(HirFrame::ClassBytes(cls2)); + } + } + // This is handled automatically by the visitor. 
+ ast::ClassSetItem::Union(_) => {} + } + Ok(()) + } + + fn visit_class_set_binary_op_pre( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_in( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_post( + &mut self, + op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + use crate::ast::ClassSetBinaryOpKind::*; + + if self.flags().unicode() { + let mut rhs = self.pop().unwrap().unwrap_class_unicode(); + let mut lhs = self.pop().unwrap().unwrap_class_unicode(); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + if self.flags().case_insensitive() { + rhs.try_case_fold_simple().map_err(|_| { + self.error( + op.rhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + lhs.try_case_fold_simple().map_err(|_| { + self.error( + op.lhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut rhs = self.pop().unwrap().unwrap_class_bytes(); + let mut lhs = self.pop().unwrap().unwrap_class_bytes(); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => 
lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } +} + +/// The internal implementation of a translator. +/// +/// This type is responsible for carrying around the original pattern string, +/// which is not tied to the internal state of a translator. +/// +/// A TranslatorI exists for the time it takes to translate a single Ast. +#[derive(Clone, Debug)] +struct TranslatorI<'t, 'p> { + trans: &'t Translator, + pattern: &'p str, +} + +impl<'t, 'p> TranslatorI<'t, 'p> { + /// Build a new internal translator. + fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { + TranslatorI { trans: trans, pattern: pattern } + } + + /// Return a reference to the underlying translator. + fn trans(&self) -> &Translator { + &self.trans + } + + /// Push the given frame on to the call stack. + fn push(&self, frame: HirFrame) { + self.trans().stack.borrow_mut().push(frame); + } + + /// Pop the top of the call stack. If the call stack is empty, return None. + fn pop(&self) -> Option<HirFrame> { + self.trans().stack.borrow_mut().pop() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ErrorKind) -> Error { + Error { kind: kind, pattern: self.pattern.to_string(), span: span } + } + + /// Return a copy of the active flags. + fn flags(&self) -> Flags { + self.trans().flags.get() + } + + /// Set the flags of this translator from the flags set in the given AST. + /// Then, return the old flags. + fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { + let old_flags = self.flags(); + let mut new_flags = Flags::from_ast(ast_flags); + new_flags.merge(&old_flags); + self.trans().flags.set(new_flags); + old_flags + } + + fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> { + let ch = match self.literal_to_char(lit)? 
{ + byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), + hir::Literal::Unicode(ch) => ch, + }; + if self.flags().case_insensitive() { + self.hir_from_char_case_insensitive(lit.span, ch) + } else { + self.hir_from_char(lit.span, ch) + } + } + + /// Convert an Ast literal to its scalar representation. + /// + /// When Unicode mode is enabled, then this always succeeds and returns a + /// `char` (Unicode scalar value). + /// + /// When Unicode mode is disabled, then a raw byte is returned. If that + /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns + /// an error. + fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> { + if self.flags().unicode() { + return Ok(hir::Literal::Unicode(lit.c)); + } + let byte = match lit.byte() { + None => return Ok(hir::Literal::Unicode(lit.c)), + Some(byte) => byte, + }; + if byte <= 0x7F { + return Ok(hir::Literal::Unicode(byte as char)); + } + if !self.trans().allow_invalid_utf8 { + return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); + } + Ok(hir::Literal::Byte(byte)) + } + + fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> { + if !self.flags().unicode() && c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + Ok(Hir::literal(hir::Literal::Unicode(c))) + } + + fn hir_from_char_case_insensitive( + &self, + span: Span, + c: char, + ) -> Result<Hir> { + if self.flags().unicode() { + // If case folding won't do anything, then don't bother trying. 
+ let map = + unicode::contains_simple_case_mapping(c, c).map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + if !map { + return self.hir_from_char(span, c); + } + let mut cls = + hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( + c, c, + )]); + cls.try_case_fold_simple().map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + Ok(Hir::class(hir::Class::Unicode(cls))) + } else { + if c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + // If case folding won't do anything, then don't bother trying. + match c { + 'A'..='Z' | 'a'..='z' => {} + _ => return self.hir_from_char(span, c), + } + let mut cls = + hir::ClassBytes::new(vec![hir::ClassBytesRange::new( + c as u8, c as u8, + )]); + cls.case_fold_simple(); + Ok(Hir::class(hir::Class::Bytes(cls))) + } + } + + fn hir_dot(&self, span: Span) -> Result<Hir> { + let unicode = self.flags().unicode(); + if !unicode && !self.trans().allow_invalid_utf8 { + return Err(self.error(span, ErrorKind::InvalidUtf8)); + } + Ok(if self.flags().dot_matches_new_line() { + Hir::any(!unicode) + } else { + Hir::dot(!unicode) + }) + } + + fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { + let unicode = self.flags().unicode(); + let multi_line = self.flags().multi_line(); + Ok(match asst.kind { + ast::AssertionKind::StartLine => Hir::anchor(if multi_line { + hir::Anchor::StartLine + } else { + hir::Anchor::StartText + }), + ast::AssertionKind::EndLine => Hir::anchor(if multi_line { + hir::Anchor::EndLine + } else { + hir::Anchor::EndText + }), + ast::AssertionKind::StartText => { + Hir::anchor(hir::Anchor::StartText) + } + ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText), + ast::AssertionKind::WordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::Unicode + } else { + hir::WordBoundary::Ascii + }) + } + ast::AssertionKind::NotWordBoundary => { + Hir::word_boundary(if unicode { + 
hir::WordBoundary::UnicodeNegate + } else { + // It is possible for negated ASCII word boundaries to + // match at invalid UTF-8 boundaries, even when searching + // valid UTF-8. + if !self.trans().allow_invalid_utf8 { + return Err( + self.error(asst.span, ErrorKind::InvalidUtf8) + ); + } + hir::WordBoundary::AsciiNegate + }) + } + }) + } + + fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { + let kind = match group.kind { + ast::GroupKind::CaptureIndex(idx) => { + hir::GroupKind::CaptureIndex(idx) + } + ast::GroupKind::CaptureName(ref capname) => { + hir::GroupKind::CaptureName { + name: capname.name.clone(), + index: capname.index, + } + } + ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + }; + Hir::group(hir::Group { kind: kind, hir: Box::new(expr) }) + } + + fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { + let kind = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, + ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, + ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( + m, + n, + )) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) + } + }; + let greedy = + if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; + Hir::repetition(hir::Repetition { + kind: kind, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_unicode_class( + &self, + ast_class: &ast::ClassUnicode, + ) -> Result<hir::ClassUnicode> { + use crate::ast::ClassUnicodeKind::*; + + if !self.flags().unicode() { + return Err( + self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) + ); + } + let query = match 
ast_class.kind { + OneLetter(name) => ClassQuery::OneLetter(name), + Named(ref name) => ClassQuery::Binary(name), + NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { + property_name: name, + property_value: value, + }, + }; + let mut result = self.convert_unicode_class_error( + &ast_class.span, + unicode::class(query), + ); + if let Ok(ref mut class) = result { + self.unicode_fold_and_negate( + &ast_class.span, + ast_class.negated, + class, + )?; + if class.ranges().is_empty() { + let err = self + .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); + return Err(err); + } + } + result + } + + fn hir_ascii_unicode_class( + &self, + ast: &ast::ClassAscii, + ) -> Result<hir::ClassUnicode> { + let mut cls = hir::ClassUnicode::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ); + self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_ascii_byte_class( + &self, + ast: &ast::ClassAscii, + ) -> Result<hir::ClassBytes> { + let mut cls = hir::ClassBytes::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + ); + self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_perl_unicode_class( + &self, + ast_class: &ast::ClassPerl, + ) -> Result<hir::ClassUnicode> { + use crate::ast::ClassPerlKind::*; + + assert!(self.flags().unicode()); + let result = match ast_class.kind { + Digit => unicode::perl_digit(), + Space => unicode::perl_space(), + Word => unicode::perl_word(), + }; + let mut class = + self.convert_unicode_class_error(&ast_class.span, result)?; + // We needn't apply case folding here because the Perl Unicode classes + // are already closed under Unicode simple case folding. 
+ if ast_class.negated { + class.negate(); + } + Ok(class) + } + + fn hir_perl_byte_class( + &self, + ast_class: &ast::ClassPerl, + ) -> hir::ClassBytes { + use crate::ast::ClassPerlKind::*; + + assert!(!self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), + Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), + Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), + }; + // We needn't apply case folding here because the Perl ASCII classes + // are already closed (under ASCII case folding). + if ast_class.negated { + class.negate(); + } + class + } + + /// Converts the given Unicode specific error to an HIR translation error. + /// + /// The span given should approximate the position at which an error would + /// occur. + fn convert_unicode_class_error( + &self, + span: &Span, + result: unicode::Result<hir::ClassUnicode>, + ) -> Result<hir::ClassUnicode> { + result.map_err(|err| { + let sp = span.clone(); + match err { + unicode::Error::PropertyNotFound => { + self.error(sp, ErrorKind::UnicodePropertyNotFound) + } + unicode::Error::PropertyValueNotFound => { + self.error(sp, ErrorKind::UnicodePropertyValueNotFound) + } + unicode::Error::PerlClassNotFound => { + self.error(sp, ErrorKind::UnicodePerlClassNotFound) + } + } + }) + } + + fn unicode_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassUnicode, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation field, then + // the result would be the character class that matched any + // Unicode scalar value. 
+ if self.flags().case_insensitive() { + class.try_case_fold_simple().map_err(|_| { + self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) + })?; + } + if negated { + class.negate(); + } + Ok(()) + } + + fn bytes_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassBytes, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); + } + Ok(()) + } + + /// Return a scalar byte value suitable for use as a literal in a byte + /// character class. + fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { + match self.literal_to_char(ast)? { + hir::Literal::Byte(byte) => Ok(byte), + hir::Literal::Unicode(ch) => { + if ch <= 0x7F as char { + Ok(ch as u8) + } else { + // We can't feasibly support Unicode in + // byte oriented classes. Byte classes don't + // do Unicode case folding. + Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) + } + } + } + } +} + +/// A translator's representation of a regular expression's flags at any given +/// moment in time. +/// +/// Each flag can be in one of three states: absent, present but disabled or +/// present but enabled. +#[derive(Clone, Copy, Debug, Default)] +struct Flags { + case_insensitive: Option<bool>, + multi_line: Option<bool>, + dot_matches_new_line: Option<bool>, + swap_greed: Option<bool>, + unicode: Option<bool>, + // Note that `ignore_whitespace` is omitted here because it is handled + // entirely in the parser. 
+} + +impl Flags { + fn from_ast(ast: &ast::Flags) -> Flags { + let mut flags = Flags::default(); + let mut enable = true; + for item in &ast.items { + match item.kind { + ast::FlagsItemKind::Negation => { + enable = false; + } + ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { + flags.case_insensitive = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { + flags.multi_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { + flags.dot_matches_new_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { + flags.swap_greed = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { + flags.unicode = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} + } + } + flags + } + + fn merge(&mut self, previous: &Flags) { + if self.case_insensitive.is_none() { + self.case_insensitive = previous.case_insensitive; + } + if self.multi_line.is_none() { + self.multi_line = previous.multi_line; + } + if self.dot_matches_new_line.is_none() { + self.dot_matches_new_line = previous.dot_matches_new_line; + } + if self.swap_greed.is_none() { + self.swap_greed = previous.swap_greed; + } + if self.unicode.is_none() { + self.unicode = previous.unicode; + } + } + + fn case_insensitive(&self) -> bool { + self.case_insensitive.unwrap_or(false) + } + + fn multi_line(&self) -> bool { + self.multi_line.unwrap_or(false) + } + + fn dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line.unwrap_or(false) + } + + fn swap_greed(&self) -> bool { + self.swap_greed.unwrap_or(false) + } + + fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) + } +} + +fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { + let ranges: Vec<_> = ascii_class(kind) + .iter() + .cloned() + .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) + .collect(); + hir::ClassBytes::new(ranges) +} + +fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { 
+ use crate::ast::ClassAsciiKind::*; + match *kind { + Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], + Alpha => &[('A', 'Z'), ('a', 'z')], + Ascii => &[('\x00', '\x7F')], + Blank => &[('\t', '\t'), (' ', ' ')], + Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], + Digit => &[('0', '9')], + Graph => &[('!', '~')], + Lower => &[('a', 'z')], + Print => &[(' ', '~')], + Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + Space => &[ + ('\t', '\t'), + ('\n', '\n'), + ('\x0B', '\x0B'), + ('\x0C', '\x0C'), + ('\r', '\r'), + (' ', ' '), + ], + Upper => &[('A', 'Z')], + Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], + Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], + } +} + +#[cfg(test)] +mod tests { + use crate::ast::parse::ParserBuilder; + use crate::ast::{self, Ast, Position, Span}; + use crate::hir::{self, Hir, HirKind}; + use crate::unicode::{self, ClassQuery}; + + use super::{ascii_class, TranslatorBuilder}; + + // We create these errors to compare with real hir::Errors in the tests. + // We define equality between TestError and hir::Error to disregard the + // pattern string in hir::Error, which is annoying to provide in tests. 
+ #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: hir::ErrorKind, + } + + impl PartialEq<hir::Error> for TestError { + fn eq(&self, other: &hir::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq<TestError> for hir::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn parse(pattern: &str) -> Ast { + ParserBuilder::new().octal(true).build().parse(pattern).unwrap() + } + + fn t(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn t_err(pattern: &str) -> hir::Error { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap_err() + } + + fn t_bytes(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn hir_lit(s: &str) -> Hir { + match s.len() { + 0 => Hir::empty(), + _ => { + let lits = s + .chars() + .map(hir::Literal::Unicode) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_blit(s: &[u8]) -> Hir { + match s.len() { + 0 => Hir::empty(), + 1 => Hir::literal(hir::Literal::Byte(s[0])), + _ => { + let lits = s + .iter() + .cloned() + .map(hir::Literal::Byte) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_group(i: u32, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureIndex(i), + hir: Box::new(expr), + }) + } + + fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureName { + name: name.to_string(), + index: i, + }, + hir: Box::new(expr), + }) + } + + fn hir_group_nocap(expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(expr), + }) + } + + fn hir_quest(greedy: bool, expr: Hir) -> Hir { + 
Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_star(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_plus(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::Range(range), + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_alt(alts: Vec<Hir>) -> Hir { + Hir::alternation(alts) + } + + fn hir_cat(exprs: Vec<Hir>) -> Hir { + Hir::concat(exprs) + } + + #[allow(dead_code)] + fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { + Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) + } + + #[allow(dead_code)] + fn hir_uclass_perl_word() -> Hir { + Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) + } + + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + let ranges: Vec<hir::ClassUnicodeRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + let ranges: Vec<hir::ClassBytesRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { + let ranges: Vec<hir::ClassBytesRange> = ranges + .iter() + .map(|&(s, e)| { + assert!(s as u32 <= 0x7F); + assert!(e as u32 <= 0x7F); + hir::ClassBytesRange::new(s as u8, e as u8) + }) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_case_fold(expr: Hir) -> Hir { + match expr.into_kind() { + 
HirKind::Class(mut cls) => { + cls.case_fold_simple(); + Hir::class(cls) + } + _ => panic!("cannot case fold non-class Hir expr"), + } + } + + fn hir_negate(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.negate(); + Hir::class(cls) + } + _ => panic!("cannot negate non-class Hir expr"), + } + } + + #[allow(dead_code)] + fn hir_union(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot union non-class Hir exprs"), + } + } + + #[allow(dead_code)] + fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot difference non-class Hir exprs"), + } + } + + fn hir_anchor(anchor: hir::Anchor) -> Hir { + Hir::anchor(anchor) + } + + fn hir_word(wb: hir::WordBoundary) -> Hir { + Hir::word_boundary(wb) + } + + #[test] + fn empty() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?i)"), Hir::empty()); + assert_eq!(t("()"), hir_group(1, Hir::empty())); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty())); + assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); + assert_eq!( + t("()|()"), + hir_alt(vec![ + hir_group(1, Hir::empty()), + hir_group(2, Hir::empty()), + ]) + ); + assert_eq!( + t("(|b)"), + hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) 
+ ); + assert_eq!( + t("(a|)"), + hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) + ); + assert_eq!( + t("(a||c)"), + hir_group( + 1, + hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) + ) + ); + assert_eq!( + t("(||)"), + hir_group( + 1, + hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) + ) + ); + } + + #[test] + fn literal() { + assert_eq!(t("a"), hir_lit("a")); + assert_eq!(t("(?-u)a"), hir_lit("a")); + assert_eq!(t("☃"), hir_lit("☃")); + assert_eq!(t("abcd"), hir_lit("abcd")); + + assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); + assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!( + t_err("(?-u)☃"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 7) + ), + } + ); + assert_eq!( + t_err(r"(?-u)\xFF"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + } + + #[test] + fn literal_case_insensitive() { + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i:a)"), + hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)ab@c"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_uclass(&[('B', 'B'), ('b', 'b')]), + hir_lit("@"), + hir_uclass(&[('C', 'C'), ('c', 'c')]), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)β"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + + assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!( + 
t("(?-u)a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("a"), + ]) + ); + assert_eq!( + t("(?i-u)ab@c"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), + hir_lit("@"), + hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), + ]) + ); + + assert_eq!( + t_bytes("(?i-u)a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes("(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes(r"(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!( + t_err("(?i-u)β"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 8), + ), + } + ); + } + + #[test] + fn dot() { + assert_eq!( + t("."), + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),])); + assert_eq!( + t_bytes("(?-u)."), + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]) + ); + assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + + // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. 
+ assert_eq!( + t_err("(?-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(6, 1, 7) + ), + } + ); + assert_eq!( + t_err("(?s-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + fn assertions() { + assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); + assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); + assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); + + assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode)); + assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); + assert_eq!( + t_bytes(r"(?-u)\B"), + hir_word(hir::WordBoundary::AsciiNegate) + ); + + assert_eq!( + t_err(r"(?-u)\B"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + fn group() { + assert_eq!(t("(a)"), hir_group(1, hir_lit("a"))); + assert_eq!( + t("(a)(b)"), + hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ]) + ); + assert_eq!( + t("(a)|(b)"), + hir_alt(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ]) + ); + assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty())); + assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a"))); + assert_eq!( + t("(?P<foo>a)(?P<bar>b)"), + hir_cat(vec![ + hir_group_name(1, "foo", hir_lit("a")), + hir_group_name(2, "bar", hir_lit("b")), + ]) + ); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?:a)"), 
hir_group_nocap(hir_lit("a"))); + assert_eq!( + t("(?:a)(b)"), + hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_group(1, hir_lit("b")), + ]) + ); + assert_eq!( + t("(a)(?:b)(c)"), + hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_nocap(hir_lit("b")), + hir_group(2, hir_lit("c")), + ]) + ); + assert_eq!( + t("(a)(?P<foo>b)(c)"), + hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_name(2, "foo", hir_lit("b")), + hir_group(3, hir_lit("c")), + ]) + ); + assert_eq!(t("()"), hir_group(1, Hir::empty())); + assert_eq!(t("((?i))"), hir_group(1, Hir::empty())); + assert_eq!(t("((?x))"), hir_group(1, Hir::empty())); + assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty()))); + } + + #[test] + fn flags() { + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i:a)a"), + hir_cat(vec![ + hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), + hir_lit("a"), + ]) + ); + assert_eq!( + t("(?i-u:a)β"), + hir_cat(vec![ + hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("β"), + ]) + ); + assert_eq!( + t("(?:(?i-u)a)b"), + hir_cat(vec![ + hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("b"), + ]) + ); + assert_eq!( + t("((?i-u)a)b"), + hir_cat(vec![ + hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("b"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?-i:a)a"), + hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^(?i-m)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartText), + ]) + ); + assert_eq!( + t("(?U)a*a*?(?-U)a*a*?"), + hir_cat(vec![ + hir_star(false, hir_lit("a")), + 
hir_star(true, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(false, hir_lit("a")), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?:a(?i)a)a"), + hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?:a(?-i)a)a"), + hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ])), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]) + ); + } + + #[test] + fn escape() { + assert_eq!( + t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), + hir_lit(r"\.+*?()|[]{}^$#") + ); + } + + #[test] + fn repetition() { + assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); + assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); + assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); + assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); + assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); + assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); + + assert_eq!( + t("a{1}"), + hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,}"), + hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,2}"), + hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),) + ); + assert_eq!( + t("a{1}?"), + hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,}?"), + hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,2}?"), + hir_range( + false, + hir::RepetitionRange::Bounded(1, 2), + hir_lit("a"), + ) + ); + + assert_eq!( + t("ab?"), + hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + assert_eq!( + t("(ab)?"), + hir_quest( + true, + hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) + ) + ); + assert_eq!( + t("a|b?"), + hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + } + + #[test] + fn cat_alt() { + 
assert_eq!( + t("(ab)"), + hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) + ); + assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),])); + assert_eq!( + t("a|b|c"), + hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) + ); + assert_eq!( + t("ab|bc|cd"), + hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + ); + assert_eq!( + t("(a|b)"), + hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),])) + ); + assert_eq!( + t("(a|b|c)"), + hir_group( + 1, + hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) + ) + ); + assert_eq!( + t("(ab|bc|cd)"), + hir_group( + 1, + hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + ) + ); + assert_eq!( + t("(ab|(bc|(cd)))"), + hir_group( + 1, + hir_alt(vec![ + hir_lit("ab"), + hir_group( + 2, + hir_alt(vec![ + hir_lit("bc"), + hir_group(3, hir_lit("cd")), + ]) + ), + ]) + ) + ); + } + + #[test] + fn class_ascii() { + assert_eq!( + t("[[:alnum:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) + ); + assert_eq!( + t("[[:alpha:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) + ); + assert_eq!( + t("[[:ascii:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) + ); + assert_eq!( + t("[[:blank:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) + ); + assert_eq!( + t("[[:cntrl:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) + ); + assert_eq!( + t("[[:digit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t("[[:graph:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) + ); + assert_eq!( + t("[[:lower:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) + ); + assert_eq!( + t("[[:print:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) + ); + assert_eq!( + t("[[:punct:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) + ); + assert_eq!( + t("[[:space:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t("[[:upper:]]"), + 
hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) + ); + assert_eq!( + t("[[:word:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) + ); + assert_eq!( + t("[[:xdigit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) + ); + + assert_eq!( + t("[[:^lower:]]"), + hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[:lower:]]"), + hir_uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]) + ); + + assert_eq!( + t("(?-u)[[:lower:]]"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)) + ); + assert_eq!( + t("(?i-u)[[:lower:]]"), + hir_case_fold(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Lower + ))) + ); + + assert_eq!( + t_err("(?-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(16, 1, 17) + ), + } + ); + assert_eq!( + t_err("(?i-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(17, 1, 18) + ), + } + ); + } + + #[test] + fn class_ascii_multiple() { + // See: https://github.com/rust-lang/regex/issues/680 + assert_eq!( + t("[[:alnum:][:^ascii:]]"), + hir_union( + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_uclass(&[('\u{80}', '\u{10FFFF}')]), + ), + ); + assert_eq!( + t_bytes("(?-u)[[:alnum:][:^ascii:]]"), + hir_union( + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_bclass(&[(0x80, 0xFF)]), + ), + ); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn class_perl() { + // Unicode + assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!(t(r"\w"), hir_uclass_perl_word()); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\d"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = 
"unicode-case")] + assert_eq!( + t(r"(?i)\s"), + hir_uclass_query(ClassQuery::Binary("space")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); + + // Unicode, negated + assert_eq!( + t(r"\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!( + t(r"\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + + // ASCII only + assert_eq!( + t(r"(?-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t(r"(?-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t(r"(?-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + ); + assert_eq!( + t(r"(?i-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t(r"(?i-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t(r"(?i-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + ); + + // ASCII only, negated + assert_eq!( + t(r"(?-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t(r"(?-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space + ))) + ); + assert_eq!( + t(r"(?-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word + ))) + ); + assert_eq!( + t(r"(?i-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t(r"(?i-u)\S"), + 
hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space + ))) + ); + assert_eq!( + t(r"(?i-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word + ))) + ); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn class_perl_word_disabled() { + assert_eq!( + t_err(r"\w"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] + fn class_perl_space_disabled() { + assert_eq!( + t_err(r"\s"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all( + not(feature = "unicode-perl"), + not(feature = "unicode-gencat") + ))] + fn class_perl_digit_disabled() { + assert_eq!( + t_err(r"\d"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_gencat() { + assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{se PaRa ToR}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc:Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc=Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{Other}"), + hir_uclass_query(ClassQuery::Binary("Other")) + ); + assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); + + assert_eq!( + t(r"\PZ"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{separator}"), + 
hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{gc!=separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + + assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); + assert_eq!( + t(r"\p{assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + assert_eq!( + t(r"\p{gc:any}"), + hir_uclass_query(ClassQuery::Binary("Any")) + ); + assert_eq!( + t(r"\p{gc:assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{gc:ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + + assert_eq!( + t_err(r"(?-u)\pZ"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 9) + ), + } + ); + assert_eq!( + t_err(r"(?-u)\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(18, 1, 19) + ), + } + ); + assert_eq!( + t_err(r"\pE"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(3, 1, 4) + ), + } + ); + assert_eq!( + t_err(r"\p{Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err(r"\p{gc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-gencat"))] + fn class_unicode_gencat_disabled() { + assert_eq!( + t_err(r"\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + + assert_eq!( + t_err(r"\p{Any}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: 
Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-script")] + fn class_unicode_script() { + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "Greek" + )))) + ); + + assert_eq!( + t_err(r"\p{sc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + assert_eq!( + t_err(r"\p{scx:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-script"))] + fn class_unicode_script_disabled() { + assert_eq!( + t_err(r"\p{Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 10) + ), + } + ); + + assert_eq!( + t_err(r"\p{scx:Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-age")] + fn class_unicode_age() { + assert_eq!( + t_err(r"\p{age:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_any_empty() { + assert_eq!( + t_err(r"\P{any}"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-age"))] + fn 
class_unicode_age_disabled() { + assert_eq!( + t_err(r"\p{age:3.0}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + fn class_bracketed() { + assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); + assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); + assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); + assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); + assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); + assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\p{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\PZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\P{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(all( + feature = "unicode-case", + any(feature = "unicode-perl", feature = "unicode-gencat") + ))] + assert_eq!( + t(r"(?i)[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\P{greek}]"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) + ); + + assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); + assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); + 
assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[k]"), + hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[β]"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); + + assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!( + t_bytes("(?-u)[^a]"), + hir_negate(hir_bclass(&[(b'a', b'a')])) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!( + t(r"[^\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\pZ]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\p{separator}]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\p{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[\P{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + + // Test some weird cases. 
+ assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); + + assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); + assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); + + assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); + assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); + + assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); + assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); + + assert_eq!( + t_err("(?-u)[^a]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!( + t_err(r"[^\s\S]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!( + t_err(r"(?-u)[^\s\S]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(12, 1, 13) + ), + } + ); + } + + #[test] + fn class_bracketed_union() { + assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[a\pZb]"), + hir_union( + hir_uclass(&[('a', 'b')]), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] + assert_eq!( + t(r"[\pZ\p{Greek}]"), + hir_union( + 
hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[\p{age:3.0}\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("cyrillic")), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ) + ); + + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), + hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + 
t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ))) + ); + } + + #[test] + fn class_bracketed_nested() { + assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); + + assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); + assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a-b[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + ); + + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[^a-b[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')]) + ); + + assert_eq!( + t_err(r"[^a-c[^c]]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t_err(r"(?i)[^a-c[^c]]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(4, 1, 5), + Position::new(14, 1, 15) + ), + } + ); + } + + #[test] + fn class_bracketed_intersect() { + assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); + assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); + 
assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + + assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); + assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); + assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&b-c]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[abc]&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_uclass(&[('c', 'x')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[c-da-b&&a-d]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-d&&c-da-b]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + + assert_eq!( + t("(?i-u)[abc&&b-c]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[abc&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[[abc]&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_bclass(&[(b'c', b'x')])) + ); + assert_eq!( + t("(?i-u)[c-da-b&&a-d]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + assert_eq!( + t("(?i-u)[a-d&&c-da-b]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be 
unescaped after `&&`. + assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); + // `]` needs to be escaped after `&&` since it's not at start of class. + assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); + assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); + // Test precedence. + assert_eq!( + t(r"[a-w&&[^c-g]z]"), + hir_uclass(&[('a', 'b'), ('h', 'w')]) + ); + } + + #[test] + fn class_bracketed_intersect_negate() { + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^\w&&\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[\w&&\d]]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[^\w&&\d]]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); + + #[cfg(feature = "unicode-perl")] + assert_eq!( + t_bytes(r"(?-u)[^\w&&\d]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t_bytes(r"(?-u)[^[a-z&&a-c]]"), + hir_negate(hir_bclass(&[(b'a', b'c')])) + ); + assert_eq!( + t_bytes(r"(?-u)[^[\w&&\d]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t_bytes(r"(?-u)[^[^\w&&\d]]"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word + ))) + ); + } + + #[test] + fn class_bracketed_difference() { + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pL--[:ascii:]]"), + hir_difference( + hir_uclass_query(ClassQuery::Binary("letter")), + hir_uclass(&[('\0', '\x7F')]) + ) 
+ ); + + assert_eq!( + t(r"(?-u)[[:alpha:]--[:lower:]]"), + hir_bclass(&[(b'A', b'Z')]) + ); + } + + #[test] + fn class_bracketed_symmetric_difference() { + #[cfg(feature = "unicode-script")] + assert_eq!( + t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), + hir_uclass(&[ + ('\u{0342}', '\u{0342}'), + ('\u{0345}', '\u{0345}'), + ('\u{1DC0}', '\u{1DC1}'), + ]) + ); + assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')])); + + assert_eq!( + t(r"(?-u)[a-g~~c-j]"), + hir_bclass(&[(b'a', b'b'), (b'h', b'j')]) + ); + } + + #[test] + fn ignore_whitespace() { + assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); + assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment +{ # comment + 53 # comment +} #comment"), + hir_lit("S") + ); + + assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment + 53 # comment"), + hir_lit("S") + ); + assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x)\p # comment +{ # comment + Separator # comment +} # comment"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + + assert_eq!( + t(r"(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +} # comment"), + hir_range( + true, + hir::RepetitionRange::Bounded(5, 10), + hir_lit("a") + ) + ); + + assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); + } + + #[test] + fn analysis_is_always_utf8() { + // Positive examples. + assert!(t_bytes(r"a").is_always_utf8()); + assert!(t_bytes(r"ab").is_always_utf8()); + assert!(t_bytes(r"(?-u)a").is_always_utf8()); + assert!(t_bytes(r"(?-u)ab").is_always_utf8()); + assert!(t_bytes(r"\xFF").is_always_utf8()); + assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); + assert!(t_bytes(r"[^a]").is_always_utf8()); + assert!(t_bytes(r"[^a][^a]").is_always_utf8()); + assert!(t_bytes(r"\b").is_always_utf8()); + assert!(t_bytes(r"\B").is_always_utf8()); + assert!(t_bytes(r"(?-u)\b").is_always_utf8()); + + // Negative examples. 
+ assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + } + + #[test] + fn analysis_is_all_assertions() { + // Positive examples. + assert!(t(r"\b").is_all_assertions()); + assert!(t(r"\B").is_all_assertions()); + assert!(t(r"^").is_all_assertions()); + assert!(t(r"$").is_all_assertions()); + assert!(t(r"\A").is_all_assertions()); + assert!(t(r"\z").is_all_assertions()); + assert!(t(r"$^\z\A\b\B").is_all_assertions()); + assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); + assert!(t(r"^$|$^").is_all_assertions()); + assert!(t(r"((\b)+())*^").is_all_assertions()); + + // Negative examples. + assert!(!t(r"^a").is_all_assertions()); + } + + #[test] + fn analysis_is_anchored() { + // Positive examples. + assert!(t(r"^").is_anchored_start()); + assert!(t(r"$").is_anchored_end()); + assert!(t(r"^").is_line_anchored_start()); + assert!(t(r"$").is_line_anchored_end()); + + assert!(t(r"^^").is_anchored_start()); + assert!(t(r"$$").is_anchored_end()); + assert!(t(r"^^").is_line_anchored_start()); + assert!(t(r"$$").is_line_anchored_end()); + + assert!(t(r"^$").is_anchored_start()); + assert!(t(r"^$").is_anchored_end()); + assert!(t(r"^$").is_line_anchored_start()); + assert!(t(r"^$").is_line_anchored_end()); + + assert!(t(r"^foo").is_anchored_start()); + assert!(t(r"foo$").is_anchored_end()); + assert!(t(r"^foo").is_line_anchored_start()); + assert!(t(r"foo$").is_line_anchored_end()); + + assert!(t(r"^foo|^bar").is_anchored_start()); + assert!(t(r"foo$|bar$").is_anchored_end()); + assert!(t(r"^foo|^bar").is_line_anchored_start()); + assert!(t(r"foo$|bar$").is_line_anchored_end()); + + assert!(t(r"^(foo|bar)").is_anchored_start()); + assert!(t(r"(foo|bar)$").is_anchored_end()); + assert!(t(r"^(foo|bar)").is_line_anchored_start()); + 
assert!(t(r"(foo|bar)$").is_line_anchored_end()); + + assert!(t(r"^+").is_anchored_start()); + assert!(t(r"$+").is_anchored_end()); + assert!(t(r"^+").is_line_anchored_start()); + assert!(t(r"$+").is_line_anchored_end()); + assert!(t(r"^++").is_anchored_start()); + assert!(t(r"$++").is_anchored_end()); + assert!(t(r"^++").is_line_anchored_start()); + assert!(t(r"$++").is_line_anchored_end()); + assert!(t(r"(^)+").is_anchored_start()); + assert!(t(r"($)+").is_anchored_end()); + assert!(t(r"(^)+").is_line_anchored_start()); + assert!(t(r"($)+").is_line_anchored_end()); + + assert!(t(r"$^").is_anchored_start()); + assert!(t(r"$^").is_anchored_start()); + assert!(t(r"$^").is_line_anchored_end()); + assert!(t(r"$^").is_line_anchored_end()); + assert!(t(r"$^|^$").is_anchored_start()); + assert!(t(r"$^|^$").is_anchored_end()); + assert!(t(r"$^|^$").is_line_anchored_start()); + assert!(t(r"$^|^$").is_line_anchored_end()); + + assert!(t(r"\b^").is_anchored_start()); + assert!(t(r"$\b").is_anchored_end()); + assert!(t(r"\b^").is_line_anchored_start()); + assert!(t(r"$\b").is_line_anchored_end()); + assert!(t(r"^(?m:^)").is_anchored_start()); + assert!(t(r"(?m:$)$").is_anchored_end()); + assert!(t(r"^(?m:^)").is_line_anchored_start()); + assert!(t(r"(?m:$)$").is_line_anchored_end()); + assert!(t(r"(?m:^)^").is_anchored_start()); + assert!(t(r"$(?m:$)").is_anchored_end()); + assert!(t(r"(?m:^)^").is_line_anchored_start()); + assert!(t(r"$(?m:$)").is_line_anchored_end()); + + // Negative examples. 
+ assert!(!t(r"(?m)^").is_anchored_start()); + assert!(!t(r"(?m)$").is_anchored_end()); + assert!(!t(r"(?m:^$)|$^").is_anchored_start()); + assert!(!t(r"(?m:^$)|$^").is_anchored_end()); + assert!(!t(r"$^|(?m:^$)").is_anchored_start()); + assert!(!t(r"$^|(?m:^$)").is_anchored_end()); + + assert!(!t(r"a^").is_anchored_start()); + assert!(!t(r"$a").is_anchored_start()); + assert!(!t(r"a^").is_line_anchored_start()); + assert!(!t(r"$a").is_line_anchored_start()); + + assert!(!t(r"a^").is_anchored_end()); + assert!(!t(r"$a").is_anchored_end()); + assert!(!t(r"a^").is_line_anchored_end()); + assert!(!t(r"$a").is_line_anchored_end()); + + assert!(!t(r"^foo|bar").is_anchored_start()); + assert!(!t(r"foo|bar$").is_anchored_end()); + assert!(!t(r"^foo|bar").is_line_anchored_start()); + assert!(!t(r"foo|bar$").is_line_anchored_end()); + + assert!(!t(r"^*").is_anchored_start()); + assert!(!t(r"$*").is_anchored_end()); + assert!(!t(r"^*").is_line_anchored_start()); + assert!(!t(r"$*").is_line_anchored_end()); + assert!(!t(r"^*+").is_anchored_start()); + assert!(!t(r"$*+").is_anchored_end()); + assert!(!t(r"^*+").is_line_anchored_start()); + assert!(!t(r"$*+").is_line_anchored_end()); + assert!(!t(r"^+*").is_anchored_start()); + assert!(!t(r"$+*").is_anchored_end()); + assert!(!t(r"^+*").is_line_anchored_start()); + assert!(!t(r"$+*").is_line_anchored_end()); + assert!(!t(r"(^)*").is_anchored_start()); + assert!(!t(r"($)*").is_anchored_end()); + assert!(!t(r"(^)*").is_line_anchored_start()); + assert!(!t(r"($)*").is_line_anchored_end()); + } + + #[test] + fn analysis_is_line_anchored() { + assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start()); + assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end()); + + assert!(t(r"(?m)^foo|^bar").is_line_anchored_start()); + assert!(t(r"(?m)foo$|bar$").is_line_anchored_end()); + + assert!(t(r"(?m)^").is_line_anchored_start()); + assert!(t(r"(?m)$").is_line_anchored_end()); + + assert!(t(r"(?m:^$)|$^").is_line_anchored_start()); + 
assert!(t(r"(?m:^$)|$^").is_line_anchored_end()); + + assert!(t(r"$^|(?m:^$)").is_line_anchored_start()); + assert!(t(r"$^|(?m:^$)").is_line_anchored_end()); + } + + #[test] + fn analysis_is_any_anchored() { + // Positive examples. + assert!(t(r"^").is_any_anchored_start()); + assert!(t(r"$").is_any_anchored_end()); + assert!(t(r"\A").is_any_anchored_start()); + assert!(t(r"\z").is_any_anchored_end()); + + // Negative examples. + assert!(!t(r"(?m)^").is_any_anchored_start()); + assert!(!t(r"(?m)$").is_any_anchored_end()); + assert!(!t(r"$").is_any_anchored_start()); + assert!(!t(r"^").is_any_anchored_end()); + } + + #[test] + fn analysis_is_match_empty() { + // Positive examples. + assert!(t(r"").is_match_empty()); + assert!(t(r"()").is_match_empty()); + assert!(t(r"()*").is_match_empty()); + assert!(t(r"()+").is_match_empty()); + assert!(t(r"()?").is_match_empty()); + assert!(t(r"a*").is_match_empty()); + assert!(t(r"a?").is_match_empty()); + assert!(t(r"a{0}").is_match_empty()); + assert!(t(r"a{0,}").is_match_empty()); + assert!(t(r"a{0,1}").is_match_empty()); + assert!(t(r"a{0,10}").is_match_empty()); + #[cfg(feature = "unicode-gencat")] + assert!(t(r"\pL*").is_match_empty()); + assert!(t(r"a*|b").is_match_empty()); + assert!(t(r"b|a*").is_match_empty()); + assert!(t(r"a|").is_match_empty()); + assert!(t(r"|a").is_match_empty()); + assert!(t(r"a||b").is_match_empty()); + assert!(t(r"a*a?(abcd)*").is_match_empty()); + assert!(t(r"^").is_match_empty()); + assert!(t(r"$").is_match_empty()); + assert!(t(r"(?m)^").is_match_empty()); + assert!(t(r"(?m)$").is_match_empty()); + assert!(t(r"\A").is_match_empty()); + assert!(t(r"\z").is_match_empty()); + assert!(t(r"\B").is_match_empty()); + assert!(t_bytes(r"(?-u)\B").is_match_empty()); + assert!(t(r"\b").is_match_empty()); + assert!(t(r"(?-u)\b").is_match_empty()); + + // Negative examples. 
+ assert!(!t(r"a+").is_match_empty()); + assert!(!t(r"a{1}").is_match_empty()); + assert!(!t(r"a{1,}").is_match_empty()); + assert!(!t(r"a{1,2}").is_match_empty()); + assert!(!t(r"a{1,10}").is_match_empty()); + assert!(!t(r"b|a").is_match_empty()); + assert!(!t(r"a*a+(abcd)*").is_match_empty()); + } + + #[test] + fn analysis_is_literal() { + // Positive examples. + assert!(t(r"a").is_literal()); + assert!(t(r"ab").is_literal()); + assert!(t(r"abc").is_literal()); + assert!(t(r"(?m)abc").is_literal()); + + // Negative examples. + assert!(!t(r"").is_literal()); + assert!(!t(r"^").is_literal()); + assert!(!t(r"a|b").is_literal()); + assert!(!t(r"(a)").is_literal()); + assert!(!t(r"a+").is_literal()); + assert!(!t(r"foo(a)").is_literal()); + assert!(!t(r"(a)foo").is_literal()); + assert!(!t(r"[a]").is_literal()); + } + + #[test] + fn analysis_is_alternation_literal() { + // Positive examples. + assert!(t(r"a").is_alternation_literal()); + assert!(t(r"ab").is_alternation_literal()); + assert!(t(r"abc").is_alternation_literal()); + assert!(t(r"(?m)abc").is_alternation_literal()); + assert!(t(r"a|b").is_alternation_literal()); + assert!(t(r"a|b|c").is_alternation_literal()); + assert!(t(r"foo|bar").is_alternation_literal()); + assert!(t(r"foo|bar|baz").is_alternation_literal()); + + // Negative examples. 
+ assert!(!t(r"").is_alternation_literal()); + assert!(!t(r"^").is_alternation_literal()); + assert!(!t(r"(a)").is_alternation_literal()); + assert!(!t(r"a+").is_alternation_literal()); + assert!(!t(r"foo(a)").is_alternation_literal()); + assert!(!t(r"(a)foo").is_alternation_literal()); + assert!(!t(r"[a]").is_alternation_literal()); + assert!(!t(r"[a]|b").is_alternation_literal()); + assert!(!t(r"a|[b]").is_alternation_literal()); + assert!(!t(r"(a)|b").is_alternation_literal()); + assert!(!t(r"a|(b)").is_alternation_literal()); + } +} diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs new file mode 100644 index 000000000..4f5a70909 --- /dev/null +++ b/vendor/regex-syntax/src/hir/visitor.rs @@ -0,0 +1,203 @@ +use crate::hir::{self, Hir, HirKind}; + +/// A trait for visiting the high-level IR (HIR) in depth first order. +/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on a high-level intermediate representation of a regular +/// expression without necessarily using recursion. In particular, this permits +/// callers to do case analysis with constant stack usage, which can be +/// important since the size of an HIR may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +pub trait Visitor { + /// The result of visiting an HIR. + type Output; + /// An error that visiting an HIR might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the HIR or an error. + fn finish(self) -> Result<Self::Output, Self::Err>; + + /// This method is called before beginning traversal of the HIR. + fn start(&mut self) {} + + /// This method is called on an `Hir` before descending into child `Hir` + /// nodes. 
+ fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Hir` after descending all of its child + /// `Hir` nodes. + fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an alternation. + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Hir` while calling +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Hir` without using a stack size proportional to the depth +/// of the `Hir`. Namely, this method will instead use constant stack space, +/// but will use heap space proportional to the size of the `Hir`. This may be +/// desirable in cases where the size of `Hir` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> { + HeapVisitor::new().visit(hir, visitor) +} + +/// HeapVisitor visits every item in an `Hir` recursively using constant stack +/// size and a heap size proportional to the size of the `Hir`. +struct HeapVisitor<'a> { + /// A stack of `Hir` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Hir, Frame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Hir`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a hir::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. 
+ Group(&'a hir::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![] } + } + + fn visit<V: Visitor>( + &mut self, + mut hir: &'a Hir, + mut visitor: V, + ) -> Result<V::Output, V::Err> { + self.stack.clear(); + + visitor.start(); + loop { + visitor.visit_pre(hir)?; + if let Some(x) = self.induct(hir) { + let child = x.child(); + self.stack.push((hir, x)); + hir = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(hir)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_hir, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_hir, frame)) => (post_hir, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation { .. } = x { + visitor.visit_alternation_in()?; + } + hir = x.child(); + self.stack.push((post_hir, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this HIR, so we can post visit it now. + visitor.visit_post(post_hir)?; + } + } + } + + /// Build a stack frame for the given HIR if one is needed (which occurs if + /// and only if there are child nodes in the HIR). Otherwise, return None. 
+ fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> { + match *hir.kind() { + HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), + HirKind::Group(ref x) => Some(Frame::Group(x)), + HirKind::Concat(ref x) if x.is_empty() => None, + HirKind::Concat(ref x) => { + Some(Frame::Concat { head: &x[0], tail: &x[1..] }) + } + HirKind::Alternation(ref x) if x.is_empty() => None, + HirKind::Alternation(ref x) => { + Some(Frame::Alternation { head: &x[0], tail: &x[1..] }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child HIR node to visit. + fn child(&self) -> &'a Hir { + match *self { + Frame::Repetition(rep) => &rep.hir, + Frame::Group(group) => &group.hir, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} diff --git a/vendor/regex-syntax/src/lib.rs b/vendor/regex-syntax/src/lib.rs new file mode 100644 index 000000000..9e9af756a --- /dev/null +++ b/vendor/regex-syntax/src/lib.rs @@ -0,0 +1,312 @@ +/*! +This crate provides a robust regular expression parser. + +This crate defines two primary types: + +* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. + An abstract syntax corresponds to a *structured representation* of the + concrete syntax of a regular expression, where the concrete syntax is the + pattern string itself (e.g., `foo(bar)+`). 
Given some abstract syntax, it + can be converted back to the original concrete syntax (modulo some details, + like whitespace). To a first approximation, the abstract syntax is complex + and difficult to analyze. +* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation + ("HIR" or "high-level IR" for short) of a regular expression. It corresponds to + an intermediate state of a regular expression that sits between the abstract + syntax and the low level compiled opcodes that are eventually responsible for + executing a regular expression search. Given some high-level IR, it is not + possible to produce the original concrete syntax (although it is possible to + produce an equivalent concrete syntax, it will likely scarcely resemble + the original pattern). To a first approximation, the high-level IR is simple + and easy to analyze. + +These two types come with conversion routines: + +* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete + syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). +* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) + converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). + +As a convenience, the above two conversion routines are combined into one via +the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first +convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. + + +# Example + +This example shows how to parse a pattern string into its HIR: + +``` +use regex_syntax::Parser; +use regex_syntax::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +# Concrete syntax supported + +The concrete syntax is documented as part of the public API of the +[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax).
+ + +# Input safety + +A key feature of this library is that it is safe to use with end user facing +input. This plays a significant role in the internal implementation. In +particular: + +1. Parsers provide a `nest_limit` option that permits callers to control how + deeply nested a regular expression is allowed to be. This makes it possible + to do case analysis over an `Ast` or an `Hir` using recursion without + worrying about stack overflow. +2. Since relying on a particular stack size is brittle, this crate goes to + great lengths to ensure that all interactions with both the `Ast` and the + `Hir` do not use recursion. Namely, they use constant stack space and heap + space proportional to the size of the original pattern string (in bytes). + This includes the type's corresponding destructors. (One exception to this + is literal extraction, but this will eventually get fixed.) + + +# Error reporting + +The `Display` implementations on all `Error` types exposed in this library +provide nice human readable errors that are suitable for showing to end users +in a monospace font. + + +# Literal extraction + +This crate provides limited support for +[literal extraction from `Hir` values](hir/literal/struct.Literals.html). +Be warned that literal extraction currently uses recursion, and therefore, +stack size proportional to the size of the `Hir`. + +The purpose of literal extraction is to speed up searches. That is, if you +know a regular expression must match a prefix or suffix literal, then it is +often quicker to search for instances of that literal, and then confirm or deny +the match using the full regular expression engine. These optimizations are +done automatically in the `regex` crate. + + +# Crate features + +An important feature provided by this crate is its Unicode support. This +includes things like case folding, boolean properties, general categories, +scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`. 
+However, a downside of this support is that it requires bundling several +Unicode data tables that are substantial in size. + +A fair number of use cases do not require full Unicode support. For this +reason, this crate exposes a number of features to control which Unicode +data is available. + +If a regular expression attempts to use a Unicode feature that is not available +because the corresponding crate feature was disabled, then translating that +regular expression to an `Hir` will return an error. (It is still possible to +construct an `Ast` for such a regular expression, since Unicode data is not +used until translation to an `Hir`.) Stated differently, enabling or disabling +any of the features below can only add or subtract from the total set of valid +regular expressions. Enabling or disabling a feature will never modify the +match semantics of a regular expression. + +The following features are available: + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0. +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+ This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. +*/ + +#![deny(missing_docs)] +#![warn(missing_debug_implementations)] +#![forbid(unsafe_code)] + +pub use crate::error::{Error, Result}; +pub use crate::parser::{Parser, ParserBuilder}; +pub use crate::unicode::UnicodeWordError; + +pub mod ast; +mod either; +mod error; +pub mod hir; +mod parser; +mod unicode; +mod unicode_tables; +pub mod utf8; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + let mut quoted = String::new(); + escape_into(text, &mut quoted); + quoted +} + +/// Escapes all meta characters in `text` and writes the result into `buf`. +/// +/// This will append escape characters into the given buffer. The characters +/// that are appended are safe to use as a literal in a regular expression. 
+pub fn escape_into(text: &str, buf: &mut String) { + buf.reserve(text.len()); + for c in text.chars() { + if is_meta_character(c) { + buf.push('\\'); + } + buf.push(c); + } +} + +/// Returns true if the given character has significance in a regex. +/// +/// These are the only characters that are allowed to be escaped, with one +/// exception: an ASCII space character may be escaped when extended mode (with +/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns +/// `false`. +/// +/// Note that the set of characters for which this function returns `true` or +/// `false` is fixed and won't change in a semver compatible release. +pub fn is_meta_character(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' + | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, + _ => false, + } +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Panics +/// +/// If the `unicode-perl` feature is not enabled, then this function panics. +/// For this reason, it is recommended that callers use +/// [`try_is_word_character`](fn.try_is_word_character.html) +/// instead. +pub fn is_word_character(c: char) -> bool { + try_is_word_character(c).expect("unicode-perl feature must be enabled") +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
+/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Errors +/// +/// If the `unicode-perl` feature is not enabled, then this function always +/// returns an error. +pub fn try_is_word_character( + c: char, +) -> std::result::Result<bool, UnicodeWordError> { + unicode::is_word_character(c) +} + +/// Returns true if and only if the given character is an ASCII word character. +/// +/// An ASCII word character is defined by the following character class: +/// `[_0-9a-zA-Z]'. +pub fn is_word_byte(c: u8) -> bool { + match c { + b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escape_meta() { + assert_eq!( + escape(r"\.+*?()|[]{}^$#&-~"), + r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string() + ); + } + + #[test] + fn word_byte() { + assert!(is_word_byte(b'a')); + assert!(!is_word_byte(b'-')); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn word_char() { + assert!(is_word_character('a'), "ASCII"); + assert!(is_word_character('à '), "Latin-1"); + assert!(is_word_character('β'), "Greek"); + assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)"); + assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)"); + assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)"); + assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)"); + assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)"); + assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)"); + assert!(!is_word_character('-')); + assert!(!is_word_character('â')); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_panic() { + assert!(is_word_character('a')); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_error() { 
+ assert!(try_is_word_character('a').is_err()); + } +} diff --git a/vendor/regex-syntax/src/parser.rs b/vendor/regex-syntax/src/parser.rs new file mode 100644 index 000000000..a5ee524a8 --- /dev/null +++ b/vendor/regex-syntax/src/parser.rs @@ -0,0 +1,200 @@ +use crate::ast; +use crate::hir; + +use crate::Result; + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +/// +/// This type combines the builder options for both the +/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) +/// and the +/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). +#[derive(Clone, Debug, Default)] +pub struct ParserBuilder { + ast: ast::parse::ParserBuilder, + hir: hir::translate::TranslatorBuilder, +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder::default() + } + + /// Build a parser from this configuration. + pub fn build(&self) -> Parser { + Parser { ast: self.ast.build(), hir: self.hir.build() } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. 
In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.ast.nest_limit(limit); + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.octal(yes); + self + } + + /// When enabled, the parser will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the parser is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// parser will return an error). 
+ /// + /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII + /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause + /// the parser to return an error. Namely, a negated ASCII word boundary + /// can result in matching positions that aren't valid UTF-8 boundaries. + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.allow_invalid_utf8(yes); + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.ignore_whitespace(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.case_insensitive(yes); + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.multi_line(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. 
+ pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.dot_matches_new_line(yes); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.unicode(yes); + self + } +} + +/// A convenience parser for regular expressions. +/// +/// This parser takes as input a regular expression pattern string (the +/// "concrete syntax") and returns a high-level intermediate representation +/// (the HIR) suitable for most types of analysis. In particular, this parser +/// hides the intermediate state of producing an AST (the "abstract syntax"). +/// The AST is itself far more complex than the HIR, so this parser serves as a +/// convenience for never having to deal with it at all. +/// +/// If callers have more fine grained use cases that need an AST, then please +/// see the [`ast::parse`](ast/parse/index.html) module. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + ast: ast::parse::Parser, + hir: hir::translate::Translator, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with `parse` method. 
The parse method returns + /// a high level intermediate representation of the given regular + /// expression. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into a high level intermediate + /// representation. + pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> { + let ast = self.ast.parse(pattern)?; + let hir = self.hir.translate(pattern, &ast)?; + Ok(hir) + } +} diff --git a/vendor/regex-syntax/src/unicode.rs b/vendor/regex-syntax/src/unicode.rs new file mode 100644 index 000000000..b894c7db2 --- /dev/null +++ b/vendor/regex-syntax/src/unicode.rs @@ -0,0 +1,999 @@ +use std::error; +use std::fmt; +use std::result; + +use crate::hir; + +/// A type alias for errors specific to Unicode handling of classes. +pub type Result<T> = result::Result<T, Error>; + +/// An inclusive range of codepoints from a generated file (hence the static +/// lifetime). +type Range = &'static [(char, char)]; + +/// An error that occurs when dealing with Unicode. +/// +/// We don't impl the Error trait here because these always get converted +/// into other public errors. (This error type isn't exported.) +#[derive(Debug)] +pub enum Error { + PropertyNotFound, + PropertyValueNotFound, + // Not used when unicode-perl is enabled. + #[allow(dead_code)] + PerlClassNotFound, +} + +/// A type alias for errors specific to Unicode case folding. +pub type FoldResult<T> = result::Result<T, CaseFoldError>; + +/// An error that occurs when Unicode-aware simple case folding fails. +/// +/// This error can occur when the case mapping tables necessary for Unicode +/// aware case folding are unavailable. This only occurs when the +/// `unicode-case` feature is disabled. (The feature is enabled by default.) 
+#[derive(Debug)] +pub struct CaseFoldError(()); + +impl error::Error for CaseFoldError {} + +impl fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Unicode-aware case folding is not available \ + (probably because the unicode-case feature is not enabled)" + ) + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. This only occurs when the +/// `unicode-perl` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct UnicodeWordError(()); + +impl error::Error for UnicodeWordError {} + +impl fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Unicode-aware \\w class is not available \ + (probably because the unicode-perl feature is not enabled)" + ) + } +} + +/// Return an iterator over the equivalence class of simple case mappings +/// for the given codepoint. The equivalence class does not include the +/// given codepoint. +/// +/// If the equivalence class is empty, then this returns the next scalar +/// value that has a non-empty equivalence class, if it exists. If no such +/// scalar value exists, then `None` is returned. The point of this behavior +/// is to permit callers to avoid calling `simple_fold` more than they need +/// to, since there is some cost to fetching the equivalence class. +/// +/// This returns an error if the Unicode case folding tables are not available. 
+pub fn simple_fold( + c: char, +) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { + #[cfg(not(feature = "unicode-case"))] + fn imp( + _: char, + ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> + { + use std::option::IntoIter; + Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp( + c: char, + ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> + { + use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + + Ok(CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |&(c1, _)| c1) + .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) + .map_err(|i| { + if i >= CASE_FOLDING_SIMPLE.len() { + None + } else { + Some(CASE_FOLDING_SIMPLE[i].0) + } + })) + } + + imp(c) +} + +/// Returns true if and only if the given (inclusive) range contains at least +/// one Unicode scalar value that has a non-empty non-trivial simple case +/// mapping. +/// +/// This function panics if `end < start`. +/// +/// This returns an error if the Unicode case folding tables are not available. +pub fn contains_simple_case_mapping( + start: char, + end: char, +) -> FoldResult<bool> { + #[cfg(not(feature = "unicode-case"))] + fn imp(_: char, _: char) -> FoldResult<bool> { + Err(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp(start: char, end: char) -> FoldResult<bool> { + use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + use std::cmp::Ordering; + + assert!(start <= end); + Ok(CASE_FOLDING_SIMPLE + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(start, end) +} + +/// A query for finding a character class defined by Unicode. This supports +/// either use of a property name directly, or lookup by property value. 
The +/// former generally refers to Binary properties (see UTS#44, Table 8), but +/// as a special exception (see UTS#18, Section 1.2) both general categories +/// (an enumeration) and scripts (a catalog) are supported as if each of their +/// possible values were a binary property. +/// +/// In all circumstances, property names and values are normalized and +/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. +/// +/// The lifetime `'a` refers to the shorter of the lifetimes of property name +/// and property value. +#[derive(Debug)] +pub enum ClassQuery<'a> { + /// Return a class corresponding to a Unicode binary property, named by + /// a single letter. + OneLetter(char), + /// Return a class corresponding to a Unicode binary property. + /// + /// Note that, by special exception (see UTS#18, Section 1.2), both + /// general category values and script values are permitted here as if + /// they were a binary property. + Binary(&'a str), + /// Return a class corresponding to all codepoints whose property + /// (identified by `property_name`) corresponds to the given value + /// (identified by `property_value`). + ByValue { + /// A property name. + property_name: &'a str, + /// A property value. + property_value: &'a str, + }, +} + +impl<'a> ClassQuery<'a> { + fn canonicalize(&self) -> Result<CanonicalClassQuery> { + match *self { + ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), + ClassQuery::Binary(name) => self.canonical_binary(name), + ClassQuery::ByValue { property_name, property_value } => { + let property_name = symbolic_name_normalize(property_name); + let property_value = symbolic_name_normalize(property_value); + + let canon_name = match canonical_prop(&property_name)? { + None => return Err(Error::PropertyNotFound), + Some(canon_name) => canon_name, + }; + Ok(match canon_name { + "General_Category" => { + let canon = match canonical_gencat(&property_value)? 
{ + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::GeneralCategory(canon) + } + "Script" => { + let canon = match canonical_script(&property_value)? { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::Script(canon) + } + _ => { + let vals = match property_values(canon_name)? { + None => return Err(Error::PropertyValueNotFound), + Some(vals) => vals, + }; + let canon_val = + match canonical_value(vals, &property_value) { + None => { + return Err(Error::PropertyValueNotFound) + } + Some(canon_val) => canon_val, + }; + CanonicalClassQuery::ByValue { + property_name: canon_name, + property_value: canon_val, + } + } + }) + } + } + } + + fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { + let norm = symbolic_name_normalize(name); + + // This is a special case where 'cf' refers to the 'Format' general + // category, but where the 'cf' abbreviation is also an abbreviation + // for the 'Case_Folding' property. But we want to treat it as + // a general category. (Currently, we don't even support the + // 'Case_Folding' property. But if we do in the future, users will be + // required to spell it out.) + if norm != "cf" { + if let Some(canon) = canonical_prop(&norm)? { + return Ok(CanonicalClassQuery::Binary(canon)); + } + } + if let Some(canon) = canonical_gencat(&norm)? { + return Ok(CanonicalClassQuery::GeneralCategory(canon)); + } + if let Some(canon) = canonical_script(&norm)? { + return Ok(CanonicalClassQuery::Script(canon)); + } + Err(Error::PropertyNotFound) + } +} + +/// Like ClassQuery, but its parameters have been canonicalized. This also +/// differentiates binary properties from flattened general categories and +/// scripts. +#[derive(Debug, Eq, PartialEq)] +enum CanonicalClassQuery { + /// The canonical binary property name. + Binary(&'static str), + /// The canonical general category name. 
+ GeneralCategory(&'static str), + /// The canonical script name. + Script(&'static str), + /// An arbitrary association between property and value, both of which + /// have been canonicalized. + /// + /// Note that by construction, the property name of ByValue will never + /// be General_Category or Script. Those two cases are subsumed by the + /// eponymous variants. + ByValue { + /// The canonical property name. + property_name: &'static str, + /// The canonical property value. + property_value: &'static str, + }, +} + +/// Looks up a Unicode class given a query. If one doesn't exist, then +/// an error is returned. +pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> { + use self::CanonicalClassQuery::*; + + match query.canonicalize()? { + Binary(name) => bool_property(name), + GeneralCategory(name) => gencat(name), + Script(name) => script(name), + ByValue { property_name: "Age", property_value } => { + let mut class = hir::ClassUnicode::empty(); + for set in ages(property_value)? { + class.union(&hir_class(set)); + } + Ok(class) + } + ByValue { property_name: "Script_Extensions", property_value } => { + script_extension(property_value) + } + ByValue { + property_name: "Grapheme_Cluster_Break", + property_value, + } => gcb(property_value), + ByValue { property_name: "Sentence_Break", property_value } => { + sb(property_value) + } + ByValue { property_name: "Word_Break", property_value } => { + wb(property_value) + } + _ => { + // What else should we support? + Err(Error::PropertyNotFound) + } + } +} + +/// Returns a Unicode aware class for \w. +/// +/// This returns an error if the data is not available for \w. 
+pub fn perl_word() -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-perl"))] + fn imp() -> Result<hir::ClassUnicode> { + Err(Error::PerlClassNotFound) + } + + #[cfg(feature = "unicode-perl")] + fn imp() -> Result<hir::ClassUnicode> { + use crate::unicode_tables::perl_word::PERL_WORD; + Ok(hir_class(PERL_WORD)) + } + + imp() +} + +/// Returns a Unicode aware class for \s. +/// +/// This returns an error if the data is not available for \s. +pub fn perl_space() -> Result<hir::ClassUnicode> { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] + fn imp() -> Result<hir::ClassUnicode> { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] + fn imp() -> Result<hir::ClassUnicode> { + use crate::unicode_tables::perl_space::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + #[cfg(feature = "unicode-bool")] + fn imp() -> Result<hir::ClassUnicode> { + use crate::unicode_tables::property_bool::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + imp() +} + +/// Returns a Unicode aware class for \d. +/// +/// This returns an error if the data is not available for \d. +pub fn perl_digit() -> Result<hir::ClassUnicode> { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] + fn imp() -> Result<hir::ClassUnicode> { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] + fn imp() -> Result<hir::ClassUnicode> { + use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + #[cfg(feature = "unicode-gencat")] + fn imp() -> Result<hir::ClassUnicode> { + use crate::unicode_tables::general_category::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + imp() +} + +/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. 
+pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { + let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::ClassUnicode::new(hir_ranges) +} + +/// Returns true only if the given codepoint is in the `\w` character class. +/// +/// If the `unicode-perl` feature is not enabled, then this returns an error. +pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { + #[cfg(not(feature = "unicode-perl"))] + fn imp(_: char) -> result::Result<bool, UnicodeWordError> { + Err(UnicodeWordError(())) + } + + #[cfg(feature = "unicode-perl")] + fn imp(c: char) -> result::Result<bool, UnicodeWordError> { + use crate::is_word_byte; + use crate::unicode_tables::perl_word::PERL_WORD; + use std::cmp::Ordering; + + if c <= 0x7F as char && is_word_byte(c as u8) { + return Ok(true); + } + Ok(PERL_WORD + .binary_search_by(|&(start, end)| { + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(c) +} + +/// A mapping of property values for a specific property. +/// +/// The first element of each tuple is a normalized property value while the +/// second element of each tuple is the corresponding canonical property +/// value. +type PropertyValues = &'static [(&'static str, &'static str)]; + +fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { + Ok(match normalized_value { + "any" => Some("Any"), + "assigned" => Some("Assigned"), + "ascii" => Some("ASCII"), + _ => { + let gencats = property_values("General_Category")?.unwrap(); + canonical_value(gencats, normalized_value) + } + }) +} + +fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { + let scripts = property_values("Script")?.unwrap(); + Ok(canonical_value(scripts, normalized_value)) +} + +/// Find the canonical property name for the given normalized property name. 
+/// +/// If no such property exists, then `None` is returned. +/// +/// The normalized property name must have been normalized according to +/// UAX44 LM3, which can be done using `symbolic_name_normalize`. +/// +/// If the property names data is not available, then an error is returned. +fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &str) -> Result<Option<&'static str>> { + Err(Error::PropertyNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &str) -> Result<Option<&'static str>> { + use crate::unicode_tables::property_names::PROPERTY_NAMES; + + Ok(PROPERTY_NAMES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_NAMES[i].1)) + } + + imp(normalized_name) +} + +/// Find the canonical property value for the given normalized property +/// value. +/// +/// The given property values should correspond to the values for the property +/// under question, which can be found using `property_values`. +/// +/// If no such property value exists, then `None` is returned. +/// +/// The normalized property value must have been normalized according to +/// UAX44 LM3, which can be done using `symbolic_name_normalize`. +fn canonical_value( + vals: PropertyValues, + normalized_value: &str, +) -> Option<&'static str> { + vals.binary_search_by_key(&normalized_value, |&(n, _)| n) + .ok() + .map(|i| vals[i].1) +} + +/// Return the table of property values for the given property name. +/// +/// If the property values data is not available, then an error is returned. 
+fn property_values( + canonical_property_name: &'static str, +) -> Result<Option<PropertyValues>> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &'static str) -> Result<Option<PropertyValues>> { + Err(Error::PropertyValueNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &'static str) -> Result<Option<PropertyValues>> { + use crate::unicode_tables::property_values::PROPERTY_VALUES; + + Ok(PROPERTY_VALUES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_VALUES[i].1)) + } + + imp(canonical_property_name) +} + +// This is only used in some cases, but small enough to just let it be dead +// instead of figuring out (and maintaining) the right set of features. +#[allow(dead_code)] +fn property_set( + name_map: &'static [(&'static str, Range)], + canonical: &'static str, +) -> Option<Range> { + name_map + .binary_search_by_key(&canonical, |x| x.0) + .ok() + .map(|i| name_map[i].1) +} + +/// Returns an iterator over Unicode Age sets. Each item corresponds to a set +/// of codepoints that were added in a particular revision of Unicode. The +/// iterator yields items in chronological order. +/// +/// If the given age value isn't valid or if the data isn't available, then an +/// error is returned instead. 
+fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { + #[cfg(not(feature = "unicode-age"))] + fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { + use std::option::IntoIter; + Err::<IntoIter<Range>, _>(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-age")] + fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { + use crate::unicode_tables::age; + + const AGES: &'static [(&'static str, Range)] = &[ + ("V1_1", age::V1_1), + ("V2_0", age::V2_0), + ("V2_1", age::V2_1), + ("V3_0", age::V3_0), + ("V3_1", age::V3_1), + ("V3_2", age::V3_2), + ("V4_0", age::V4_0), + ("V4_1", age::V4_1), + ("V5_0", age::V5_0), + ("V5_1", age::V5_1), + ("V5_2", age::V5_2), + ("V6_0", age::V6_0), + ("V6_1", age::V6_1), + ("V6_2", age::V6_2), + ("V6_3", age::V6_3), + ("V7_0", age::V7_0), + ("V8_0", age::V8_0), + ("V9_0", age::V9_0), + ("V10_0", age::V10_0), + ("V11_0", age::V11_0), + ("V12_0", age::V12_0), + ("V12_1", age::V12_1), + ("V13_0", age::V13_0), + ]; + assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); + + let pos = AGES.iter().position(|&(age, _)| canonical_age == age); + match pos { + None => Err(Error::PropertyValueNotFound), + Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)), + } + } + + imp(canonical_age) +} + +/// Returns the Unicode HIR class corresponding to the given general category. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given general category could not be found, or if the general +/// category data is not available, then an error is returned. 
+fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-gencat"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-gencat")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::general_category::BY_NAME; + match name { + "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), + "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), + "Assigned" => { + let mut cls = gencat("Unassigned")?; + cls.negate(); + Ok(cls) + } + name => property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound), + } + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given script. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script could not be found, or if the script data is not +/// available, then an error is returned. +fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::script::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given script extension. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script extension could not be found, or if the script data is +/// not available, then an error is returned. 
+fn script_extension( + canonical_name: &'static str, +) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::script_extension::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given Unicode boolean +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given boolean property could not be found, or if the boolean +/// property data is not available, then an error is returned. +fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-bool"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-bool")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::property_bool::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + "White_Space" => perl_space(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given grapheme cluster +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. 
+fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::grapheme_cluster_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given word break +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::word_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given sentence +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. 
+fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result<hir::ClassUnicode> { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result<hir::ClassUnicode> { + use crate::unicode_tables::sentence_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Like symbolic_name_normalize_bytes, but operates on a string. +fn symbolic_name_normalize(x: &str) -> String { + let mut tmp = x.as_bytes().to_vec(); + let len = symbolic_name_normalize_bytes(&mut tmp).len(); + tmp.truncate(len); + // This should always succeed because `symbolic_name_normalize_bytes` + // guarantees that `&tmp[..len]` is always valid UTF-8. + // + // N.B. We could avoid the additional UTF-8 check here, but it's unlikely + // to be worth skipping the additional safety check. A benchmark must + // justify it first. + String::from_utf8(tmp).unwrap() +} + +/// Normalize the given symbolic name in place according to UAX44-LM3. +/// +/// A "symbolic name" typically corresponds to property names and property +/// value aliases. Note, though, that it should not be applied to property +/// string values. +/// +/// The slice returned is guaranteed to be valid UTF-8 for all possible values +/// of `slice`. +/// +/// See: https://unicode.org/reports/tr44/#UAX44-LM3 +fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { + // I couldn't find a place in the standard that specified that property + // names/aliases had a particular structure (unlike character names), but + // we assume that it's ASCII only and drop anything that isn't ASCII. + let mut start = 0; + let mut starts_with_is = false; + if slice.len() >= 2 { + // Ignore any "is" prefix. + starts_with_is = slice[0..2] == b"is"[..] + || slice[0..2] == b"IS"[..] + || slice[0..2] == b"iS"[..] 
+ || slice[0..2] == b"Is"[..]; + if starts_with_is { + start = 2; + } + } + let mut next_write = 0; + for i in start..slice.len() { + // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid + // UTF-8, we ensure that the slice contains only ASCII bytes. In + // particular, we drop every non-ASCII byte from the normalized string. + let b = slice[i]; + if b == b' ' || b == b'_' || b == b'-' { + continue; + } else if b'A' <= b && b <= b'Z' { + slice[next_write] = b + (b'a' - b'A'); + next_write += 1; + } else if b <= 0x7F { + slice[next_write] = b; + next_write += 1; + } + } + // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally + // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross + // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it + // is actually an alias for the 'Other' general category. + if starts_with_is && next_write == 1 && slice[0] == b'c' { + slice[0] = b'i'; + slice[1] = b's'; + slice[2] = b'c'; + next_write = 3; + } + &mut slice[..next_write] +} + +#[cfg(test)] +mod tests { + use super::{ + contains_simple_case_mapping, simple_fold, symbolic_name_normalize, + symbolic_name_normalize_bytes, + }; + + #[cfg(feature = "unicode-case")] + fn simple_fold_ok(c: char) -> impl Iterator<Item = char> { + simple_fold(c).unwrap().unwrap() + } + + #[cfg(feature = "unicode-case")] + fn simple_fold_err(c: char) -> Option<char> { + match simple_fold(c).unwrap() { + Ok(_) => unreachable!("simple_fold returned Ok iterator"), + Err(next) => next, + } + } + + #[cfg(feature = "unicode-case")] + fn contains_case_map(start: char, end: char) -> bool { + contains_simple_case_mapping(start, end).unwrap() + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_k() { + let xs: Vec<char> = simple_fold_ok('k').collect(); + assert_eq!(xs, vec!['K', 'âĒ']); + + let xs: Vec<char> = simple_fold_ok('K').collect(); + assert_eq!(xs, vec!['k', 'âĒ']); + + let xs: Vec<char> = simple_fold_ok('âĒ').collect(); + 
assert_eq!(xs, vec!['K', 'k']); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_a() { + let xs: Vec<char> = simple_fold_ok('a').collect(); + assert_eq!(xs, vec!['A']); + + let xs: Vec<char> = simple_fold_ok('A').collect(); + assert_eq!(xs, vec!['a']); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_empty() { + assert_eq!(Some('A'), simple_fold_err('?')); + assert_eq!(Some('A'), simple_fold_err('@')); + assert_eq!(Some('a'), simple_fold_err('[')); + assert_eq!(Some('â°'), simple_fold_err('â')); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_max() { + assert_eq!(None, simple_fold_err('\u{10FFFE}')); + assert_eq!(None, simple_fold_err('\u{10FFFF}')); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn simple_fold_disabled() { + assert!(simple_fold('a').is_err()); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn range_contains() { + assert!(contains_case_map('A', 'A')); + assert!(contains_case_map('Z', 'Z')); + assert!(contains_case_map('A', 'Z')); + assert!(contains_case_map('@', 'A')); + assert!(contains_case_map('Z', '[')); + assert!(contains_case_map('â', 'â°')); + + assert!(!contains_case_map('[', '[')); + assert!(!contains_case_map('[', '`')); + + assert!(!contains_case_map('â', 'â')); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn range_contains_disabled() { + assert!(contains_simple_case_mapping('a', 'a').is_err()); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn regression_466() { + use super::{CanonicalClassQuery, ClassQuery}; + + let q = ClassQuery::OneLetter('C'); + assert_eq!( + q.canonicalize().unwrap(), + CanonicalClassQuery::GeneralCategory("Other") + ); + } + + #[test] + fn sym_normalize() { + let sym_norm = symbolic_name_normalize; + + assert_eq!(sym_norm("Line_Break"), "linebreak"); + assert_eq!(sym_norm("Line-break"), "linebreak"); + assert_eq!(sym_norm("linebreak"), "linebreak"); + assert_eq!(sym_norm("BA"), "ba"); + assert_eq!(sym_norm("ba"), 
"ba"); + assert_eq!(sym_norm("Greek"), "greek"); + assert_eq!(sym_norm("isGreek"), "greek"); + assert_eq!(sym_norm("IS_Greek"), "greek"); + assert_eq!(sym_norm("isc"), "isc"); + assert_eq!(sym_norm("is c"), "isc"); + assert_eq!(sym_norm("is_c"), "isc"); + } + + #[test] + fn valid_utf8_symbolic() { + let mut x = b"abc\xFFxyz".to_vec(); + let y = symbolic_name_normalize_bytes(&mut x); + assert_eq!(y, b"abcxyz"); + } +} diff --git a/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE b/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE new file mode 100644 index 000000000..b82826bdb --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE @@ -0,0 +1,57 @@ +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +Unicode Data Files include all data files under the directories +http://www.unicode.org/Public/, http://www.unicode.org/reports/, +http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and +http://www.unicode.org/utility/trac/browser/. + +Unicode Data Files do not include PDF online code charts under the +directory http://www.unicode.org/Public/. + +Software includes any source code published in the Unicode Standard +or under the directories +http://www.unicode.org/Public/, http://www.unicode.org/reports/, +http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and +http://www.unicode.org/utility/trac/browser/. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright Š 1991-2018 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in http://www.unicode.org/copyright.html. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. diff --git a/vendor/regex-syntax/src/unicode_tables/age.rs b/vendor/regex-syntax/src/unicode_tables/age.rs new file mode 100644 index 000000000..7772919eb --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/age.rs @@ -0,0 +1,1673 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate age ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. 
+// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("V10_0", V10_0), + ("V11_0", V11_0), + ("V12_0", V12_0), + ("V12_1", V12_1), + ("V13_0", V13_0), + ("V1_1", V1_1), + ("V2_0", V2_0), + ("V2_1", V2_1), + ("V3_0", V3_0), + ("V3_1", V3_1), + ("V3_2", V3_2), + ("V4_0", V4_0), + ("V4_1", V4_1), + ("V5_0", V5_0), + ("V5_1", V5_1), + ("V5_2", V5_2), + ("V6_0", V6_0), + ("V6_1", V6_1), + ("V6_2", V6_2), + ("V6_3", V6_3), + ("V7_0", V7_0), + ("V8_0", V8_0), + ("V9_0", V9_0), +]; + +pub const V10_0: &'static [(char, char)] = &[ + ('āĄ ', 'āĄĒ'), + ('ā§ŧ', 'ā§Ŋ'), + ('\u{afa}', '\u{aff}'), + ('\u{d00}', '\u{d00}'), + ('\u{d3b}', '\u{d3c}'), + ('áŗˇ', 'áŗˇ'), + ('\u{1df6}', '\u{1df9}'), + ('âŋ', 'âŋ'), + ('âŋ', 'âŋ'), + ('â¯', 'â¯'), + ('âš
', 'âš'), + ('ãŽ', 'ãŽ'), + ('éŋ', 'éŋĒ'), + ('đ', 'đ¯'), + ('đ¨', '\u{11a47}'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĒ', 'đĒĸ'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('đĩ', 'đĩ'), + ('đŋĄ', 'đŋĄ'), + ('đ', 'đ'), + ('đ
°', 'đģ'), + ('đ ', 'đĨ'), + ('đ', 'đ'), + ('đˇ', 'đ¸'), + ('đ¤', 'đ¤'), + ('đ¤', 'đ¤'), + ('đ¤¨', 'đ¤¯'), + ('đ¤ą', 'đ¤˛'), + ('đĨ', 'đĨ'), + ('đĨ', 'đĨĢ'), + ('đĻ', 'đĻ'), + ('đ§', 'đ§Ļ'), + ('đŦē°', 'đŽ¯ '), +]; + +pub const V11_0: &'static [(char, char)] = &[ + ('Õ ', 'Õ '), + ('Ö', 'Ö'), + ('ׯ', 'ׯ'), + ('\u{7fd}', 'ßŋ'), + ('\u{8d3}', '\u{8d3}'), + ('\u{9fe}', '\u{9fe}'), + ('āŠļ', 'āŠļ'), + ('\u{c04}', '\u{c04}'), + ('ā˛', 'ā˛'), + ('᥸', '᥸'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('âŽē', 'âŽŧ'), + ('â¯', 'â¯Ģ'), + ('⯰', '⯞'), + ('âš', 'âš'), + ('ã¯', 'ã¯'), + ('éŋĢ', 'éŋ¯'), + ('ę¯', 'ę¯'), + ('ę¸', 'ęš'), + ('ęŖž', '\u{a8ff}'), + ('đ¨´', 'đ¨ĩ'), + ('đŠ', 'đŠ'), + ('đ´', '\u{10d27}'), + ('đ´°', 'đ´š'), + ('đŧ', 'đŧ§'), + ('đŧ°', 'đŊ'), + ('\u{110cd}', '\u{110cd}'), + ('đ
', 'đ
'), + ('\u{1133b}', '\u{1133b}'), + ('\u{1145e}', '\u{1145e}'), + ('đ', 'đ'), + ('đ ', 'đ ģ'), + ('đĒ', 'đĒ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), + ('đģ ', 'đģ¸'), + ('đš', 'đē'), + ('đ', 'đą'), + ('đ ', 'đŗ'), + ('đ˛', 'đ¸'), + ('đąą', 'đ˛´'), + ('đ¯', 'đ¯'), + ('đš', 'đš'), + ('đ', 'đ'), + ('đĨ', 'đĨ'), + ('đĨŦ', 'đĨ°'), + ('đĨŗ', 'đĨļ'), + ('đĨē', 'đĨē'), + ('đĨŧ', 'đĨŋ'), + ('đĻ', 'đĻĸ'), + ('đĻ°', 'đĻš'), + ('đ§', 'đ§'), + ('đ§§', 'đ§ŋ'), + ('đŠ ', 'đŠ'), +]; + +pub const V12_0: &'static [(char, char)] = &[ + ('āąˇ', 'āąˇ'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē ', 'āē '), + ('āē¨', 'āēŠ'), + ('āēŦ', 'āēŦ'), + ('\u{eba}', '\u{eba}'), + ('áŗē', 'áŗē'), + ('â¯', 'â¯'), + ('â¯ŋ', 'â¯ŋ'), + ('âš', 'âš'), + ('ęē', 'ęŋ'), + ('ę', 'ę'), + ('ęĻ', 'ę§'), + ('đŋ ', 'đŋļ'), + ('đ', 'đ'), + ('đ¸', 'đ¸'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', '\u{119d7}'), + ('\u{119da}', 'đ§¤'), + ('đĒ', 'đĒ
'), + ('đŋ', 'đŋą'), + ('đŋŋ', 'đŋŋ'), + ('\u{13430}', '\u{13438}'), + ('đŊ
', 'đŊ'), + ('\u{16f4f}', '\u{16f4f}'), + ('đŊŋ', 'đž'), + ('đŋĸ', 'đŋŖ'), + ('đ˛', 'đˇ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ', 'đŦ'), + ('\u{1e130}', 'đŊ'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ', 'đš'), + ('đŋ', 'đŋ'), + ('đĨ', 'đĨ'), + ('đ´', 'đ´Ŋ'), + ('đ
Ŧ', 'đ
Ŧ'), + ('đ', 'đ'), + ('đē', 'đē'), + ('đ ', 'đĢ'), + ('đ¤', 'đ¤'), + ('đ¤ŋ', 'đ¤ŋ'), + ('đĨą', 'đĨą'), + ('đĨģ', 'đĨģ'), + ('đĻĨ', 'đĻĒ'), + ('đĻŽ', 'đĻ¯'), + ('đĻē', 'đĻŋ'), + ('đ§', 'đ§'), + ('đ§', 'đ§'), + ('đ¨', 'đŠ'), + ('đŠ°', 'đŠŗ'), + ('đŠ¸', 'đŠē'), + ('đĒ', 'đĒ'), + ('đĒ', 'đĒ'), +]; + +pub const V12_1: &'static [(char, char)] = &[('ãŋ', 'ãŋ')]; + +pub const V13_0: &'static [(char, char)] = &[ + ('\u{8be}', '\u{8c7}'), + ('\u{b55}', '\u{b55}'), + ('\u{d04}', '\u{d04}'), + ('\u{d81}', '\u{d81}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{2b97}', '\u{2b97}'), + ('\u{2e50}', '\u{2e52}'), + ('\u{31bb}', '\u{31bf}'), + ('\u{4db6}', '\u{4dbf}'), + ('\u{9ff0}', '\u{9ffc}'), + ('\u{a7c7}', '\u{a7ca}'), + ('\u{a7f5}', '\u{a7f6}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{ab68}', '\u{ab6b}'), + ('\u{1019c}', '\u{1019c}'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10ead}'), + ('\u{10eb0}', '\u{10eb1}'), + ('\u{10fb0}', '\u{10fcb}'), + ('\u{11147}', '\u{11147}'), + ('\u{111ce}', '\u{111cf}'), + ('\u{1145a}', '\u{1145a}'), + ('\u{11460}', '\u{11461}'), + ('\u{11900}', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{11946}'), + ('\u{11950}', '\u{11959}'), + ('\u{11fb0}', '\u{11fb0}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{18af3}', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('\u{1f10d}', '\u{1f10f}'), + ('\u{1f16d}', '\u{1f16f}'), + ('\u{1f1ad}', '\u{1f1ad}'), + ('\u{1f6d6}', '\u{1f6d7}'), + ('\u{1f6fb}', '\u{1f6fc}'), + ('\u{1f8b0}', '\u{1f8b1}'), + ('\u{1f90c}', '\u{1f90c}'), + ('\u{1f972}', '\u{1f972}'), + ('\u{1f977}', '\u{1f978}'), + ('\u{1f9a3}', '\u{1f9a4}'), + ('\u{1f9ab}', '\u{1f9ad}'), + ('\u{1f9cb}', '\u{1f9cb}'), + ('\u{1fa74}', '\u{1fa74}'), + ('\u{1fa83}', '\u{1fa86}'), + ('\u{1fa96}', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), + 
('\u{1fb00}', '\u{1fb92}'), + ('\u{1fb94}', '\u{1fbca}'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('\u{2a6d7}', '\u{2a6dd}'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const V1_1: &'static [(char, char)] = &[ + ('\u{0}', 'Įĩ'), + ('Įē', 'Č'), + ('É', 'ʨ'), + ('Ę°', 'Ë'), + ('Ë ', 'ËŠ'), + ('\u{300}', '\u{345}'), + ('\u{360}', '\u{361}'), + ('Í´', 'Íĩ'), + ('Íē', 'Íē'), + ('Íž', 'Íž'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī ', 'Ī '), + ('Īĸ', 'Īŗ'), + ('Đ', 'Đ'), + ('Đ', 'Ņ'), + ('Ņ', 'Ņ'), + ('Ņ', '\u{486}'), + ('Ō', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'ĶĢ'), + ('ĶŽ', 'Ķĩ'), + ('Ķ¸', 'Ķš'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('ÕĄ', 'Ö'), + ('Ö', 'Ö'), + ('\u{5b0}', '\u{5b9}'), + ('\u{5bb}', '×'), + ('×', '×Ē'), + ('×°', '×´'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('ØĄ', 'Øē'), + ('Ų', '\u{652}'), + ('Ų ', 'Ų'), + ('\u{670}', 'Úˇ'), + ('Úē', 'Úž'), + ('Û', 'Û'), + ('Û', '\u{6ed}'), + ('Û°', 'Ûš'), + ('\u{901}', 'ā¤'), + ('ā¤
', 'ā¤š'), + ('\u{93c}', '\u{94d}'), + ('āĨ', '\u{954}'), + ('āĨ', 'āĨ°'), + ('\u{981}', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§Ļ', 'ā§ē'), + ('\u{a02}', '\u{a02}'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', 'āŠ´'), + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢ '), + ('āĢĻ', 'āĢ¯'), + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦļ', 'āŦš'), + ('\u{b3c}', '\u{b43}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b56}', '\u{b57}'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āĻ', 'ā°'), + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽĩ'), + ('āŽˇ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('ā¯§', 'ā¯˛'), + ('ā°', 'ā°'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°ŗ'), + ('ā°ĩ', 'ā°š'), + ('\u{c3e}', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('āą ', 'āąĄ'), + ('āąĻ', 'āą¯'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛ž', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗĻ', 'āŗ¯'), + ('ā´', 'ā´'), + ('ā´
', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´¨'), + ('ā´Ē', 'ā´š'), + ('\u{d3e}', '\u{d43}'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('āĩ ', 'āĩĄ'), + ('āĩĻ', 'āĩ¯'), + ('ā¸', '\u{e3a}'), + ('ā¸ŋ', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āēĄ', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē§'), + ('āēĒ', 'āēĢ'), + ('āē', '\u{eb9}'), + ('\u{ebb}', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('á ', 'á
'), + ('á', 'áļ'), + ('áģ', 'áģ'), + ('á', 'á
'), + ('á
', 'áĸ'), + ('á¨', 'áš'), + ('á¸', 'áē'), + ('áē ', 'áģš'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋž'), + ('\u{2000}', '\u{202e}'), + ('â°', 'â'), + ('\u{206a}', 'â°'), + ('â´', 'â'), + ('â ', 'âĒ'), + ('\u{20d0}', '\u{20e1}'), + ('â', 'â¸'), + ('â
', 'â'), + ('â', 'âĒ'), + ('â', 'âą'), + ('â', 'â'), + ('â', 'âē'), + ('â', 'â¤'), + ('â', 'â'), + ('â ', 'âĒ'), + ('â', 'â'), + ('â ', 'â¯'), + ('â', 'â'), + ('â', 'â¯'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â§'), + ('âŠ', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĄ', 'â§'), + ('âļ', 'â'), + ('â', 'â¯'), + ('âą', 'âž'), + ('\u{3000}', 'ãˇ'), + ('ãŋ', 'ãŋ'), + ('ã', 'ã'), + ('\u{3099}', 'ã'), + ('ãĄ', 'ãž'), + ('ã
', 'ãŦ'), + ('ãą', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã ', 'ã'), + ('ã ', 'ãģ'), + ('ãŋ', 'ã°'), + ('ã', 'ã'), + ('ã', 'ãž'), + ('ã', 'ãļ'), + ('ãģ', 'ã'), + ('ã ', 'ãž'), + ('ä¸', 'éžĨ'), + ('\u{e000}', 'ī¨'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('\u{fb1e}', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('\u{fe20}', '\u{fe23}'), + ('ī¸°', 'īš'), + ('īš', 'īš'), + ('īš', 'īšĻ'), + ('īš¨', 'īšĢ'), + ('īš°', 'īš˛'), + ('īš´', 'īš´'), + ('īšļ', 'īģŧ'), + ('\u{feff}', '\u{feff}'), + ('īŧ', 'īŊ'), + ('īŊĄ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ ', 'īŋĻ'), + ('īŋ¨', 'īŋŽ'), + ('īŋŊ', '\u{ffff}'), +]; + +pub const V2_0: &'static [(char, char)] = &[ + ('\u{591}', '\u{5a1}'), + ('\u{5a3}', '\u{5af}'), + ('\u{5c4}', '\u{5c4}'), + ('āŧ', 'āŊ'), + ('āŊ', 'āŊŠ'), + ('\u{f71}', 'āž'), + ('\u{f90}', '\u{f95}'), + ('\u{f97}', '\u{f97}'), + ('\u{f99}', '\u{fad}'), + ('\u{fb1}', '\u{fb7}'), + ('\u{fb9}', '\u{fb9}'), + ('áē', 'áē'), + ('âĢ', 'âĢ'), + ('ę°', 'íŖ'), + ('\u{1fffe}', '\u{1ffff}'), + ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), + ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), + ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), + ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), + ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), + ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), + ('\u{efffe}', '\u{10ffff}'), +]; + +pub const V2_1: &'static [(char, char)] = &[('âŦ', 'âŦ'), ('īŋŧ', 'īŋŧ')]; + +pub const V3_0: &'static [(char, char)] = &[ + ('Įļ', 'Įš'), + ('Č', 'Č'), + ('Čĸ', 'Čŗ'), + ('ĘŠ', 'Ę'), + ('Ë', 'Ë'), + ('ËĒ', 'ËŽ'), + ('\u{346}', '\u{34e}'), + ('\u{362}', '\u{362}'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('ĪĄ', 'ĪĄ'), + ('Đ', 'Đ'), + ('Đ', 'Đ'), + ('Ņ', 'Ņ'), + ('Ņ', 'Ņ'), + ('\u{488}', '\u{489}'), + ('Ō', 'Ō'), + ('ĶŦ', 'Ķ'), + ('Ö', 
'Ö'), + ('\u{653}', '\u{655}'), + ('Ú¸', 'Úš'), + ('Úŋ', 'Úŋ'), + ('Û', 'Û'), + ('Ûē', 'Ûž'), + ('Ü', 'Ü'), + ('\u{70f}', 'ÜŦ'), + ('\u{730}', '\u{74a}'), + ('Ū', '\u{7b0}'), + ('āļ', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇ´'), + ('āŊĒ', 'āŊĒ'), + ('\u{f96}', '\u{f96}'), + ('\u{fae}', '\u{fb0}'), + ('\u{fb8}', '\u{fb8}'), + ('\u{fba}', '\u{fbc}'), + ('āžž', 'āŋ'), + ('āŋ', 'āŋ'), + ('á', 'áĄ'), + ('áŖ', 'á§'), + ('áŠ', 'áĒ'), + ('áŦ', '\u{1032}'), + ('\u{1036}', '\u{1039}'), + ('á', '\u{1059}'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áŽ'), + ('á°', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áŽ'), + ('á°', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('áĄ', 'áŧ'), + ('á ', 'á´'), + ('á', 'áļ'), + ('\u{1680}', 'á'), + ('á ', 'á°'), + ('á', 'á'), + ('á ', 'áŠ'), + ('á ', '\u{180e}'), + ('á ', 'á '), + ('á ', 'ᥡ'), + ('áĸ', '\u{18a9}'), + ('\u{202f}', '\u{202f}'), + ('â', 'â'), + ('â', 'â¯'), + ('\u{20e2}', '\u{20e3}'), + ('âš', 'âē'), + ('â', 'â'), + ('âĢ', 'âŗ'), + ('â', 'â'), + ('âģ', 'âģ'), + ('âŊ', 'â'), + ('âĨ', 'âĻ'), + ('â°', 'âˇ'), + ('â', 'â'), + ('â°', 'âą'), + ('â ', 'âŖŋ'), + ('âē', 'âē'), + ('âē', 'âģŗ'), + ('âŧ', 'âŋ'), + ('âŋ°', 'âŋģ'), + ('ã¸', 'ãē'), + ('ãž', 'ãž'), + ('ã ', 'ãˇ'), + ('ã', 'äļĩ'), + ('ę', 'ę'), + ('ę', 'ęĄ'), + ('ę¤', 'ęŗ'), + ('ęĩ', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('īŦ', 'īŦ'), + ('\u{fff9}', '\u{fffb}'), +]; + +pub const V3_1: &'static [(char, char)] = &[ + ('Ī´', 'Īĩ'), + ('\u{fdd0}', '\u{fdef}'), + ('đ', 'đ'), + ('đ ', 'đŖ'), + ('đ°', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đĻ'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŖ'), + ('đ¨', 'đ'), + ('đ', 'đŋ'), + ('đ ', 'đĒ'), + ('đ¯ ', 'đ¯¨'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const V3_2: &'static [(char, char)] = &[ + ('Č ', 'Č '), + ('\u{34f}', '\u{34f}'), + ('\u{363}', '\u{36f}'), + ('Ī', 'Ī'), + ('Īļ', 'Īļ'), + ('Ō', 'Ō'), + ('Ķ
', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ô', 'Ô'), + ('ŲŽ', 'Ų¯'), + ('Ūą', 'Ūą'), + ('áˇ', 'á¸'), + ('á', 'á'), + ('á', '\u{1714}'), + ('á ', 'áļ'), + ('á', '\u{1753}'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('\u{1772}', '\u{1773}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('\u{205f}', '\u{2063}'), + ('âą', 'âą'), + ('â°', 'âą'), + ('\u{20e4}', '\u{20ea}'), + ('âŊ', 'â
'), + ('â´', 'âŋ'), + ('â˛', 'âŋ'), + ('âŧ', 'âŧ'), + ('â', 'â'), + ('âĢ', 'âž'), + ('â', 'â'), + ('â¸', 'âŋ'), + ('â', 'â'), + ('â˛', 'âŊ'), + ('â', 'â'), + ('â¨', 'âĩ'), + ('â', 'âĢ'), + ('â°', 'âŋ'), + ('â¤', 'âĢŋ'), + ('ãģ', 'ãŊ'), + ('ã', 'ã'), + ('ã', 'ã '), + ('ãŋ', 'ãŋ'), + ('ã°', 'ãŋ'), + ('ã', 'ã'), + ('ãą', 'ãŋ'), + ('ęĸ', 'ęŖ'), + ('ę´', 'ę´'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ī¨°', 'īŠĒ'), + ('īˇŧ', 'īˇŧ'), + ('\u{fe00}', '\u{fe0f}'), + ('īš
', 'īš'), + ('īšŗ', 'īšŗ'), + ('īŊ', 'īŊ '), +]; + +pub const V4_0: &'static [(char, char)] = &[ + ('ČĄ', 'ČĄ'), + ('Č´', 'Čļ'), + ('ĘŽ', 'Ę¯'), + ('˯', 'Ëŋ'), + ('\u{350}', '\u{357}'), + ('\u{35d}', '\u{35f}'), + ('Īˇ', 'Īģ'), + ('\u{600}', '\u{603}'), + ('Ø', '\u{615}'), + ('\u{656}', '\u{658}'), + ('ÛŽ', 'Û¯'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'ܯ'), + ('Ũ', 'Ũ'), + ('ā¤', 'ā¤'), + ('āĻŊ', 'āĻŊ'), + ('\u{a01}', '\u{a01}'), + ('ā¨', 'ā¨'), + ('āĒ', 'āĒ'), + ('āĢĄ', '\u{ae3}'), + ('āĢą', 'āĢą'), + ('āŦĩ', 'āŦĩ'), + ('āą', 'āą'), + ('ā¯ŗ', 'ā¯ē'), + ('\u{cbc}', 'ā˛Ŋ'), + ('\u{17dd}', '\u{17dd}'), + ('á°', 'áš'), + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('áĨ', 'áĨ'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('᧠', 'á§ŋ'), + ('á´', 'áĩĢ'), + ('â', 'â'), + ('âģ', 'âģ'), + ('â', 'â'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â ', 'âĄ'), + ('âŦ', 'âŦ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãŧ', 'ãŊ'), + ('ã', 'ã'), + ('ãˇ', 'ãē'), + ('ã', 'ã'), + ('ãŋ', 'ãŋ'), + ('äˇ', 'äˇŋ'), + ('īˇŊ', 'īˇŊ'), + ('īš', 'īš'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ', 'đ'), + ('đ', 'đŗ'), + ('đˇ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĻ', 'đ§'), + ('đ', 'đ'), + ('đ ', 'đŠ'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đ ŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const V4_1: &'static [(char, char)] = &[ + ('ȡ', 'É'), + ('\u{358}', '\u{35c}'), + ('Īŧ', 'Īŋ'), + ('Ķļ', 'Ķˇ'), + ('\u{5a2}', '\u{5a2}'), + ('\u{5c5}', '\u{5c7}'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('\u{659}', '\u{65e}'), + ('Ũ', 'Ũ'), + ('āĨŊ', 'āĨŊ'), + ('ā§', 'ā§'), + ('āŽļ', 'āŽļ'), + ('ā¯Ļ', 'ā¯Ļ'), + ('āŋ', 'āŋ'), + ('áš', 'áē'), + ('áŧ', 'áŧ'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á¯', 'á¯'), + ('á', 'á'), + ('á¯', 'á¯'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('\u{135f}', 'á '), + ('á', 'á'), + ('áĻ', 'áĻŠ'), + ('áĻ°', 'á§'), + ('á§', 'á§'), + ('á§', 'á§'), + ('á¨', '\u{1a1b}'), + ('á¨', 'á¨'), + ('áĩŦ', '\u{1dc3}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â˛', 'âĩ'), + ('\u{20eb}', '\u{20eb}'), + ('âŧ', 'âŧ'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('âž', 'âŋ'), + ('â', 'â'), + ('âĸ', 'âą'), + ('â', 'â'), + ('âŦ', 'âŦ'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('â˛', 'âŗĒ'), + ('âŗš', 'â´Ĩ'), + ('â´°', 'âĩĨ'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('ã', 'ã'), + ('ãž', 'ãž'), + ('éžĻ', 'éžģ'), + ('ę', 'ę'), + ('ę ', 'ę Ģ'), + ('īŠ°', 'īĢ'), + ('ī¸', 'ī¸'), + ('đ
', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ŗ'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', 'đŠ'), + ('đŠ', 'đŠ'), + ('đ', 'đ
'), + ('đ¤', 'đĨ'), +]; + +pub const V5_0: &'static [(char, char)] = &[ + ('É', 'É'), + ('Íģ', 'ÍŊ'), + ('Ķ', 'Ķ'), + ('Ķē', 'Ķŋ'), + ('Ô', 'Ô'), + ('\u{5ba}', '\u{5ba}'), + ('ß', 'ßē'), + ('āĨģ', 'āĨŧ'), + ('āĨž', 'āĨŋ'), + ('\u{ce2}', '\u{ce3}'), + ('āŗą', 'āŗ˛'), + ('\u{1b00}', 'á'), + ('á', 'áŧ'), + ('\u{1dc4}', '\u{1dca}'), + ('\u{1dfe}', '\u{1dff}'), + ('\u{20ec}', '\u{20ef}'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â§'), + ('â˛', 'â˛'), + ('â', 'â'), + ('âŦ', 'âŦ'), + ('âŦ ', 'âŦŖ'), + ('âą ', 'âąŦ'), + ('âą´', '⹡'), + ('ę', 'ę'), + ('ę ', 'ęĄ'), + ('ęĄ', 'ꥡ'), + ('đ¤', 'đ¤'), + ('đ¤', 'đ¤'), + ('đ', 'đŽ'), + ('đ', 'đĸ'), + ('đ°', 'đŗ'), + ('đ ', 'đą'), + ('đ', 'đ'), +]; + +pub const V5_1: &'static [(char, char)] = &[ + ('Í°', 'Íŗ'), + ('Íļ', '͡'), + ('Ī', 'Ī'), + ('\u{487}', '\u{487}'), + ('Ô', 'ÔŖ'), + ('Ø', 'Ø'), + ('\u{616}', '\u{61a}'), + ('Øģ', 'Øŋ'), + ('ŨŽ', 'Ũŋ'), + ('āĨą', 'āĨ˛'), + ('\u{a51}', '\u{a51}'), + ('\u{a75}', '\u{a75}'), + ('\u{b44}', '\u{b44}'), + ('\u{b62}', '\u{b63}'), + ('ā¯', 'ā¯'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('\u{c62}', '\u{c63}'), + ('āą¸', 'āąŋ'), + ('ā´Ŋ', 'ā´Ŋ'), + ('\u{d44}', '\u{d44}'), + ('\u{d62}', '\u{d63}'), + ('āĩ°', 'āĩĩ'), + ('āĩš', 'āĩŋ'), + ('āŊĢ', 'āŊŦ'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('áĸ', 'áĸ'), + ('á¨', 'á¨'), + ('áĢ', 'áĢ'), + ('\u{1033}', '\u{1035}'), + ('\u{103a}', 'áŋ'), + ('á', 'á'), + ('á', 'á'), + ('áĸĒ', 'áĸĒ'), + ('\u{1b80}', 'áŽĒ'), + ('ᎎ', '᎚'), + ('á°', '\u{1c37}'), + ('á°ģ', 'áą'), + ('áą', 'áąŋ'), + ('\u{1dcb}', '\u{1de6}'), + ('áē', 'áē'), + ('áģē', 'áģŋ'), + ('\u{2064}', '\u{2064}'), + ('\u{20f0}', '\u{20f0}'), + ('â
', 'â
'), + ('â
', 'â'), + ('â', 'â'), + ('âŗ', 'âŧ'), + ('â', 'â'), + ('â', 'â'), + ('âŦ', 'â¯'), + ('âŦ', 'âŦ'), + ('âŦ¤', 'â'), + ('â', 'â'), + ('âą', 'âą¯'), + ('âąą', 'âąŗ'), + ('⹸', 'âąŊ'), + ('\u{2de0}', '\u{2dff}'), + ('â¸', 'â¸'), + ('â¸', '⸰'), + ('ã', 'ã'), + ('ã', 'ãŖ'), + ('éžŧ', 'éŋ'), + ('ę', 'ęĢ'), + ('ę', 'ę'), + ('ęĸ', 'ęŗ'), + ('\u{a67c}', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ęģ', 'ęŋ'), + ('ęĸ', '\u{a8c4}'), + ('ęŖ', 'ęŖ'), + ('ę¤', 'ęĨ'), + ('ęĨ', 'ęĨ'), + ('ę¨', '\u{aa36}'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('\u{fe24}', '\u{fe26}'), + ('đ', 'đ'), + ('đ', '\u{101fd}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ¤ ', 'đ¤š'), + ('đ¤ŋ', 'đ¤ŋ'), + ('đŠ', 'đŠ'), + ('đ', 'đĢ'), + ('đ°', 'đ'), +]; + +pub const V5_2: &'static [(char, char)] = &[ + ('Ô¤', 'ÔĨ'), + ('ā ', '\u{82d}'), + ('ā °', 'ā ž'), + ('\u{900}', '\u{900}'), + ('āĨ', 'āĨ'), + ('\u{955}', '\u{955}'), + ('āĨš', 'āĨē'), + ('ā§ģ', 'ā§ģ'), + ('āŋ', 'āŋ'), + ('á', '\u{109d}'), + ('á
', 'á
'), + ('áŖ', 'á§'), + ('áē', 'áŋ'), + ('á', 'á'), + ('áˇ', 'áŋ'), + ('áĸ°', 'áŖĩ'), + ('áĻĒ', 'áĻĢ'), + ('á§', 'á§'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ ', 'áĒ'), + ('\u{1cd0}', 'áŗ˛'), + ('\u{1dfd}', '\u{1dfd}'), + ('âļ', 'â¸'), + ('â
', 'â
'), + ('â', 'â'), + ('â¨', 'â¨'), + ('â', 'â'), + ('âŊ', 'âŋ'), + ('â', 'â'), + ('â', 'âĄ'), + ('âŖ', 'âŖ'), + ('â¨', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('âą°', 'âą°'), + ('âąž', 'âąŋ'), + ('âŗĢ', '\u{2cf1}'), + ('⸹', '⸹'), + ('ã', 'ã'), + ('éŋ', 'éŋ'), + ('ę', 'ęŋ'), + ('ę ', 'ęˇ'), + ('ę °', 'ę š'), + ('\u{a8e0}', 'ęŖģ'), + ('ęĨ ', 'ęĨŧ'), + ('\u{a980}', 'ę§'), + ('ę§', 'ę§'), + ('ę§', 'ę§'), + ('ęŠ ', 'ęŠģ'), + ('ęĒ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ę¯', '\u{abed}'), + ('ę¯°', 'ę¯š'), + ('í°', 'í'), + ('í', 'íģ'), + ('īŠĢ', 'īŠ'), + ('đĄ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đ¤', 'đ¤'), + ('đŠ ', 'đŠŋ'), + ('đŦ', 'đŦĩ'), + ('đŦš', 'đ'), + ('đ', 'đ˛'), + ('đ¸', 'đŋ'), + ('đ°', 'đą'), + ('đš ', 'đšž'), + ('\u{11080}', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đą', 'đą'), + ('đŊ', 'đŊ'), + ('đŋ', 'đŋ'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
š', 'đ
š'), + ('đ
ģ', 'đ
ŧ'), + ('đ
ŋ', 'đ
ŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đą'), + ('đ', 'đ'), + ('đĒ', 'đĢ´'), +]; + +pub const V6_0: &'static [(char, char)] = &[ + ('ÔĻ', 'Ô§'), + ('Ø ', 'Ø '), + ('\u{65f}', '\u{65f}'), + ('āĄ', '\u{85b}'), + ('āĄ', 'āĄ'), + ('\u{93a}', 'ā¤ģ'), + ('āĨ', 'āĨ'), + ('\u{956}', '\u{957}'), + ('āĨŗ', 'āĨˇ'), + ('ā˛', 'āˇ'), + ('ā´Š', 'ā´Š'), + ('ā´ē', 'ā´ē'), + ('āĩ', 'āĩ'), + ('āž', '\u{f8f}'), + ('āŋ', 'āŋ'), + ('\u{135d}', '\u{135e}'), + ('á¯', 'á¯ŗ'), + ('á¯ŧ', 'á¯ŋ'), + ('\u{1dfc}', '\u{1dfc}'), + ('â', 'â'), + ('âš', 'âš'), + ('âŠ', 'âŗ'), + ('â', 'â'), + ('âĸ', 'âĸ'), + ('â¤', 'â§'), + ('â
', 'â
'), + ('â', 'â'), + ('â¨', 'â¨'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â '), + ('â', 'â'), + ('â°', 'â°'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('âĩ°', 'âĩ°'), + ('\u{2d7f}', '\u{2d7f}'), + ('ã¸', 'ãē'), + ('ę ', 'ęĄ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ęŠ'), + ('ęē', 'ęē'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('īŽ˛', 'ī¯'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ ', 'đ¨¸'), + ('đ', 'đ'), + ('đ ', 'đŽ'), + ('đą', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ°'), + ('đ˛', 'đŧ'), + ('đž', 'đž'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ
¸'), + ('đ
ē', 'đ
ē'), + ('đ
Ŋ', 'đ
ž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĻ', 'đŋ'), + ('đ', 'đ'), + ('đ˛', 'đē'), + ('đ', 'đ'), + ('đ', 'đ '), + ('đ°', 'đĩ'), + ('đˇ', 'đŧ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đ°'), + ('đ', 'đž'), + ('đ', 'đ'), + ('đ', 'đˇ'), + ('đš', 'đŧ'), + ('đ', 'đŊ'), + ('đ', 'đ§'), + ('đģ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đĨ'), + ('đ¨', 'đĢ'), + ('đ', 'đ'), + ('đ°', 'đŗ'), + ('đĩ', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ
'), + ('đ', 'đŗ'), + ('đĢ', 'đĢ '), +]; + +pub const V6_1: &'static [(char, char)] = &[ + ('Ö', 'Ö'), + ('\u{604}', '\u{604}'), + ('āĸ ', 'āĸ '), + ('āĸĸ', 'āĸŦ'), + ('\u{8e4}', '\u{8fe}'), + ('āĢ°', 'āĢ°'), + ('āģ', 'āģ'), + ('á', 'á'), + ('á', 'á'), + ('áŊ', 'áŋ'), + ('\u{1bab}', '\u{1bad}'), + ('áŽē', 'áŽŋ'), + ('áŗ', 'áŗ'), + ('áŗŗ', 'áŗļ'), + ('â', 'â'), + ('â', 'â'), + ('âŗ˛', 'âŗŗ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('âĩĻ', 'âĩ§'), + ('⸲', 'â¸ģ'), + ('éŋ', 'éŋ'), + ('\u{a674}', '\u{a67b}'), + ('\u{a69f}', '\u{a69f}'), + ('ę', 'ę'), + ('ęĒ', 'ęĒ'), + ('ę¸', 'ęš'), + ('ęĢ ', '\u{aaf6}'), + ('ī¨Ž', 'ī¨¯'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ', 'đ¨'), + ('đ°', 'đš'), + ('\u{11100}', '\u{11134}'), + ('đļ', 'đ
'), + ('\u{11180}', 'đ'), + ('đ', 'đ'), + ('đ', '\u{116b7}'), + ('đ', 'đ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊž'), + ('\u{16f8f}', 'đž'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đģ°', 'đģą'), + ('đ
Ē', 'đ
Ģ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĻ', 'đ§'), + ('đŦ', 'đŦ'), + ('đŽ', 'đ¯'), + ('đ´', 'đ´'), +]; + +pub const V6_2: &'static [(char, char)] = &[('âē', 'âē')]; + +pub const V6_3: &'static [(char, char)] = + &[('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}')]; + +pub const V7_0: &'static [(char, char)] = &[ + ('Íŋ', 'Íŋ'), + ('Ô¨', 'Ô¯'), + ('Ö', 'Ö'), + ('\u{605}', '\u{605}'), + ('āĸĄ', 'āĸĄ'), + ('āĸ', 'āĸ˛'), + ('\u{8ff}', '\u{8ff}'), + ('āĨ¸', 'āĨ¸'), + ('āĻ', 'āĻ'), + ('\u{c00}', '\u{c00}'), + ('ā°´', 'ā°´'), + ('\u{c81}', '\u{c81}'), + ('\u{d01}', '\u{d01}'), + ('āˇĻ', 'āˇ¯'), + ('áą', 'á¸'), + ('á¤', 'á¤'), + ('\u{1ab0}', '\u{1abe}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1de7}', '\u{1df5}'), + ('âģ', 'âŊ'), + ('â´', 'âē'), + ('â', 'â'), + ('â', 'â'), + ('â', 'âŗ'), + ('âļ', 'âŽ'), + ('âŽ', '⎚'), + ('âŽŊ', 'â¯'), + ('â¯', 'â¯'), + ('â¸ŧ', 'âš'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĢ', 'ę'), + ('ę°', 'ęą'), + ('ęˇ', 'ęˇ'), + ('ę§ ', '꧞'), + ('\u{aa7c}', 'ęŠŋ'), + ('ęŦ°', 'ę'), + ('ę¤', 'ęĨ'), + ('\u{fe27}', '\u{fe2d}'), + ('đ', 'đ'), + ('đ ', 'đ '), + ('\u{102e0}', 'đģ'), + ('đ', 'đ'), + ('đ', '\u{1037a}'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ¯', 'đ¯'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đĄ ', 'đĸ'), + ('đĸ§', 'đĸ¯'), + ('đĒ', 'đĒ'), + ('đĢ', '\u{10ae6}'), + ('đĢĢ', 'đĢļ'), + ('đŽ', 'đŽ'), + ('đŽ', 'đŽ'), + ('đŽŠ', 'đŽ¯'), + ('\u{1107f}', '\u{1107f}'), + ('đ
', 'đ
ļ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĄ', 'đ´'), + ('đ', 'đ'), + ('đ', 'đŊ'), + ('đ°', '\u{112ea}'), + ('đ°', 'đš'), + ('\u{11301}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('\u{1133c}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{115b5}'), + ('đ¸', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ ', 'đŖ˛'), + ('đŖŋ', 'đŖŋ'), + ('đĢ', 'đĢ¸'), + ('đ¯', 'đ'), + ('đŖ', 'đŽ'), + ('đ´', 'đ´'), + ('đŠ', 'đŠ'), + ('đŠ ', 'đŠŠ'), + ('đŠŽ', 'đŠ¯'), + ('đĢ', 'đĢ'), + ('\u{16af0}', 'đĢĩ'), + ('đŦ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ˛', '\u{1bca3}'), + ('đ ', 'đŖ'), + ('đŖ', '\u{1e8d6}'), + ('đŋ', 'đŋ'), + ('đ ', 'đĩ'), + ('đ', 'đ'), + ('đĄ', 'đŦ'), + ('đļ', 'đļ'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đą', 'đˇ'), + ('đŋ', 'đŋ'), + ('đ', 'đ'), + ('đ¸', 'đ¸'), + ('đŊ', 'đž'), + ('đž', 'đŋ'), + ('đ', 'đ'), + ('đ¨', 'đš'), + ('đģ', 'đŖ'), + ('đĨ', 'đē'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('đ', 'đ'), + ('đ ', 'đŦ'), + ('đ°', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đ '), + ('đ ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đĄ ', 'đĸ'), + ('đĸ', 'đĸ'), +]; + +pub const V8_0: &'static [(char, char)] = &[ + ('āĸŗ', 'āĸ´'), + ('\u{8e3}', '\u{8e3}'), + ('āĢš', 'āĢš'), + ('āą', 'āą'), + ('āĩ', 'āĩ'), + ('áĩ', 'áĩ'), + ('á¸', 'áŊ'), + ('âž', 'âž'), + ('â', 'â'), + ('â¯Ŧ', '⯯'), + ('éŋ', 'éŋ'), + ('\u{a69e}', '\u{a69e}'), + ('ę', 'ę'), + ('ę˛', 'ęˇ'), + ('ęŖŧ', 'ęŖŊ'), + ('ę ', 'ęŖ'), + ('ę°', 'ęŽŋ'), + ('\u{fe2e}', '\u{fe2f}'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đŖģ', 'đŖŋ'), + ('đĻŧ', 'đĻŊ'), + ('đ§', 'đ§'), + ('đ§', 'đ§ŋ'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đŗē', 'đŗŋ'), + ('\u{111c9}', '\u{111cc}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('\u{11300}', '\u{11300}'), + ('đ', 'đ'), + ('đ', '\u{115dd}'), + ('đ', 'đ'), + ('\u{1171d}', '\u{1172b}'), + ('đ°', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ ', 'đĒ'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('đ', 'đ¯'), + ('đž', 'đŋ'), + ('đ', 'đ'), + ('đ¸', 'đŋ'), + ('đŋ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¤', 'đ¤'), + ('đĻ', 'đĻ'), + ('đ§', 'đ§'), + ('đĢ ', 'đŦēĄ'), +]; + +pub const V9_0: &'static [(char, char)] = &[ + ('āĸļ', 'āĸŊ'), + ('\u{8d4}', '\u{8e2}'), + ('ā˛', 'ā˛'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩļ', 'āĩ¸'), + ('á˛', 'á˛'), + ('\u{1dfb}', '\u{1dfb}'), + ('âģ', 'âž'), + ('âš', 'âš'), + ('ęŽ', 'ęŽ'), + ('\u{a8c5}', '\u{a8c5}'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('\u{1123e}', '\u{1123e}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŦ'), + ('đ°', 'đ°'), + ('đ°', '\u{11c36}'), + ('\u{11c38}', 'đą
'), + ('đą', 'đąŦ'), + ('đą°', 'đ˛'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('đŋ ', 'đŋ '), + ('đ', 'đŦ'), + ('đ ', 'đĢ˛'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('đ¤', '\u{1e94a}'), + ('đĨ', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ', 'đŦ'), + ('đģ', 'đģ'), + ('đē', 'đē'), + ('đ¤', 'đ¤'), + ('đ', 'đ'), + ('đ´', 'đļ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤§'), + ('đ¤°', 'đ¤°'), + ('đ¤ŗ', 'đ¤ž'), + ('đĨ', 'đĨ'), + ('đĨ', 'đĨ'), + ('đĻ
', 'đĻ'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs b/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs new file mode 100644 index 000000000..cfb83f363 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs @@ -0,0 +1,2808 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple ucd-13.0.0 --chars --all-pairs +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[ + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k', 'âĒ']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s', 'Åŋ']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K', 'âĒ']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S', 'Åŋ']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('Âĩ', &['Î', 'Îŧ']), + ('Ã', &['à ']), + ('Ã', &['ÃĄ']), + ('Ã', &['Ãĸ']), + ('Ã', &['ÃŖ']), + ('Ã', &['ä']), + ('Ã
', &['ÃĨ', 'âĢ']), + ('Ã', &['ÃĻ']), + ('Ã', &['ç']), + ('Ã', &['è']), + ('Ã', &['Ê']), + ('Ã', &['ÃĒ']), + ('Ã', &['ÃĢ']), + ('Ã', &['ÃŦ']), + ('Ã', &['Ã']), + ('Ã', &['ÃŽ']), + ('Ã', &['ï']), + ('Ã', &['ð']), + ('Ã', &['Ãą']), + ('Ã', &['Ã˛']), + ('Ã', &['Ãŗ']), + ('Ã', &['ô']), + ('Ã', &['Ãĩ']), + ('Ã', &['Ãļ']), + ('Ã', &['ø']), + ('Ã', &['Ú']), + ('Ã', &['Ãē']), + ('Ã', &['Ãģ']), + ('Ã', &['Ãŧ']), + ('Ã', &['ÃŊ']), + ('Ã', &['Þ']), + ('Ã', &['áē']), + ('à ', &['Ã']), + ('ÃĄ', &['Ã']), + ('Ãĸ', &['Ã']), + ('ÃŖ', &['Ã']), + ('ä', &['Ã']), + ('ÃĨ', &['Ã
', 'âĢ']), + ('ÃĻ', &['Ã']), + ('ç', &['Ã']), + ('è', &['Ã']), + ('Ê', &['Ã']), + ('ÃĒ', &['Ã']), + ('ÃĢ', &['Ã']), + ('ÃŦ', &['Ã']), + ('Ã', &['Ã']), + ('ÃŽ', &['Ã']), + ('ï', &['Ã']), + ('ð', &['Ã']), + ('Ãą', &['Ã']), + ('Ã˛', &['Ã']), + ('Ãŗ', &['Ã']), + ('ô', &['Ã']), + ('Ãĩ', &['Ã']), + ('Ãļ', &['Ã']), + ('ø', &['Ã']), + ('Ú', &['Ã']), + ('Ãē', &['Ã']), + ('Ãģ', &['Ã']), + ('Ãŧ', &['Ã']), + ('ÃŊ', &['Ã']), + ('Þ', &['Ã']), + ('Ãŋ', &['Ÿ']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä
']), + ('Ä
', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä', &['Ä']), + ('Ä ', &['ÄĄ']), + ('ÄĄ', &['Ä ']), + ('Äĸ', &['ÄŖ']), + ('ÄŖ', &['Äĸ']), + ('Ĥ', &['ÄĨ']), + ('ÄĨ', &['Ĥ']), + ('ÄĻ', &['ħ']), + ('ħ', &['ÄĻ']), + ('Ĩ', &['ÄŠ']), + ('ÄŠ', &['Ĩ']), + ('ÄĒ', &['ÄĢ']), + ('ÄĢ', &['ÄĒ']), + ('ÄŦ', &['Ä']), + ('Ä', &['ÄŦ']), + ('ÄŽ', &['į']), + ('į', &['ÄŽ']), + ('IJ', &['Äŗ']), + ('Äŗ', &['IJ']), + ('Ä´', &['Äĩ']), + ('Äĩ', &['Ä´']), + ('Äļ', &['ġ']), + ('ġ', &['Äļ']), + ('Äš', &['Äē']), + ('Äē', &['Äš']), + ('Äģ', &['Äŧ']), + ('Äŧ', &['Äģ']), + ('ÄŊ', &['Äž']), + ('Äž', &['ÄŊ']), + ('Äŋ', &['Å']), + ('Å', &['Äŋ']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å
', &['Å']), + ('Å', &['Å
']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å', &['Å']), + ('Å ', &['ÅĄ']), + ('ÅĄ', &['Å ']), + ('Åĸ', &['ÅŖ']), + ('ÅŖ', &['Åĸ']), + ('Ť', &['ÅĨ']), + ('ÅĨ', &['Ť']), + ('ÅĻ', &['ŧ']), + ('ŧ', &['ÅĻ']), + ('Ũ', &['ÅŠ']), + ('ÅŠ', &['Ũ']), + ('ÅĒ', &['ÅĢ']), + ('ÅĢ', &['ÅĒ']), + ('ÅŦ', &['Å']), + ('Å', &['ÅŦ']), + ('ÅŽ', &['ů']), + ('ů', &['ÅŽ']), + ('Å°', &['Åą']), + ('Åą', &['Å°']), + ('Å˛', &['Åŗ']), + ('Åŗ', &['Å˛']), + ('Å´', &['Åĩ']), + ('Åĩ', &['Å´']), + ('Åļ', &['Åˇ']), + ('Åˇ', &['Åļ']), + ('Ÿ', &['Ãŋ']), + ('Åš', &['Åē']), + ('Åē', &['Åš']), + ('Åģ', &['Åŧ']), + ('Åŧ', &['Åģ']), + ('ÅŊ', &['Åž']), + ('Åž', &['ÅŊ']), + ('Åŋ', &['S', 's']), + ('Æ', &['É']), + ('Æ', &['É']), + ('Æ', &['Æ']), + ('Æ', &['Æ']), + ('Æ', &['Æ
']), + ('Æ
', &['Æ']), + ('Æ', &['É']), + ('Æ', &['Æ']), + ('Æ', &['Æ']), + ('Æ', &['É']), + ('Æ', &['É']), + ('Æ', &['Æ']), + ('Æ', &['Æ']), + ('Æ', &['Į']), + ('Æ', &['É']), + ('Æ', &['É']), + ('Æ', &['Æ']), + ('Æ', &['Æ']), + ('Æ', &['É ']), + ('Æ', &['ÉŖ']), + ('Æ', &['Įļ']), + ('Æ', &['ÉŠ']), + ('Æ', &['ɨ']), + ('Æ', &['Æ']), + ('Æ', &['Æ']), + ('Æ', &['ČŊ']), + ('Æ', &['ɯ']), + ('Æ', &['ɲ']), + ('Æ', &['Č ']), + ('Æ', &['Éĩ']), + ('Æ ', &['ÆĄ']), + ('ÆĄ', &['Æ ']), + ('Æĸ', &['ÆŖ']), + ('ÆŖ', &['Æĸ']), + ('Ƥ', &['ÆĨ']), + ('ÆĨ', &['Ƥ']), + ('ÆĻ', &['Ę']), + ('Ƨ', &['ƨ']), + ('ƨ', &['Ƨ']), + ('ÆŠ', &['Ę']), + ('ÆŦ', &['Æ']), + ('Æ', &['ÆŦ']), + ('ÆŽ', &['Ę']), + ('Ư', &['Æ°']), + ('Æ°', &['Ư']), + ('Æą', &['Ę']), + ('Æ˛', &['Ę']), + ('Æŗ', &['Æ´']), + ('Æ´', &['Æŗ']), + ('Æĩ', &['Æļ']), + ('Æļ', &['Æĩ']), + ('Æˇ', &['Ę']), + ('Ƹ', &['Æš']), + ('Æš', &['Ƹ']), + ('Æŧ', &['ÆŊ']), + ('ÆŊ', &['Æŧ']), + ('Æŋ', &['Įˇ']), + ('Į', &['Į
', 'Į']), + ('Į
', &['Į', 'Į']), + ('Į', &['Į', 'Į
']), + ('Į', &['Į', 'Į']), + ('Į', &['Į', 'Į']), + ('Į', &['Į', 'Į']), + ('Į', &['Į', 'Į']), + ('Į', &['Į', 'Į']), + ('Į', &['Į', 'Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į', &['Æ']), + ('Į', &['Į']), + ('Į', &['Į']), + ('Į ', &['ĮĄ']), + ('ĮĄ', &['Į ']), + ('Įĸ', &['ĮŖ']), + ('ĮŖ', &['Įĸ']), + ('Į¤', &['ĮĨ']), + ('ĮĨ', &['Į¤']), + ('ĮĻ', &['Į§']), + ('Į§', &['ĮĻ']), + ('Į¨', &['ĮŠ']), + ('ĮŠ', &['Į¨']), + ('ĮĒ', &['ĮĢ']), + ('ĮĢ', &['ĮĒ']), + ('ĮŦ', &['Į']), + ('Į', &['ĮŦ']), + ('ĮŽ', &['Į¯']), + ('Į¯', &['ĮŽ']), + ('Įą', &['Į˛', 'Įŗ']), + ('Į˛', &['Įą', 'Įŗ']), + ('Įŗ', &['Įą', 'Į˛']), + ('Į´', &['Įĩ']), + ('Įĩ', &['Į´']), + ('Įļ', &['Æ']), + ('Įˇ', &['Æŋ']), + ('Į¸', &['Įš']), + ('Įš', &['Į¸']), + ('Įē', &['Įģ']), + ('Įģ', &['Įē']), + ('Įŧ', &['ĮŊ']), + ('ĮŊ', &['Įŧ']), + ('Įž', &['Įŋ']), + ('Įŋ', &['Įž']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č
']), + ('Č
', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č', &['Č']), + ('Č ', &['Æ']), + ('Čĸ', &['ČŖ']), + ('ČŖ', &['Čĸ']), + ('Ȥ', &['ČĨ']), + ('ČĨ', &['Ȥ']), + ('ČĻ', &['ȧ']), + ('ȧ', &['ČĻ']), + ('Ȩ', &['ČŠ']), + ('ČŠ', &['Ȩ']), + ('ČĒ', &['ČĢ']), + ('ČĢ', &['ČĒ']), + ('ČŦ', &['Č']), + ('Č', &['ČŦ']), + ('ČŽ', &['Č¯']), + ('Č¯', &['ČŽ']), + ('Č°', &['Čą']), + ('Čą', &['Č°']), + ('Ȳ', &['Čŗ']), + ('Čŗ', &['Ȳ']), + ('Čē', &['âąĨ']), + ('Čģ', &['Čŧ']), + ('Čŧ', &['Čģ']), + ('ČŊ', &['Æ']), + ('Čž', &['âąĻ']), + ('Čŋ', &['âąž']), + ('É', &['âąŋ']), + ('É', &['É']), + ('É', &['É']), + ('É', &['Æ']), + ('É', &['Ę']), + ('É
', &['Ę']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['É']), + ('É', &['âą¯']), + ('É', &['âą']), + ('É', &['âą°']), + ('É', &['Æ']), + ('É', &['Æ']), + ('É', &['Æ']), + ('É', &['Æ']), + ('É', &['Æ']), + ('É', &['Æ']), + ('É', &['ęĢ']), + ('É ', &['Æ']), + ('ÉĄ', &['ęŦ']), + ('ÉŖ', &['Æ']), + ('ÉĨ', &['ę']), + ('ÉĻ', &['ęĒ']), + ('ɨ', &['Æ']), + ('ÉŠ', &['Æ']), + ('ÉĒ', &['ęŽ']), + ('ÉĢ', &['âąĸ']), + ('ÉŦ', &['ę']), + ('ɯ', &['Æ']), + ('Éą', &['⹎']), + ('ɲ', &['Æ']), + ('Éĩ', &['Æ']), + ('ÉŊ', &['⹤']), + ('Ę', &['ÆĻ']), + ('Ę', &['ę
']), + ('Ę', &['ÆŠ']), + ('Ę', &['ęą']), + ('Ę', &['ÆŽ']), + ('Ę', &['É']), + ('Ę', &['Æą']), + ('Ę', &['Æ˛']), + ('Ę', &['É
']), + ('Ę', &['Æˇ']), + ('Ę', &['ę˛']), + ('Ę', &['ę°']), + ('\u{345}', &['Î', 'Κ', 'ážž']), + ('Í°', &['Íą']), + ('Íą', &['Í°']), + ('Ͳ', &['Íŗ']), + ('Íŗ', &['Ͳ']), + ('Íļ', &['͡']), + ('͡', &['Íļ']), + ('Íģ', &['ĪŊ']), + ('Íŧ', &['Īž']), + ('ÍŊ', &['Īŋ']), + ('Íŋ', &['Īŗ']), + ('Î', &['ÎŦ']), + ('Î', &['Î']), + ('Î', &['ÎŽ']), + ('Î', &['ί']), + ('Î', &['Ī']), + ('Î', &['Ī']), + ('Î', &['Ī']), + ('Î', &['Îą']), + ('Î', &['β', 'Ī']), + ('Î', &['Îŗ']), + ('Î', &['δ']), + ('Î', &['Îĩ', 'Īĩ']), + ('Î', &['Îļ']), + ('Î', &['Ρ']), + ('Î', &['θ', 'Ī', 'Ī´']), + ('Î', &['\u{345}', 'Κ', 'ážž']), + ('Î', &['Îē', 'Ī°']), + ('Î', &['Îģ']), + ('Î', &['Âĩ', 'Îŧ']), + ('Î', &['ÎŊ']), + ('Î', &['Ξ']), + ('Î', &['Îŋ']), + ('Î ', &['Ī', 'Ī']), + ('ÎĄ', &['Ī', 'Īą']), + ('ÎŖ', &['Ī', 'Ī']), + ('Τ', &['Ī']), + ('ÎĨ', &['Ī
']), + ('ÎĻ', &['Ī', 'Ī']), + ('Χ', &['Ī']), + ('Ψ', &['Ī']), + ('Ί', &['Ī', 'âĻ']), + ('ÎĒ', &['Ī']), + ('ÎĢ', &['Ī']), + ('ÎŦ', &['Î']), + ('Î', &['Î']), + ('ÎŽ', &['Î']), + ('ί', &['Î']), + ('Îą', &['Î']), + ('β', &['Î', 'Ī']), + ('Îŗ', &['Î']), + ('δ', &['Î']), + ('Îĩ', &['Î', 'Īĩ']), + ('Îļ', &['Î']), + ('Ρ', &['Î']), + ('θ', &['Î', 'Ī', 'Ī´']), + ('Κ', &['\u{345}', 'Î', 'ážž']), + ('Îē', &['Î', 'Ī°']), + ('Îģ', &['Î']), + ('Îŧ', &['Âĩ', 'Î']), + ('ÎŊ', &['Î']), + ('Ξ', &['Î']), + ('Îŋ', &['Î']), + ('Ī', &['Î ', 'Ī']), + ('Ī', &['ÎĄ', 'Īą']), + ('Ī', &['ÎŖ', 'Ī']), + ('Ī', &['ÎŖ', 'Ī']), + ('Ī', &['Τ']), + ('Ī
', &['ÎĨ']), + ('Ī', &['ÎĻ', 'Ī']), + ('Ī', &['Χ']), + ('Ī', &['Ψ']), + ('Ī', &['Ί', 'âĻ']), + ('Ī', &['ÎĒ']), + ('Ī', &['ÎĢ']), + ('Ī', &['Î']), + ('Ī', &['Î']), + ('Ī', &['Î']), + ('Ī', &['Ī']), + ('Ī', &['Î', 'β']), + ('Ī', &['Î', 'θ', 'Ī´']), + ('Ī', &['ÎĻ', 'Ī']), + ('Ī', &['Î ', 'Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī', &['Ī']), + ('Ī ', &['ĪĄ']), + ('ĪĄ', &['Ī ']), + ('Īĸ', &['ĪŖ']), + ('ĪŖ', &['Īĸ']), + ('Ī¤', &['ĪĨ']), + ('ĪĨ', &['Ī¤']), + ('ĪĻ', &['Ī§']), + ('Ī§', &['ĪĻ']), + ('Ī¨', &['ĪŠ']), + ('ĪŠ', &['Ī¨']), + ('ĪĒ', &['ĪĢ']), + ('ĪĢ', &['ĪĒ']), + ('ĪŦ', &['Ī']), + ('Ī', &['ĪŦ']), + ('ĪŽ', &['Ī¯']), + ('Ī¯', &['ĪŽ']), + ('Ī°', &['Î', 'Îē']), + ('Īą', &['ÎĄ', 'Ī']), + ('Ī˛', &['Īš']), + ('Īŗ', &['Íŋ']), + ('Ī´', &['Î', 'θ', 'Ī']), + ('Īĩ', &['Î', 'Îĩ']), + ('Īˇ', &['Ī¸']), + ('Ī¸', &['Īˇ']), + ('Īš', &['Ī˛']), + ('Īē', &['Īģ']), + ('Īģ', &['Īē']), + ('ĪŊ', &['Íģ']), + ('Īž', &['Íŧ']), + ('Īŋ', &['ÍŊ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ
', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Ņ']), + ('Đ', &['Đ°']), + ('Đ', &['Đą']), + ('Đ', &['в', 'á˛']), + ('Đ', &['Đŗ']), + ('Đ', &['Đ´', 'á˛']), + ('Đ', &['Đĩ']), + ('Đ', &['Đļ']), + ('Đ', &['С']), + ('Đ', &['и']), + ('Đ', &['Đš']), + ('Đ', &['Đē']), + ('Đ', &['Đģ']), + ('Đ', &['Đŧ']), + ('Đ', &['ĐŊ']), + ('Đ', &['Đž', 'á˛']), + ('Đ', &['Đŋ']), + ('Đ ', &['Ņ']), + ('ĐĄ', &['Ņ', 'á˛']), + ('Đĸ', &['Ņ', 'á˛', 'á˛
']), + ('ĐŖ', &['Ņ']), + ('Ф', &['Ņ']), + ('ĐĨ', &['Ņ
']), + ('ĐĻ', &['Ņ']), + ('Ч', &['Ņ']), + ('Ш', &['Ņ']), + ('ĐŠ', &['Ņ']), + ('ĐĒ', &['Ņ', 'á˛']), + ('ĐĢ', &['Ņ']), + ('ĐŦ', &['Ņ']), + ('Đ', &['Ņ']), + ('ĐŽ', &['Ņ']), + ('Đ¯', &['Ņ']), + ('Đ°', &['Đ']), + ('Đą', &['Đ']), + ('в', &['Đ', 'á˛']), + ('Đŗ', &['Đ']), + ('Đ´', &['Đ', 'á˛']), + ('Đĩ', &['Đ']), + ('Đļ', &['Đ']), + ('С', &['Đ']), + ('и', &['Đ']), + ('Đš', &['Đ']), + ('Đē', &['Đ']), + ('Đģ', &['Đ']), + ('Đŧ', &['Đ']), + ('ĐŊ', &['Đ']), + ('Đž', &['Đ', 'á˛']), + ('Đŋ', &['Đ']), + ('Ņ', &['Đ ']), + ('Ņ', &['ĐĄ', 'á˛']), + ('Ņ', &['Đĸ', 'á˛', 'á˛
']), + ('Ņ', &['ĐŖ']), + ('Ņ', &['Ф']), + ('Ņ
', &['ĐĨ']), + ('Ņ', &['ĐĻ']), + ('Ņ', &['Ч']), + ('Ņ', &['Ш']), + ('Ņ', &['ĐŠ']), + ('Ņ', &['ĐĒ', 'á˛']), + ('Ņ', &['ĐĢ']), + ('Ņ', &['ĐŦ']), + ('Ņ', &['Đ']), + ('Ņ', &['ĐŽ']), + ('Ņ', &['Đ¯']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ
']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ', &['Đ']), + ('Ņ ', &['ŅĄ']), + ('ŅĄ', &['Ņ ']), + ('Ņĸ', &['ŅŖ', 'á˛']), + ('ŅŖ', &['Ņĸ', 'á˛']), + ('Ņ¤', &['ŅĨ']), + ('ŅĨ', &['Ņ¤']), + ('ŅĻ', &['Ņ§']), + ('Ņ§', &['ŅĻ']), + ('Ņ¨', &['ŅŠ']), + ('ŅŠ', &['Ņ¨']), + ('ŅĒ', &['ŅĢ']), + ('ŅĢ', &['ŅĒ']), + ('ŅŦ', &['Ņ']), + ('Ņ', &['ŅŦ']), + ('ŅŽ', &['Ņ¯']), + ('Ņ¯', &['ŅŽ']), + ('Ņ°', &['Ņą']), + ('Ņą', &['Ņ°']), + ('Ņ˛', &['Ņŗ']), + ('Ņŗ', &['Ņ˛']), + ('Ņ´', &['Ņĩ']), + ('Ņĩ', &['Ņ´']), + ('Ņļ', &['Ņˇ']), + ('Ņˇ', &['Ņļ']), + ('Ņ¸', &['Ņš']), + ('Ņš', &['Ņ¸']), + ('Ņē', &['Ņģ']), + ('Ņģ', &['Ņē']), + ('Ņŧ', &['ŅŊ']), + ('ŅŊ', &['Ņŧ']), + ('Ņž', &['Ņŋ']), + ('Ņŋ', &['Ņž']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō', &['Ō']), + ('Ō ', &['ŌĄ']), + ('ŌĄ', &['Ō ']), + ('Ōĸ', &['ŌŖ']), + ('ŌŖ', &['Ōĸ']), + ('Ō¤', &['ŌĨ']), + ('ŌĨ', &['Ō¤']), + ('ŌĻ', &['Ō§']), + ('Ō§', &['ŌĻ']), + ('Ō¨', &['ŌŠ']), + ('ŌŠ', &['Ō¨']), + ('ŌĒ', &['ŌĢ']), + ('ŌĢ', &['ŌĒ']), + ('ŌŦ', &['Ō']), + ('Ō', &['ŌŦ']), + ('ŌŽ', &['Ō¯']), + ('Ō¯', &['ŌŽ']), + ('Ō°', &['Ōą']), + ('Ōą', &['Ō°']), + ('Ō˛', &['Ōŗ']), + ('Ōŗ', &['Ō˛']), + ('Ō´', &['Ōĩ']), + ('Ōĩ', &['Ō´']), + ('Ōļ', &['Ōˇ']), + ('Ōˇ', &['Ōļ']), + ('Ō¸', &['Ōš']), + ('Ōš', &['Ō¸']), + ('Ōē', &['Ōģ']), + ('Ōģ', &['Ōē']), + ('Ōŧ', &['ŌŊ']), + ('ŌŊ', &['Ōŧ']), + ('Ōž', &['Ōŋ']), + ('Ōŋ', &['Ōž']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ
', &['Ķ']), + ('Ķ', &['Ķ
']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ', &['Ķ']), + ('Ķ ', &['ĶĄ']), + ('ĶĄ', &['Ķ ']), + ('Ķĸ', &['ĶŖ']), + ('ĶŖ', &['Ķĸ']), + ('Ķ¤', &['ĶĨ']), + ('ĶĨ', &['Ķ¤']), + ('ĶĻ', &['Ķ§']), + ('Ķ§', &['ĶĻ']), + ('Ķ¨', &['ĶŠ']), + ('ĶŠ', &['Ķ¨']), + ('ĶĒ', &['ĶĢ']), + ('ĶĢ', &['ĶĒ']), + ('ĶŦ', &['Ķ']), + ('Ķ', &['ĶŦ']), + ('ĶŽ', &['Ķ¯']), + ('Ķ¯', &['ĶŽ']), + ('Ķ°', &['Ķą']), + ('Ķą', &['Ķ°']), + ('Ķ˛', &['Ķŗ']), + ('Ķŗ', &['Ķ˛']), + ('Ķ´', &['Ķĩ']), + ('Ķĩ', &['Ķ´']), + ('Ķļ', &['Ķˇ']), + ('Ķˇ', &['Ķļ']), + ('Ķ¸', &['Ķš']), + ('Ķš', &['Ķ¸']), + ('Ķē', &['Ķģ']), + ('Ķģ', &['Ķē']), + ('Ķŧ', &['ĶŊ']), + ('ĶŊ', &['Ķŧ']), + ('Ķž', &['Ķŋ']), + ('Ķŋ', &['Ķž']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô
']), + ('Ô
', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô', &['Ô']), + ('Ô ', &['ÔĄ']), + ('ÔĄ', &['Ô ']), + ('Ôĸ', &['ÔŖ']), + ('ÔŖ', &['Ôĸ']), + ('Ô¤', &['ÔĨ']), + ('ÔĨ', &['Ô¤']), + ('ÔĻ', &['Ô§']), + ('Ô§', &['ÔĻ']), + ('Ô¨', &['ÔŠ']), + ('ÔŠ', &['Ô¨']), + ('ÔĒ', &['ÔĢ']), + ('ÔĢ', &['ÔĒ']), + ('ÔŦ', &['Ô']), + ('Ô', &['ÔŦ']), + ('ÔŽ', &['Ô¯']), + ('Ô¯', &['ÔŽ']), + ('Ôą', &['ÕĄ']), + ('Ô˛', &['Õĸ']), + ('Ôŗ', &['ÕŖ']), + ('Ô´', &['Õ¤']), + ('Ôĩ', &['ÕĨ']), + ('Ôļ', &['ÕĻ']), + ('Ôˇ', &['Õ§']), + ('Ô¸', &['Õ¨']), + ('Ôš', &['ÕŠ']), + ('Ôē', &['ÕĒ']), + ('Ôģ', &['ÕĢ']), + ('Ôŧ', &['ÕŦ']), + ('ÔŊ', &['Õ']), + ('Ôž', &['ÕŽ']), + ('Ôŋ', &['Õ¯']), + ('Õ', &['Õ°']), + ('Õ', &['Õą']), + ('Õ', &['Õ˛']), + ('Õ', &['Õŗ']), + ('Õ', &['Õ´']), + ('Õ
', &['Õĩ']), + ('Õ', &['Õļ']), + ('Õ', &['Õˇ']), + ('Õ', &['Õ¸']), + ('Õ', &['Õš']), + ('Õ', &['Õē']), + ('Õ', &['Õģ']), + ('Õ', &['Õŧ']), + ('Õ', &['ÕŊ']), + ('Õ', &['Õž']), + ('Õ', &['Õŋ']), + ('Õ', &['Ö']), + ('Õ', &['Ö']), + ('Õ', &['Ö']), + ('Õ', &['Ö']), + ('Õ', &['Ö']), + ('Õ', &['Ö
']), + ('Õ', &['Ö']), + ('ÕĄ', &['Ôą']), + ('Õĸ', &['Ô˛']), + ('ÕŖ', &['Ôŗ']), + ('Õ¤', &['Ô´']), + ('ÕĨ', &['Ôĩ']), + ('ÕĻ', &['Ôļ']), + ('Õ§', &['Ôˇ']), + ('Õ¨', &['Ô¸']), + ('ÕŠ', &['Ôš']), + ('ÕĒ', &['Ôē']), + ('ÕĢ', &['Ôģ']), + ('ÕŦ', &['Ôŧ']), + ('Õ', &['ÔŊ']), + ('ÕŽ', &['Ôž']), + ('Õ¯', &['Ôŋ']), + ('Õ°', &['Õ']), + ('Õą', &['Õ']), + ('Õ˛', &['Õ']), + ('Õŗ', &['Õ']), + ('Õ´', &['Õ']), + ('Õĩ', &['Õ
']), + ('Õļ', &['Õ']), + ('Õˇ', &['Õ']), + ('Õ¸', &['Õ']), + ('Õš', &['Õ']), + ('Õē', &['Õ']), + ('Õģ', &['Õ']), + ('Õŧ', &['Õ']), + ('ÕŊ', &['Õ']), + ('Õž', &['Õ']), + ('Õŋ', &['Õ']), + ('Ö', &['Õ']), + ('Ö', &['Õ']), + ('Ö', &['Õ']), + ('Ö', &['Õ']), + ('Ö', &['Õ']), + ('Ö
', &['Õ']), + ('Ö', &['Õ']), + ('á ', &['â´']), + ('áĄ', &['â´']), + ('áĸ', &['â´']), + ('áŖ', &['â´']), + ('á¤', &['â´']), + ('áĨ', &['â´
']), + ('áĻ', &['â´']), + ('á§', &['â´']), + ('á¨', &['â´']), + ('áŠ', &['â´']), + ('áĒ', &['â´']), + ('áĢ', &['â´']), + ('áŦ', &['â´']), + ('á', &['â´']), + ('áŽ', &['â´']), + ('á¯', &['â´']), + ('á°', &['â´']), + ('áą', &['â´']), + ('á˛', &['â´']), + ('áŗ', &['â´']), + ('á´', &['â´']), + ('áĩ', &['â´']), + ('áļ', &['â´']), + ('áˇ', &['â´']), + ('á¸', &['â´']), + ('áš', &['â´']), + ('áē', &['â´']), + ('áģ', &['â´']), + ('áŧ', &['â´']), + ('áŊ', &['â´']), + ('áž', &['â´']), + ('áŋ', &['â´']), + ('á', &['â´ ']), + ('á', &['â´Ą']), + ('á', &['â´ĸ']), + ('á', &['â´Ŗ']), + ('á', &['â´¤']), + ('á
', &['â´Ĩ']), + ('á', &['â´§']), + ('á', &['â´']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á', &['á˛']), + ('á ', &['Რ']), + ('áĄ', &['Ქ']), + ('áĸ', &['á˛ĸ']), + ('áŖ', &['á˛Ŗ']), + ('á¤', &['Ფ']), + ('áĨ', &['á˛Ĩ']), + ('áĻ', &['á˛Ļ']), + ('á§', &['Ყ']), + ('á¨', &['Შ']), + ('áŠ', &['ᲊ']), + ('áĒ', &['á˛Ē']), + ('áĢ', &['á˛Ģ']), + ('áŦ', &['á˛Ŧ']), + ('á', &['á˛']), + ('áŽ', &['᲎']), + ('á¯', &['á˛¯']), + ('á°', &['Ჰ']), + ('áą', &['Ჹ']), + ('á˛', &['Ჲ']), + ('áŗ', &['á˛ŗ']), + ('á´', &['Ჴ']), + ('áĩ', &['á˛ĩ']), + ('áļ', &['á˛ļ']), + ('áˇ', &['Ს']), + ('á¸', &['Ჸ']), + ('áš', &['Ლ']), + ('áē', &['á˛ē']), + ('áŊ', &['á˛Ŋ']), + ('áž', &['Პ']), + ('áŋ', &['á˛ŋ']), + ('á ', &['ę°']), + ('áĄ', &['ęą']), + ('áĸ', &['ę˛']), + ('áŖ', &['ęŗ']), + ('á¤', &['ę´']), + ('áĨ', &['ęĩ']), + ('áĻ', &['ęļ']), + ('á§', &['ęˇ']), + ('á¨', &['ę¸']), + ('áŠ', &['ęš']), + ('áĒ', &['ęē']), + ('áĢ', &['ęģ']), + ('áŦ', &['ęŧ']), + ('á', &['ęŊ']), + ('áŽ', &['ęž']), + ('á¯', &['ęŋ']), + ('á°', &['ęŽ']), + ('áą', &['ęŽ']), + ('á˛', &['ęŽ']), + ('áŗ', &['ęŽ']), + ('á´', &['ęŽ']), + ('áĩ', &['ęŽ
']), + ('áļ', &['ęŽ']), + ('áˇ', &['ęŽ']), + ('á¸', &['ęŽ']), + ('áš', &['ęŽ']), + ('áē', &['ęŽ']), + ('áģ', &['ęŽ']), + ('áŧ', &['ęŽ']), + ('áŊ', &['ęŽ']), + ('áž', &['ęŽ']), + ('áŋ', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á
', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ']), + ('á', &['ęŽ ']), + ('á', &['ꎥ']), + ('á', &['ęŽĸ']), + ('á', &['ęŽŖ']), + ('á', &['ꎤ']), + ('á', &['ęŽĨ']), + ('á', &['ęŽĻ']), + ('á', &['ꎧ']), + ('á', &['ꎨ']), + ('á', &['ꎊ']), + ('á', &['ęŽĒ']), + ('á', &['ęŽĢ']), + ('á', &['ęŽŦ']), + ('á', &['ęŽ']), + ('á', &['ꎎ']), + ('á', &['ęŽ¯']), + ('á ', &['ꎰ']), + ('áĄ', &['ꎹ']), + ('áĸ', &['ꎲ']), + ('áŖ', &['ęŽŗ']), + ('á¤', &['ꎴ']), + ('áĨ', &['ęŽĩ']), + ('áĻ', &['ęŽļ']), + ('á§', &['ꎡ']), + ('á¨', &['ꎸ']), + ('áŠ', &['ꎚ']), + ('áĒ', &['ęŽē']), + ('áĢ', &['ęŽģ']), + ('áŦ', &['ęŽŧ']), + ('á', &['ęŽŊ']), + ('áŽ', &['ꎞ']), + ('á¯', &['ęŽŋ']), + ('á°', &['á¸']), + ('áą', &['áš']), + ('á˛', &['áē']), + ('áŗ', &['áģ']), + ('á´', &['áŧ']), + ('áĩ', &['áŊ']), + ('á¸', &['á°']), + ('áš', &['áą']), + ('áē', &['á˛']), + ('áģ', &['áŗ']), + ('áŧ', &['á´']), + ('áŊ', &['áĩ']), + ('á˛', &['Đ', 'в']), + ('á˛', &['Đ', 'Đ´']), + ('á˛', &['Đ', 'Đž']), + ('á˛', &['ĐĄ', 'Ņ']), + ('á˛', &['Đĸ', 'Ņ', 'á˛
']), + ('á˛
', &['Đĸ', 'Ņ', 'á˛']), + ('á˛', &['ĐĒ', 'Ņ']), + ('á˛', &['Ņĸ', 'ŅŖ']), + ('á˛', &['ę', 'ę']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('á˛', &['á']), + ('Რ', &['á ']), + ('Ქ', &['áĄ']), + ('á˛ĸ', &['áĸ']), + ('á˛Ŗ', &['áŖ']), + ('Ფ', &['á¤']), + ('á˛Ĩ', &['áĨ']), + ('á˛Ļ', &['áĻ']), + ('Ყ', &['á§']), + ('Შ', &['á¨']), + ('ᲊ', &['áŠ']), + ('á˛Ē', &['áĒ']), + ('á˛Ģ', &['áĢ']), + ('á˛Ŧ', &['áŦ']), + ('á˛', &['á']), + ('᲎', &['áŽ']), + ('á˛¯', &['á¯']), + ('Ჰ', &['á°']), + ('Ჹ', &['áą']), + ('Ჲ', &['á˛']), + ('á˛ŗ', &['áŗ']), + ('Ჴ', &['á´']), + ('á˛ĩ', &['áĩ']), + ('á˛ļ', &['áļ']), + ('Ს', &['áˇ']), + ('Ჸ', &['á¸']), + ('Ლ', &['áš']), + ('á˛ē', &['áē']), + ('á˛Ŋ', &['áŊ']), + ('Პ', &['áž']), + ('á˛ŋ', &['áŋ']), + ('áĩš', &['ęŊ']), + ('áĩŊ', &['âąŖ']), + ('áļ', &['ę']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸
']), + ('á¸
', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('á¸', &['á¸']), + ('Ḡ', &['ḥ']), + ('ḥ', &['Ḡ']), + ('á¸ĸ', &['á¸Ŗ']), + ('á¸Ŗ', &['á¸ĸ']), + ('Ḥ', &['á¸Ĩ']), + ('á¸Ĩ', &['Ḥ']), + ('á¸Ļ', &['ḧ']), + ('ḧ', &['á¸Ļ']), + ('Ḩ', &['Ḋ']), + ('Ḋ', &['Ḩ']), + ('á¸Ē', &['á¸Ģ']), + ('á¸Ģ', &['á¸Ē']), + ('á¸Ŧ', &['á¸']), + ('á¸', &['á¸Ŧ']), + ('Ḏ', &['ḯ']), + ('ḯ', &['Ḏ']), + ('Ḱ', &['ḹ']), + ('ḹ', &['Ḱ']), + ('Ḳ', &['á¸ŗ']), + ('á¸ŗ', &['Ḳ']), + ('Ḵ', &['á¸ĩ']), + ('á¸ĩ', &['Ḵ']), + ('á¸ļ', &['ḡ']), + ('ḡ', &['á¸ļ']), + ('Ḹ', &['Ḛ']), + ('Ḛ', &['Ḹ']), + ('á¸ē', &['á¸ģ']), + ('á¸ģ', &['á¸ē']), + ('á¸ŧ', &['á¸Ŋ']), + ('á¸Ŋ', &['á¸ŧ']), + ('Ḟ', &['á¸ŋ']), + ('á¸ŋ', &['Ḟ']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš
']), + ('áš
', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš', &['áš']), + ('áš ', &['ᚥ', 'áē']), + ('ᚥ', &['áš ', 'áē']), + ('ášĸ', &['ášŖ']), + ('ášŖ', &['ášĸ']), + ('ᚤ', &['ášĨ']), + ('ášĨ', &['ᚤ']), + ('ášĻ', &['ᚧ']), + ('ᚧ', &['ášĻ']), + ('ᚨ', &['ᚊ']), + ('ᚊ', &['ᚨ']), + ('ášĒ', &['ášĢ']), + ('ášĢ', &['ášĒ']), + ('ášŦ', &['áš']), + ('áš', &['ášŦ']), + ('ᚎ', &['ᚯ']), + ('ᚯ', &['ᚎ']), + ('áš°', &['ášą']), + ('ášą', &['áš°']), + ('ᚲ', &['ášŗ']), + ('ášŗ', &['ᚲ']), + ('áš´', &['ášĩ']), + ('ášĩ', &['áš´']), + ('ášļ', &['ᚡ']), + ('ᚡ', &['ášļ']), + ('ᚸ', &['ášš']), + ('ášš', &['ᚸ']), + ('ášē', &['ášģ']), + ('ášģ', &['ášē']), + ('ášŧ', &['ášŊ']), + ('ášŊ', &['ášŧ']), + ('ášž', &['ášŋ']), + ('ášŋ', &['ášž']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē
']), + ('áē
', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áē']), + ('áē', &['áš ', 'ᚥ']), + ('áē', &['Ã']), + ('áē ', &['áēĄ']), + ('áēĄ', &['áē ']), + ('áēĸ', &['áēŖ']), + ('áēŖ', &['áēĸ']), + ('áē¤', &['áēĨ']), + ('áēĨ', &['áē¤']), + ('áēĻ', &['áē§']), + ('áē§', &['áēĻ']), + ('áē¨', &['áēŠ']), + ('áēŠ', &['áē¨']), + ('áēĒ', &['áēĢ']), + ('áēĢ', &['áēĒ']), + ('áēŦ', &['áē']), + ('áē', &['áēŦ']), + ('áēŽ', &['áē¯']), + ('áē¯', &['áēŽ']), + ('áē°', &['áēą']), + ('áēą', &['áē°']), + ('áē˛', &['áēŗ']), + ('áēŗ', &['áē˛']), + ('áē´', &['áēĩ']), + ('áēĩ', &['áē´']), + ('áēļ', &['áēˇ']), + ('áēˇ', &['áēļ']), + ('áē¸', &['áēš']), + ('áēš', &['áē¸']), + ('áēē', &['áēģ']), + ('áēģ', &['áēē']), + ('áēŧ', &['áēŊ']), + ('áēŊ', &['áēŧ']), + ('áēž', &['áēŋ']), + ('áēŋ', &['áēž']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ
']), + ('áģ
', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ', &['áģ']), + ('áģ ', &['áģĄ']), + ('áģĄ', &['áģ ']), + ('áģĸ', &['áģŖ']), + ('áģŖ', &['áģĸ']), + ('áģ¤', &['áģĨ']), + ('áģĨ', &['áģ¤']), + ('áģĻ', &['áģ§']), + ('áģ§', &['áģĻ']), + ('áģ¨', &['áģŠ']), + ('áģŠ', &['áģ¨']), + ('áģĒ', &['áģĢ']), + ('áģĢ', &['áģĒ']), + ('áģŦ', &['áģ']), + ('áģ', &['áģŦ']), + ('áģŽ', &['áģ¯']), + ('áģ¯', &['áģŽ']), + ('áģ°', &['áģą']), + ('áģą', &['áģ°']), + ('áģ˛', &['áģŗ']), + ('áģŗ', &['áģ˛']), + ('áģ´', &['áģĩ']), + ('áģĩ', &['áģ´']), + ('áģļ', &['áģˇ']), + ('áģˇ', &['áģļ']), + ('áģ¸', &['áģš']), + ('áģš', &['áģ¸']), + ('áģē', &['áģģ']), + ('áģģ', &['áģē']), + ('áģŧ', &['áģŊ']), + ('áģŊ', &['áģŧ']), + ('áģž', &['áģŋ']), + ('áģŋ', &['áģž']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ
', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ
']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ', &['áŧ']), + ('áŧ ', &['áŧ¨']), + ('áŧĄ', &['áŧŠ']), + ('áŧĸ', &['áŧĒ']), + ('áŧŖ', &['áŧĢ']), + ('áŧ¤', &['áŧŦ']), + ('áŧĨ', &['áŧ']), + ('áŧĻ', &['áŧŽ']), + ('áŧ§', &['áŧ¯']), + ('áŧ¨', &['áŧ ']), + ('áŧŠ', &['áŧĄ']), + ('áŧĒ', &['áŧĸ']), + ('áŧĢ', &['áŧŖ']), + ('áŧŦ', &['áŧ¤']), + ('áŧ', &['áŧĨ']), + ('áŧŽ', &['áŧĻ']), + ('áŧ¯', &['áŧ§']), + ('áŧ°', &['áŧ¸']), + ('áŧą', &['áŧš']), + ('áŧ˛', &['áŧē']), + ('áŧŗ', &['áŧģ']), + ('áŧ´', &['áŧŧ']), + ('áŧĩ', &['áŧŊ']), + ('áŧļ', &['áŧž']), + ('áŧˇ', &['áŧŋ']), + ('áŧ¸', &['áŧ°']), + ('áŧš', &['áŧą']), + ('áŧē', &['áŧ˛']), + ('áŧģ', &['áŧŗ']), + ('áŧŧ', &['áŧ´']), + ('áŧŊ', &['áŧĩ']), + ('áŧž', &['áŧļ']), + ('áŧŋ', &['áŧˇ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ
', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ
']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ', &['áŊ']), + ('áŊ ', &['áŊ¨']), + ('áŊĄ', &['áŊŠ']), + ('áŊĸ', &['áŊĒ']), + ('áŊŖ', &['áŊĢ']), + ('áŊ¤', &['áŊŦ']), + ('áŊĨ', &['áŊ']), + ('áŊĻ', &['áŊŽ']), + ('áŊ§', &['áŊ¯']), + ('áŊ¨', &['áŊ ']), + ('áŊŠ', &['áŊĄ']), + ('áŊĒ', &['áŊĸ']), + ('áŊĢ', &['áŊŖ']), + ('áŊŦ', &['áŊ¤']), + ('áŊ', &['áŊĨ']), + ('áŊŽ', &['áŊĻ']), + ('áŊ¯', &['áŊ§']), + ('áŊ°', &['ážē']), + ('áŊą', &['ážģ']), + ('áŊ˛', &['áŋ']), + ('áŊŗ', &['áŋ']), + ('áŊ´', &['áŋ']), + ('áŊĩ', &['áŋ']), + ('áŊļ', &['áŋ']), + ('áŊˇ', &['áŋ']), + ('áŊ¸', &['áŋ¸']), + ('áŊš', &['áŋš']), + ('áŊē', &['áŋĒ']), + ('áŊģ', &['áŋĢ']), + ('áŊŧ', &['áŋē']), + ('áŊŊ', &['áŋģ']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž
', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž
']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž', &['áž']), + ('áž ', &['ឨ']), + ('ឥ', &['ដ']), + ('ážĸ', &['ážĒ']), + ('ážŖ', &['ážĢ']), + ('ឤ', &['ážŦ']), + ('ážĨ', &['áž']), + ('ážĻ', &['ណ']), + ('ឧ', &['ឯ']), + ('ឨ', &['áž ']), + ('ដ', &['ឥ']), + ('ážĒ', &['ážĸ']), + ('ážĢ', &['ážŖ']), + ('ážŦ', &['ឤ']), + ('áž', &['ážĨ']), + ('ណ', &['ážĻ']), + ('ឯ', &['ឧ']), + ('áž°', &['ី']), + ('ážą', &['ážš']), + ('ážŗ', &['ážŧ']), + ('ី', &['áž°']), + ('ážš', &['ážą']), + ('ážē', &['áŊ°']), + ('ážģ', &['áŊą']), + ('ážŧ', &['ážŗ']), + ('ážž', &['\u{345}', 'Î', 'Κ']), + ('áŋ', &['áŋ']), + ('áŋ', &['áŊ˛']), + ('áŋ', &['áŊŗ']), + ('áŋ', &['áŊ´']), + ('áŋ', &['áŊĩ']), + ('áŋ', &['áŋ']), + ('áŋ', &['áŋ']), + ('áŋ', &['áŋ']), + ('áŋ', &['áŋ']), + ('áŋ', &['áŋ']), + ('áŋ', &['áŊļ']), + ('áŋ', &['áŊˇ']), + ('áŋ ', &['áŋ¨']), + ('áŋĄ', &['áŋŠ']), + ('áŋĨ', &['áŋŦ']), + ('áŋ¨', &['áŋ ']), + ('áŋŠ', &['áŋĄ']), + ('áŋĒ', &['áŊē']), + ('áŋĢ', &['áŊģ']), + ('áŋŦ', &['áŋĨ']), + ('áŋŗ', &['áŋŧ']), + ('áŋ¸', &['áŊ¸']), + ('áŋš', &['áŊš']), + ('áŋē', &['áŊŧ']), + ('áŋģ', &['áŊŊ']), + ('áŋŧ', &['áŋŗ']), + ('âĻ', &['Ί', 'Ī']), + ('âĒ', &['K', 'k']), + ('âĢ', &['Ã
', 'ÃĨ']), + ('â˛', &['â
']), + ('â
', &['â˛']), + ('â
', &['â
°']), + ('â
Ą', &['â
ą']), + ('â
ĸ', &['â
˛']), + ('â
Ŗ', &['â
ŗ']), + ('â
¤', &['â
´']), + ('â
Ĩ', &['â
ĩ']), + ('â
Ļ', &['â
ļ']), + ('â
§', &['â
ˇ']), + ('â
¨', &['â
¸']), + ('â
Š', &['â
š']), + ('â
Ē', &['â
ē']), + ('â
Ģ', &['â
ģ']), + ('â
Ŧ', &['â
ŧ']), + ('â
', &['â
Ŋ']), + ('â
Ž', &['â
ž']), + ('â
¯', &['â
ŋ']), + ('â
°', &['â
']), + ('â
ą', &['â
Ą']), + ('â
˛', &['â
ĸ']), + ('â
ŗ', &['â
Ŗ']), + ('â
´', &['â
¤']), + ('â
ĩ', &['â
Ĩ']), + ('â
ļ', &['â
Ļ']), + ('â
ˇ', &['â
§']), + ('â
¸', &['â
¨']), + ('â
š', &['â
Š']), + ('â
ē', &['â
Ē']), + ('â
ģ', &['â
Ģ']), + ('â
ŧ', &['â
Ŧ']), + ('â
Ŋ', &['â
']), + ('â
ž', &['â
Ž']), + ('â
ŋ', &['â
¯']), + ('â', &['â']), + ('â', &['â']), + ('âļ', &['â']), + ('âˇ', &['â']), + ('â¸', &['â']), + ('âš', &['â']), + ('âē', &['â']), + ('âģ', &['â']), + ('âŧ', &['â']), + ('âŊ', &['â']), + ('âž', &['â']), + ('âŋ', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â
', &['â']), + ('â', &['â ']), + ('â', &['âĄ']), + ('â', &['âĸ']), + ('â', &['âŖ']), + ('â', &['â¤']), + ('â', &['âĨ']), + ('â', &['âĻ']), + ('â', &['â§']), + ('â', &['â¨']), + ('â', &['âŠ']), + ('â', &['âļ']), + ('â', &['âˇ']), + ('â', &['â¸']), + ('â', &['âš']), + ('â', &['âē']), + ('â', &['âģ']), + ('â', &['âŧ']), + ('â', &['âŊ']), + ('â', &['âž']), + ('â', &['âŋ']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â']), + ('â', &['â
']), + ('â ', &['â']), + ('âĄ', &['â']), + ('âĸ', &['â']), + ('âŖ', &['â']), + ('â¤', &['â']), + ('âĨ', &['â']), + ('âĻ', &['â']), + ('â§', &['â']), + ('â¨', &['â']), + ('âŠ', &['â']), + ('â°', &['â°°']), + ('â°', &['â°ą']), + ('â°', &['â°˛']), + ('â°', &['â°ŗ']), + ('â°', &['â°´']), + ('â°
', &['â°ĩ']), + ('â°', &['â°ļ']), + ('â°', &['â°ˇ']), + ('â°', &['â°¸']), + ('â°', &['â°š']), + ('â°', &['â°ē']), + ('â°', &['â°ģ']), + ('â°', &['â°ŧ']), + ('â°', &['â°Ŋ']), + ('â°', &['â°ž']), + ('â°', &['â°ŋ']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą
']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â°', &['âą']), + ('â° ', &['âą']), + ('â°Ą', &['âą']), + ('â°ĸ', &['âą']), + ('â°Ŗ', &['âą']), + ('â°¤', &['âą']), + ('â°Ĩ', &['âą']), + ('â°Ļ', &['âą']), + ('â°§', &['âą']), + ('â°¨', &['âą']), + ('â°Š', &['âą']), + ('â°Ē', &['âą']), + ('â°Ģ', &['âą']), + ('â°Ŧ', &['âą']), + ('â°', &['âą']), + ('â°Ž', &['âą']), + ('â°°', &['â°']), + ('â°ą', &['â°']), + ('â°˛', &['â°']), + ('â°ŗ', &['â°']), + ('â°´', &['â°']), + ('â°ĩ', &['â°
']), + ('â°ļ', &['â°']), + ('â°ˇ', &['â°']), + ('â°¸', &['â°']), + ('â°š', &['â°']), + ('â°ē', &['â°']), + ('â°ģ', &['â°']), + ('â°ŧ', &['â°']), + ('â°Ŋ', &['â°']), + ('â°ž', &['â°']), + ('â°ŋ', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą
', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â°']), + ('âą', &['â° ']), + ('âą', &['â°Ą']), + ('âą', &['â°ĸ']), + ('âą', &['â°Ŗ']), + ('âą', &['â°¤']), + ('âą', &['â°Ĩ']), + ('âą', &['â°Ļ']), + ('âą', &['â°§']), + ('âą', &['â°¨']), + ('âą', &['â°Š']), + ('âą', &['â°Ē']), + ('âą', &['â°Ģ']), + ('âą', &['â°Ŧ']), + ('âą', &['â°']), + ('âą', &['â°Ž']), + ('âą ', &['⹥']), + ('⹥', &['âą ']), + ('âąĸ', &['ÉĢ']), + ('âąŖ', &['áĩŊ']), + ('⹤', &['ÉŊ']), + ('âąĨ', &['Čē']), + ('âąĻ', &['Čž']), + ('⹧', &['⹨']), + ('⹨', &['⹧']), + ('⹊', &['âąĒ']), + ('âąĒ', &['⹊']), + ('âąĢ', &['âąŦ']), + ('âąŦ', &['âąĢ']), + ('âą', &['É']), + ('⹎', &['Éą']), + ('âą¯', &['É']), + ('âą°', &['É']), + ('⹲', &['âąŗ']), + ('âąŗ', &['⹲']), + ('âąĩ', &['âąļ']), + ('âąļ', &['âąĩ']), + ('âąž', &['Čŋ']), + ('âąŋ', &['É']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛
']), + ('â˛
', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('â˛', &['â˛']), + ('Ⲡ', &['ⲥ']), + ('ⲥ', &['Ⲡ']), + ('â˛ĸ', &['â˛Ŗ']), + ('â˛Ŗ', &['â˛ĸ']), + ('Ⲥ', &['â˛Ĩ']), + ('â˛Ĩ', &['Ⲥ']), + ('â˛Ļ', &['ⲧ']), + ('ⲧ', &['â˛Ļ']), + ('Ⲩ', &['Ⲋ']), + ('Ⲋ', &['Ⲩ']), + ('â˛Ē', &['â˛Ģ']), + ('â˛Ģ', &['â˛Ē']), + ('â˛Ŧ', &['â˛']), + ('â˛', &['â˛Ŧ']), + ('Ⲏ', &['â˛¯']), + ('â˛¯', &['Ⲏ']), + ('Ⲱ', &['ⲹ']), + ('ⲹ', &['Ⲱ']), + ('Ⲳ', &['â˛ŗ']), + ('â˛ŗ', &['Ⲳ']), + ('Ⲵ', &['â˛ĩ']), + ('â˛ĩ', &['Ⲵ']), + ('â˛ļ', &['ⲡ']), + ('ⲡ', &['â˛ļ']), + ('Ⲹ', &['Ⲛ']), + ('Ⲛ', &['Ⲹ']), + ('â˛ē', &['â˛ģ']), + ('â˛ģ', &['â˛ē']), + ('â˛ŧ', &['â˛Ŋ']), + ('â˛Ŋ', &['â˛ŧ']), + ('Ⲟ', &['â˛ŋ']), + ('â˛ŋ', &['Ⲟ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ
']), + ('âŗ
', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ', &['âŗ']), + ('âŗ ', &['âŗĄ']), + ('âŗĄ', &['âŗ ']), + ('âŗĸ', &['âŗŖ']), + ('âŗŖ', &['âŗĸ']), + ('âŗĢ', &['âŗŦ']), + ('âŗŦ', &['âŗĢ']), + ('âŗ', &['âŗŽ']), + ('âŗŽ', &['âŗ']), + ('âŗ˛', &['âŗŗ']), + ('âŗŗ', &['âŗ˛']), + ('â´', &['á ']), + ('â´', &['áĄ']), + ('â´', &['áĸ']), + ('â´', &['áŖ']), + ('â´', &['á¤']), + ('â´
', &['áĨ']), + ('â´', &['áĻ']), + ('â´', &['á§']), + ('â´', &['á¨']), + ('â´', &['áŠ']), + ('â´', &['áĒ']), + ('â´', &['áĢ']), + ('â´', &['áŦ']), + ('â´', &['á']), + ('â´', &['áŽ']), + ('â´', &['á¯']), + ('â´', &['á°']), + ('â´', &['áą']), + ('â´', &['á˛']), + ('â´', &['áŗ']), + ('â´', &['á´']), + ('â´', &['áĩ']), + ('â´', &['áļ']), + ('â´', &['áˇ']), + ('â´', &['á¸']), + ('â´', &['áš']), + ('â´', &['áē']), + ('â´', &['áģ']), + ('â´', &['áŧ']), + ('â´', &['áŊ']), + ('â´', &['áž']), + ('â´', &['áŋ']), + ('â´ ', &['á']), + ('â´Ą', &['á']), + ('â´ĸ', &['á']), + ('â´Ŗ', &['á']), + ('â´¤', &['á']), + ('â´Ĩ', &['á
']), + ('â´§', &['á']), + ('â´', &['á']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę
']), + ('ę
', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['á˛', 'ę']), + ('ę', &['á˛', 'ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę ', &['ęĄ']), + ('ęĄ', &['ę ']), + ('ęĸ', &['ęŖ']), + ('ęŖ', &['ęĸ']), + ('ę¤', &['ęĨ']), + ('ęĨ', &['ę¤']), + ('ęĻ', &['ę§']), + ('ę§', &['ęĻ']), + ('ę¨', &['ęŠ']), + ('ęŠ', &['ę¨']), + ('ęĒ', &['ęĢ']), + ('ęĢ', &['ęĒ']), + ('ęŦ', &['ę']), + ('ę', &['ęŦ']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę
']), + ('ę
', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ęĸ', &['ęŖ']), + ('ęŖ', &['ęĸ']), + ('ę¤', &['ęĨ']), + ('ęĨ', &['ę¤']), + ('ęĻ', &['ę§']), + ('ę§', &['ęĻ']), + ('ę¨', &['ęŠ']), + ('ęŠ', &['ę¨']), + ('ęĒ', &['ęĢ']), + ('ęĢ', &['ęĒ']), + ('ęŦ', &['ę']), + ('ę', &['ęŦ']), + ('ęŽ', &['ę¯']), + ('ę¯', &['ęŽ']), + ('ę˛', &['ęŗ']), + ('ęŗ', &['ę˛']), + ('ę´', &['ęĩ']), + ('ęĩ', &['ę´']), + ('ęļ', &['ęˇ']), + ('ęˇ', &['ęļ']), + ('ę¸', &['ęš']), + ('ęš', &['ę¸']), + ('ęē', &['ęģ']), + ('ęģ', &['ęē']), + ('ęŧ', &['ęŊ']), + ('ęŊ', &['ęŧ']), + ('ęž', &['ęŋ']), + ('ęŋ', &['ęž']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę
']), + ('ę
', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę ', &['ęĄ']), + ('ęĄ', &['ę ']), + ('ęĸ', &['ęŖ']), + ('ęŖ', &['ęĸ']), + ('ę¤', &['ęĨ']), + ('ęĨ', &['ę¤']), + ('ęĻ', &['ę§']), + ('ę§', &['ęĻ']), + ('ę¨', &['ęŠ']), + ('ęŠ', &['ę¨']), + ('ęĒ', &['ęĢ']), + ('ęĢ', &['ęĒ']), + ('ęŦ', &['ę']), + ('ę', &['ęŦ']), + ('ęŽ', &['ę¯']), + ('ę¯', &['ęŽ']), + ('ęš', &['ęē']), + ('ęē', &['ęš']), + ('ęģ', &['ęŧ']), + ('ęŧ', &['ęģ']), + ('ęŊ', &['áĩš']), + ('ęž', &['ęŋ']), + ('ęŋ', &['ęž']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę
']), + ('ę
', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ÉĨ']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę ', &['ęĄ']), + ('ęĄ', &['ę ']), + ('ęĸ', &['ęŖ']), + ('ęŖ', &['ęĸ']), + ('ę¤', &['ęĨ']), + ('ęĨ', &['ę¤']), + ('ęĻ', &['ę§']), + ('ę§', &['ęĻ']), + ('ę¨', &['ęŠ']), + ('ęŠ', &['ę¨']), + ('ęĒ', &['ÉĻ']), + ('ęĢ', &['É']), + ('ęŦ', &['ÉĄ']), + ('ę', &['ÉŦ']), + ('ęŽ', &['ÉĒ']), + ('ę°', &['Ę']), + ('ęą', &['Ę']), + ('ę˛', &['Ę']), + ('ęŗ', &['ę']), + ('ę´', &['ęĩ']), + ('ęĩ', &['ę´']), + ('ęļ', &['ęˇ']), + ('ęˇ', &['ęļ']), + ('ę¸', &['ęš']), + ('ęš', &['ę¸']), + ('ęē', &['ęģ']), + ('ęģ', &['ęē']), + ('ęŧ', &['ęŊ']), + ('ęŊ', &['ęŧ']), + ('ęž', &['ęŋ']), + ('ęŋ', &['ęž']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę', &['ę']), + ('ę
', &['Ę']), + ('ę', &['áļ']), + ('\u{a7c7}', &['\u{a7c8}']), + ('\u{a7c8}', &['\u{a7c7}']), + ('\u{a7c9}', &['\u{a7ca}']), + ('\u{a7ca}', &['\u{a7c9}']), + ('\u{a7f5}', &['\u{a7f6}']), + ('\u{a7f6}', &['\u{a7f5}']), + ('ę', &['ęŗ']), + ('ę°', &['á ']), + ('ęą', &['áĄ']), + ('ę˛', &['áĸ']), + ('ęŗ', &['áŖ']), + ('ę´', &['á¤']), + ('ęĩ', &['áĨ']), + ('ęļ', &['áĻ']), + ('ęˇ', &['á§']), + ('ę¸', &['á¨']), + ('ęš', &['áŠ']), + ('ęē', &['áĒ']), + ('ęģ', &['áĢ']), + ('ęŧ', &['áŦ']), + ('ęŊ', &['á']), + ('ęž', &['áŽ']), + ('ęŋ', &['á¯']), + ('ęŽ', &['á°']), + ('ęŽ', &['áą']), + ('ęŽ', &['á˛']), + ('ęŽ', &['áŗ']), + ('ęŽ', &['á´']), + ('ęŽ
', &['áĩ']), + ('ęŽ', &['áļ']), + ('ęŽ', &['áˇ']), + ('ęŽ', &['á¸']), + ('ęŽ', &['áš']), + ('ęŽ', &['áē']), + ('ęŽ', &['áģ']), + ('ęŽ', &['áŧ']), + ('ęŽ', &['áŊ']), + ('ęŽ', &['áž']), + ('ęŽ', &['áŋ']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á
']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ', &['á']), + ('ęŽ ', &['á']), + ('ꎥ', &['á']), + ('ęŽĸ', &['á']), + ('ęŽŖ', &['á']), + ('ꎤ', &['á']), + ('ęŽĨ', &['á']), + ('ęŽĻ', &['á']), + ('ꎧ', &['á']), + ('ꎨ', &['á']), + ('ꎊ', &['á']), + ('ęŽĒ', &['á']), + ('ęŽĢ', &['á']), + ('ęŽŦ', &['á']), + ('ęŽ', &['á']), + ('ꎎ', &['á']), + ('ęŽ¯', &['á']), + ('ꎰ', &['á ']), + ('ꎹ', &['áĄ']), + ('ꎲ', &['áĸ']), + ('ęŽŗ', &['áŖ']), + ('ꎴ', &['á¤']), + ('ęŽĩ', &['áĨ']), + ('ęŽļ', &['áĻ']), + ('ꎡ', &['á§']), + ('ꎸ', &['á¨']), + ('ꎚ', &['áŠ']), + ('ęŽē', &['áĒ']), + ('ęŽģ', &['áĢ']), + ('ęŽŧ', &['áŦ']), + ('ęŽŊ', &['á']), + ('ꎞ', &['áŽ']), + ('ęŽŋ', &['á¯']), + ('īŧĄ', &['īŊ']), + ('īŧĸ', &['īŊ']), + ('īŧŖ', &['īŊ']), + ('īŧ¤', &['īŊ']), + ('īŧĨ', &['īŊ
']), + ('īŧĻ', &['īŊ']), + ('īŧ§', &['īŊ']), + ('īŧ¨', &['īŊ']), + ('īŧŠ', &['īŊ']), + ('īŧĒ', &['īŊ']), + ('īŧĢ', &['īŊ']), + ('īŧŦ', &['īŊ']), + ('īŧ', &['īŊ']), + ('īŧŽ', &['īŊ']), + ('īŧ¯', &['īŊ']), + ('īŧ°', &['īŊ']), + ('īŧą', &['īŊ']), + ('īŧ˛', &['īŊ']), + ('īŧŗ', &['īŊ']), + ('īŧ´', &['īŊ']), + ('īŧĩ', &['īŊ']), + ('īŧļ', &['īŊ']), + ('īŧˇ', &['īŊ']), + ('īŧ¸', &['īŊ']), + ('īŧš', &['īŊ']), + ('īŧē', &['īŊ']), + ('īŊ', &['īŧĄ']), + ('īŊ', &['īŧĸ']), + ('īŊ', &['īŧŖ']), + ('īŊ', &['īŧ¤']), + ('īŊ
', &['īŧĨ']), + ('īŊ', &['īŧĻ']), + ('īŊ', &['īŧ§']), + ('īŊ', &['īŧ¨']), + ('īŊ', &['īŧŠ']), + ('īŊ', &['īŧĒ']), + ('īŊ', &['īŧĢ']), + ('īŊ', &['īŧŦ']), + ('īŊ', &['īŧ']), + ('īŊ', &['īŧŽ']), + ('īŊ', &['īŧ¯']), + ('īŊ', &['īŧ°']), + ('īŊ', &['īŧą']), + ('īŊ', &['īŧ˛']), + ('īŊ', &['īŧŗ']), + ('īŊ', &['īŧ´']), + ('īŊ', &['īŧĩ']), + ('īŊ', &['īŧļ']), + ('īŊ', &['īŧˇ']), + ('īŊ', &['īŧ¸']), + ('īŊ', &['īŧš']), + ('īŊ', &['īŧē']), + ('đ', &['đ¨']), + ('đ', &['đŠ']), + ('đ', &['đĒ']), + ('đ', &['đĢ']), + ('đ', &['đŦ']), + ('đ
', &['đ']), + ('đ', &['đŽ']), + ('đ', &['đ¯']), + ('đ', &['đ°']), + ('đ', &['đą']), + ('đ', &['đ˛']), + ('đ', &['đŗ']), + ('đ', &['đ´']), + ('đ', &['đĩ']), + ('đ', &['đļ']), + ('đ', &['đˇ']), + ('đ', &['đ¸']), + ('đ', &['đš']), + ('đ', &['đē']), + ('đ', &['đģ']), + ('đ', &['đŧ']), + ('đ', &['đŊ']), + ('đ', &['đž']), + ('đ', &['đŋ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ
']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ ', &['đ']), + ('đĄ', &['đ']), + ('đĸ', &['đ']), + ('đŖ', &['đ']), + ('đ¤', &['đ']), + ('đĨ', &['đ']), + ('đĻ', &['đ']), + ('đ§', &['đ']), + ('đ¨', &['đ']), + ('đŠ', &['đ']), + ('đĒ', &['đ']), + ('đĢ', &['đ']), + ('đŦ', &['đ']), + ('đ', &['đ
']), + ('đŽ', &['đ']), + ('đ¯', &['đ']), + ('đ°', &['đ']), + ('đą', &['đ']), + ('đ˛', &['đ']), + ('đŗ', &['đ']), + ('đ´', &['đ']), + ('đĩ', &['đ']), + ('đļ', &['đ']), + ('đˇ', &['đ']), + ('đ¸', &['đ']), + ('đš', &['đ']), + ('đē', &['đ']), + ('đģ', &['đ']), + ('đŧ', &['đ']), + ('đŊ', &['đ']), + ('đž', &['đ']), + ('đŋ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ
', &['đ']), + ('đ', &['đ']), + ('đ', &['đ']), + ('đ', &['đ ']), + ('đ', &['đĄ']), + ('đ', &['đĸ']), + ('đ', &['đŖ']), + ('đ', &['đ¤']), + ('đ', &['đĨ']), + ('đ', &['đĻ']), + ('đ', &['đ§']), + ('đ°', &['đ']), + ('đą', &['đ']), + ('đ˛', &['đ']), + ('đŗ', &['đ']), + ('đ´', &['đ']), + ('đĩ', &['đ']), + ('đļ', &['đ']), + ('đˇ', &['đ']), + ('đ¸', &['đ ']), + ('đš', &['đĄ']), + ('đē', &['đĸ']), + ('đģ', &['đŖ']), + ('đŧ', &['đ¤']), + ('đŊ', &['đĨ']), + ('đž', &['đĻ']), + ('đŋ', &['đ§']), + ('đ', &['đ¨']), + ('đ', &['đŠ']), + ('đ', &['đĒ']), + ('đ', &['đĢ']), + ('đ', &['đŦ']), + ('đ
', &['đ']), + ('đ', &['đŽ']), + ('đ', &['đ¯']), + ('đ', &['đ°']), + ('đ', &['đą']), + ('đ', &['đ˛']), + ('đ', &['đŗ']), + ('đ', &['đ´']), + ('đ', &['đĩ']), + ('đ', &['đļ']), + ('đ', &['đˇ']), + ('đ', &['đ¸']), + ('đ', &['đš']), + ('đ', &['đē']), + ('đ', &['đģ']), + ('đ', &['đ°']), + ('đ', &['đą']), + ('đ', &['đ˛']), + ('đ', &['đŗ']), + ('đ', &['đ´']), + ('đ', &['đĩ']), + ('đ', &['đļ']), + ('đ', &['đˇ']), + ('đ ', &['đ¸']), + ('đĄ', &['đš']), + ('đĸ', &['đē']), + ('đŖ', &['đģ']), + ('đ¤', &['đŧ']), + ('đĨ', &['đŊ']), + ('đĻ', &['đž']), + ('đ§', &['đŋ']), + ('đ¨', &['đ']), + ('đŠ', &['đ']), + ('đĒ', &['đ']), + ('đĢ', &['đ']), + ('đŦ', &['đ']), + ('đ', &['đ
']), + ('đŽ', &['đ']), + ('đ¯', &['đ']), + ('đ°', &['đ']), + ('đą', &['đ']), + ('đ˛', &['đ']), + ('đŗ', &['đ']), + ('đ´', &['đ']), + ('đĩ', &['đ']), + ('đļ', &['đ']), + ('đˇ', &['đ']), + ('đ¸', &['đ']), + ('đš', &['đ']), + ('đē', &['đ']), + ('đģ', &['đ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛
', &['đŗ
']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛', &['đŗ']), + ('đ˛ ', &['đŗ ']), + ('đ˛Ą', &['đŗĄ']), + ('đ˛ĸ', &['đŗĸ']), + ('đ˛Ŗ', &['đŗŖ']), + ('đ˛¤', &['đŗ¤']), + ('đ˛Ĩ', &['đŗĨ']), + ('đ˛Ļ', &['đŗĻ']), + ('đ˛§', &['đŗ§']), + ('đ˛¨', &['đŗ¨']), + ('đ˛Š', &['đŗŠ']), + ('đ˛Ē', &['đŗĒ']), + ('đ˛Ģ', &['đŗĢ']), + ('đ˛Ŧ', &['đŗŦ']), + ('đ˛', &['đŗ']), + ('đ˛Ž', &['đŗŽ']), + ('đ˛¯', &['đŗ¯']), + ('đ˛°', &['đŗ°']), + ('đ˛ą', &['đŗą']), + ('đ˛˛', &['đŗ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ
', &['đ˛
']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ', &['đ˛']), + ('đŗ ', &['đ˛ ']), + ('đŗĄ', &['đ˛Ą']), + ('đŗĸ', &['đ˛ĸ']), + ('đŗŖ', &['đ˛Ŗ']), + ('đŗ¤', &['đ˛¤']), + ('đŗĨ', &['đ˛Ĩ']), + ('đŗĻ', &['đ˛Ļ']), + ('đŗ§', &['đ˛§']), + ('đŗ¨', &['đ˛¨']), + ('đŗŠ', &['đ˛Š']), + ('đŗĒ', &['đ˛Ē']), + ('đŗĢ', &['đ˛Ģ']), + ('đŗŦ', &['đ˛Ŧ']), + ('đŗ', &['đ˛']), + ('đŗŽ', &['đ˛Ž']), + ('đŗ¯', &['đ˛¯']), + ('đŗ°', &['đ˛°']), + ('đŗą', &['đ˛ą']), + ('đŗ˛', &['đ˛˛']), + ('đĸ ', &['đŖ']), + ('đĸĄ', &['đŖ']), + ('đĸĸ', &['đŖ']), + ('đĸŖ', &['đŖ']), + ('đĸ¤', &['đŖ']), + ('đĸĨ', &['đŖ
']), + ('đĸĻ', &['đŖ']), + ('đĸ§', &['đŖ']), + ('đĸ¨', &['đŖ']), + ('đĸŠ', &['đŖ']), + ('đĸĒ', &['đŖ']), + ('đĸĢ', &['đŖ']), + ('đĸŦ', &['đŖ']), + ('đĸ', &['đŖ']), + ('đĸŽ', &['đŖ']), + ('đĸ¯', &['đŖ']), + ('đĸ°', &['đŖ']), + ('đĸą', &['đŖ']), + ('đĸ˛', &['đŖ']), + ('đĸŗ', &['đŖ']), + ('đĸ´', &['đŖ']), + ('đĸĩ', &['đŖ']), + ('đĸļ', &['đŖ']), + ('đĸˇ', &['đŖ']), + ('đĸ¸', &['đŖ']), + ('đĸš', &['đŖ']), + ('đĸē', &['đŖ']), + ('đĸģ', &['đŖ']), + ('đĸŧ', &['đŖ']), + ('đĸŊ', &['đŖ']), + ('đĸž', &['đŖ']), + ('đĸŋ', &['đŖ']), + ('đŖ', &['đĸ ']), + ('đŖ', &['đĸĄ']), + ('đŖ', &['đĸĸ']), + ('đŖ', &['đĸŖ']), + ('đŖ', &['đĸ¤']), + ('đŖ
', &['đĸĨ']), + ('đŖ', &['đĸĻ']), + ('đŖ', &['đĸ§']), + ('đŖ', &['đĸ¨']), + ('đŖ', &['đĸŠ']), + ('đŖ', &['đĸĒ']), + ('đŖ', &['đĸĢ']), + ('đŖ', &['đĸŦ']), + ('đŖ', &['đĸ']), + ('đŖ', &['đĸŽ']), + ('đŖ', &['đĸ¯']), + ('đŖ', &['đĸ°']), + ('đŖ', &['đĸą']), + ('đŖ', &['đĸ˛']), + ('đŖ', &['đĸŗ']), + ('đŖ', &['đĸ´']), + ('đŖ', &['đĸĩ']), + ('đŖ', &['đĸļ']), + ('đŖ', &['đĸˇ']), + ('đŖ', &['đĸ¸']), + ('đŖ', &['đĸš']), + ('đŖ', &['đĸē']), + ('đŖ', &['đĸģ']), + ('đŖ', &['đĸŧ']), + ('đŖ', &['đĸŊ']), + ('đŖ', &['đĸž']), + ('đŖ', &['đĸŋ']), + ('đš', &['đš ']), + ('đš', &['đšĄ']), + ('đš', &['đšĸ']), + ('đš', &['đšŖ']), + ('đš', &['đš¤']), + ('đš
', &['đšĨ']), + ('đš', &['đšĻ']), + ('đš', &['đš§']), + ('đš', &['đš¨']), + ('đš', &['đšŠ']), + ('đš', &['đšĒ']), + ('đš', &['đšĢ']), + ('đš', &['đšŦ']), + ('đš', &['đš']), + ('đš', &['đšŽ']), + ('đš', &['đš¯']), + ('đš', &['đš°']), + ('đš', &['đšą']), + ('đš', &['đš˛']), + ('đš', &['đšŗ']), + ('đš', &['đš´']), + ('đš', &['đšĩ']), + ('đš', &['đšļ']), + ('đš', &['đšˇ']), + ('đš', &['đš¸']), + ('đš', &['đšš']), + ('đš', &['đšē']), + ('đš', &['đšģ']), + ('đš', &['đšŧ']), + ('đš', &['đšŊ']), + ('đš', &['đšž']), + ('đš', &['đšŋ']), + ('đš ', &['đš']), + ('đšĄ', &['đš']), + ('đšĸ', &['đš']), + ('đšŖ', &['đš']), + ('đš¤', &['đš']), + ('đšĨ', &['đš
']), + ('đšĻ', &['đš']), + ('đš§', &['đš']), + ('đš¨', &['đš']), + ('đšŠ', &['đš']), + ('đšĒ', &['đš']), + ('đšĢ', &['đš']), + ('đšŦ', &['đš']), + ('đš', &['đš']), + ('đšŽ', &['đš']), + ('đš¯', &['đš']), + ('đš°', &['đš']), + ('đšą', &['đš']), + ('đš˛', &['đš']), + ('đšŗ', &['đš']), + ('đš´', &['đš']), + ('đšĩ', &['đš']), + ('đšļ', &['đš']), + ('đšˇ', &['đš']), + ('đš¸', &['đš']), + ('đšš', &['đš']), + ('đšē', &['đš']), + ('đšģ', &['đš']), + ('đšŧ', &['đš']), + ('đšŊ', &['đš']), + ('đšž', &['đš']), + ('đšŋ', &['đš']), + ('đ¤', &['đ¤ĸ']), + ('đ¤', &['đ¤Ŗ']), + ('đ¤', &['đ¤¤']), + ('đ¤', &['đ¤Ĩ']), + ('đ¤', &['đ¤Ļ']), + ('đ¤
', &['đ¤§']), + ('đ¤', &['đ¤¨']), + ('đ¤', &['đ¤Š']), + ('đ¤', &['đ¤Ē']), + ('đ¤', &['đ¤Ģ']), + ('đ¤', &['đ¤Ŧ']), + ('đ¤', &['đ¤']), + ('đ¤', &['đ¤Ž']), + ('đ¤', &['đ¤¯']), + ('đ¤', &['đ¤°']), + ('đ¤', &['đ¤ą']), + ('đ¤', &['đ¤˛']), + ('đ¤', &['đ¤ŗ']), + ('đ¤', &['đ¤´']), + ('đ¤', &['đ¤ĩ']), + ('đ¤', &['đ¤ļ']), + ('đ¤', &['đ¤ˇ']), + ('đ¤', &['đ¤¸']), + ('đ¤', &['đ¤š']), + ('đ¤', &['đ¤ē']), + ('đ¤', &['đ¤ģ']), + ('đ¤', &['đ¤ŧ']), + ('đ¤', &['đ¤Ŋ']), + ('đ¤', &['đ¤ž']), + ('đ¤', &['đ¤ŋ']), + ('đ¤', &['đĨ']), + ('đ¤', &['đĨ']), + ('đ¤ ', &['đĨ']), + ('đ¤Ą', &['đĨ']), + ('đ¤ĸ', &['đ¤']), + ('đ¤Ŗ', &['đ¤']), + ('đ¤¤', &['đ¤']), + ('đ¤Ĩ', &['đ¤']), + ('đ¤Ļ', &['đ¤']), + ('đ¤§', &['đ¤
']), + ('đ¤¨', &['đ¤']), + ('đ¤Š', &['đ¤']), + ('đ¤Ē', &['đ¤']), + ('đ¤Ģ', &['đ¤']), + ('đ¤Ŧ', &['đ¤']), + ('đ¤', &['đ¤']), + ('đ¤Ž', &['đ¤']), + ('đ¤¯', &['đ¤']), + ('đ¤°', &['đ¤']), + ('đ¤ą', &['đ¤']), + ('đ¤˛', &['đ¤']), + ('đ¤ŗ', &['đ¤']), + ('đ¤´', &['đ¤']), + ('đ¤ĩ', &['đ¤']), + ('đ¤ļ', &['đ¤']), + ('đ¤ˇ', &['đ¤']), + ('đ¤¸', &['đ¤']), + ('đ¤š', &['đ¤']), + ('đ¤ē', &['đ¤']), + ('đ¤ģ', &['đ¤']), + ('đ¤ŧ', &['đ¤']), + ('đ¤Ŋ', &['đ¤']), + ('đ¤ž', &['đ¤']), + ('đ¤ŋ', &['đ¤']), + ('đĨ', &['đ¤']), + ('đĨ', &['đ¤']), + ('đĨ', &['đ¤ ']), + ('đĨ', &['đ¤Ą']), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/general_category.rs b/vendor/regex-syntax/src/unicode_tables/general_category.rs new file mode 100644 index 000000000..33b7b7e6e --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/general_category.rs @@ -0,0 +1,6307 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category ucd-13.0.0 --chars --exclude surrogate +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Cased_Letter", CASED_LETTER), + ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), + ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), + ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), + ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), + ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), + ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), + ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), + ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), + ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), + ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), + ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), + ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), + ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), + ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), + ("Spacing_Mark", SPACING_MARK), + ("Symbol", SYMBOL), + ("Titlecase_Letter", TITLECASE_LETTER), + ("Unassigned", UNASSIGNED), + ("Uppercase_Letter", UPPERCASE_LETTER), +]; + +pub const CASED_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('Âĩ', 'Âĩ'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Æē'), + ('Æŧ', 'Æŋ'), + ('Į', 'Ę'), + ('Ę', 'Ę¯'), + ('Í°', 'Íŗ'), + ('Íļ', '͡'), + ('Íģ', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ ', 'Ö'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŊ', 'áŋ'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('á´', 'á´Ģ'), + ('áĩĢ', 'áĩˇ'), + ('áĩš', 'áļ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â¯', 'â´'), + ('âš', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âąģ'), + ('âąž', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ę¯'), + ('ęą', 'ę'), + ('ę', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', '\u{a7f6}'), + ('ęē', 'ęē'), + ('ęŦ°', 'ę'), + ('ę ', '\u{ab68}'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đĸ ', 'đŖ'), + ('đš', 'đšŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ¤', 'đĨ'), +]; + +pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[ + (')', ')'), + (']', ']'), + ('}', '}'), + ('āŧģ', 'āŧģ'), + ('āŧŊ', 'āŧŊ'), + ('á', 'á'), + ('â', 'â'), + ('âž', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĒ', 'âĒ'), + ('âŠ', 'âŠ'), + ('âĢ', 'âĢ'), + ('â', 'â'), + ('â¯', 'â¯'), + ('âą', 'âą'), + ('âŗ', 'âŗ'), + ('âĩ', 'âĩ'), + ('â', 'â'), + ('â§', 'â§'), + ('âŠ', 'âŠ'), + ('âĢ', 'âĢ'), + ('â', 'â'), + ('â¯', 'â¯'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('â§', 'â§'), + ('â§', 'â§'), + ('â§Ŋ', 'â§Ŋ'), + ('â¸Ŗ', 'â¸Ŗ'), + ('â¸Ĩ', 'â¸Ĩ'), + ('⸧', '⸧'), + ('⸊', '⸊'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ī´ž', 'ī´ž'), + ('ī¸', 'ī¸'), + ('ī¸ļ', 'ī¸ļ'), + ('ī¸¸', 'ī¸¸'), + ('ī¸ē', 'ī¸ē'), + ('ī¸ŧ', 'ī¸ŧ'), + ('ī¸ž', 'ī¸ž'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧŊ', 'īŧŊ'), + ('īŊ', 'īŊ'), + ('īŊ ', 'īŊ '), + ('īŊŖ', 'īŊŖ'), +]; + +pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[ + ('_', '_'), + ('âŋ', 'â'), + ('â', 'â'), + ('ī¸ŗ', 'ī¸´'), + ('īš', 'īš'), + ('īŧŋ', 'īŧŋ'), +]; + +pub const CONTROL: &'static [(char, char)] = + &[('\u{0}', '\u{1f}'), ('\u{7f}', '\u{9f}')]; + +pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), + ('Âĸ', 'ÂĨ'), + ('Ö', 'Ö'), + ('Ø', 'Ø'), + ('ßž', 'ßŋ'), + ('ā§˛', 'ā§ŗ'), + ('ā§ģ', 'ā§ģ'), + ('āĢą', 'āĢą'), + ('ā¯š', 'ā¯š'), + ('ā¸ŋ', 'ā¸ŋ'), 
+ ('á', 'á'), + ('â ', 'âŋ'), + ('ę ¸', 'ę ¸'), + ('īˇŧ', 'īˇŧ'), + ('īšŠ', 'īšŠ'), + ('īŧ', 'īŧ'), + ('īŋ ', 'īŋĄ'), + ('īŋĨ', 'īŋĻ'), + ('đŋ', 'đŋ '), + ('đŋ', 'đŋ'), + ('đ˛°', 'đ˛°'), +]; + +pub const DASH_PUNCTUATION: &'static [(char, char)] = &[ + ('-', '-'), + ('Ö', 'Ö'), + ('Öž', 'Öž'), + ('á', 'á'), + ('á ', 'á '), + ('â', 'â'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸ē', 'â¸ģ'), + ('âš', 'âš'), + ('ã', 'ã'), + ('ã°', 'ã°'), + ('ã ', 'ã '), + ('ī¸ą', 'ī¸˛'), + ('īš', 'īš'), + ('īšŖ', 'īšŖ'), + ('īŧ', 'īŧ'), + ('\u{10ead}', '\u{10ead}'), +]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('Ų ', 'ŲŠ'), + ('Û°', 'Ûš'), + ('ß', 'ß'), + ('āĨĻ', 'āĨ¯'), + ('ā§Ļ', 'ā§¯'), + ('āŠĻ', 'āŠ¯'), + ('āĢĻ', 'āĢ¯'), + ('āĻ', 'ā¯'), + ('ā¯Ļ', 'ā¯¯'), + ('āąĻ', 'āą¯'), + ('āŗĻ', 'āŗ¯'), + ('āĩĻ', 'āĩ¯'), + ('āˇĻ', 'āˇ¯'), + ('āš', 'āš'), + ('āģ', 'āģ'), + ('āŧ ', 'āŧŠ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áŠ'), + ('á ', 'á '), + ('áĨ', 'áĨ'), + ('á§', 'á§'), + ('áĒ', 'áĒ'), + ('áĒ', 'áĒ'), + ('á', 'á'), + ('Ꮀ', '᎚'), + ('áą', 'áą'), + ('áą', 'áą'), + ('ę ', 'ęŠ'), + ('ęŖ', 'ęŖ'), + ('ę¤', 'ę¤'), + ('ę§', 'ę§'), + ('꧰', '꧚'), + ('ęŠ', 'ęŠ'), + ('ę¯°', 'ę¯š'), + ('īŧ', 'īŧ'), + ('đ ', 'đŠ'), + ('đ´°', 'đ´š'), + ('đĻ', 'đ¯'), + ('đ°', 'đš'), + ('đļ', 'đŋ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đŖ ', 'đŖŠ'), + ('\u{11950}', '\u{11959}'), + ('đą', 'đą'), + ('đĩ', 'đĩ'), + ('đļ ', 'đļŠ'), + ('đŠ ', 'đŠŠ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('đ
', 'đ
'), + ('đ°', 'đš'), + ('đĨ', 'đĨ'), + ('\u{1fbf0}', '\u{1fbf9}'), +]; + +pub const ENCLOSING_MARK: &'static [(char, char)] = &[ + ('\u{488}', '\u{489}'), + ('\u{1abe}', '\u{1abe}'), + ('\u{20dd}', '\u{20e0}'), + ('\u{20e2}', '\u{20e4}'), + ('\u{a670}', '\u{a672}'), +]; + +pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('Âģ', 'Âģ'), + ('â', 'â'), + ('â', 'â'), + ('âē', 'âē'), + ('â¸', 'â¸'), + ('â¸
', 'â¸
'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('⸥', '⸥'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{13438}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[ + ('ÂĢ', 'ÂĢ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âš', 'âš'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('⸠', '⸠'), +]; + +pub const LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('Í°', 'Í´'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('Ø ', 'Ų'), + ('ŲŽ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛĨ', 'ÛĻ'), + ('ÛŽ', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', 'ā '), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤Ŋ', 'ā¤Ŋ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ˛', 'āŠ´'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĒŊ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢš', 'āĢš'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āą', 'āą'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('ā¯', 'ā¯'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛Ŋ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗą', 'āŗ˛'), + ('\u{d04}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩē', 'āĩŋ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('ā¸', 'ā¸°'), + ('ā¸˛', 'ā¸ŗ'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē°'), + ('āē˛', 'āēŗ'), + ('āēŊ', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āž', 'āž'), + ('á', 'áĒ'), + ('áŋ', 'áŋ'), + ('á', 'á'), + ('á', 'á'), + ('áĄ', 'áĄ'), + ('áĨ', 'áĻ'), + ('áŽ', 'á°'), + ('áĩ', 'á'), + ('á', 'á'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áą', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á', 'áŗ'), + ('á', 'á'), + ('á', 'á'), + ('á ', '᥸'), + ('áĸ', 'áĸ'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á¨', 'á¨'), + ('ᨠ', 'áŠ'), + ('áĒ§', 'áĒ§'), + ('áŦ
', 'áŦŗ'), + ('á
', 'á'), + ('áŽ', 'Ꭰ'), + ('ᎎ', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('á°', 'á°Ŗ'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('á´', 'áļŋ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â¯', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã'), + ('ãą', 'ãĩ'), + ('ãģ', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ę', 'ęŽ'), + ('ęŋ', 'ę'), + ('ę ', 'ęĨ'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ĸ'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęĸŗ'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ę§', 'ę§'), + ('ę§ ', 'ꧤ'), + ('ę§Ļ', 'ę§¯'), + ('ę§ē', '꧞'), + ('ę¨', 'ꨨ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęŠē'), + ('ꊞ', 'ęĒ¯'), + ('ęĒą', 'ęĒą'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒŊ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢĒ'), + ('ęĢ˛', 'ęĢ´'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯ĸ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', 'đ´Ŗ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đˇ'), + ('đ', 'đ¯'), + ('đ', 'đ¨'), + ('đ', 'đĻ'), + ('đ
', 'đ
'), + ('\u{11147}', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('đ', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ', 'đ´'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ'), + ('đ', 'đĒ'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('đ ', 'đ Ģ'), + ('đĸ ', 'đŖ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§Ŗ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨ē', 'đ¨ē'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°Ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģ˛'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊ'), + ('đž', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const LETTER_NUMBER: &'static [(char, char)] = &[ + ('áŽ', 'á°'), + ('â
', 'â'), + ('â
', 'â'), + ('ã', 'ã'), + ('ãĄ', 'ãŠ'), + ('ã¸', 'ãē'), + ('ęĻ', 'ę¯'), + ('đ
', 'đ
´'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŽ'), +]; + +pub const LINE_SEPARATOR: &'static [(char, char)] = + &[('\u{2028}', '\u{2028}')]; + +pub const LOWERCASE_LETTER: &'static [(char, char)] = &[ + ('a', 'z'), + ('Âĩ', 'Âĩ'), + ('Ã', 'Ãļ'), + ('ø', 'Ãŋ'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä
', 'Ä
'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('ÄĄ', 'ÄĄ'), + ('ÄŖ', 'ÄŖ'), + ('ÄĨ', 'ÄĨ'), + ('ħ', 'ħ'), + ('ÄŠ', 'ÄŠ'), + ('ÄĢ', 'ÄĢ'), + ('Ä', 'Ä'), + ('į', 'į'), + ('Äą', 'Äą'), + ('Äŗ', 'Äŗ'), + ('Äĩ', 'Äĩ'), + ('ġ', 'ĸ'), + ('Äē', 'Äē'), + ('Äŧ', 'Äŧ'), + ('Äž', 'Äž'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('ÅĄ', 'ÅĄ'), + ('ÅŖ', 'ÅŖ'), + ('ÅĨ', 'ÅĨ'), + ('ŧ', 'ŧ'), + ('ÅŠ', 'ÅŠ'), + ('ÅĢ', 'ÅĢ'), + ('Å', 'Å'), + ('ů', 'ů'), + ('Åą', 'Åą'), + ('Åŗ', 'Åŗ'), + ('Åĩ', 'Åĩ'), + ('Åˇ', 'Åˇ'), + ('Åē', 'Åē'), + ('Åŧ', 'Åŧ'), + ('Åž', 'Æ'), + ('Æ', 'Æ'), + ('Æ
', 'Æ
'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('ÆĄ', 'ÆĄ'), + ('ÆŖ', 'ÆŖ'), + ('ÆĨ', 'ÆĨ'), + ('ƨ', 'ƨ'), + ('ÆĒ', 'ÆĢ'), + ('Æ', 'Æ'), + ('Æ°', 'Æ°'), + ('Æ´', 'Æ´'), + ('Æļ', 'Æļ'), + ('Æš', 'Æē'), + ('ÆŊ', 'Æŋ'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('ĮĄ', 'ĮĄ'), + ('ĮŖ', 'ĮŖ'), + ('ĮĨ', 'ĮĨ'), + ('Į§', 'Į§'), + ('ĮŠ', 'ĮŠ'), + ('ĮĢ', 'ĮĢ'), + ('Į', 'Į'), + ('Į¯', 'Į°'), + ('Įŗ', 'Įŗ'), + ('Įĩ', 'Įĩ'), + ('Įš', 'Įš'), + ('Įģ', 'Įģ'), + ('ĮŊ', 'ĮŊ'), + ('Įŋ', 'Įŋ'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č
', 'Č
'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('ČĄ', 'ČĄ'), + ('ČŖ', 'ČŖ'), + ('ČĨ', 'ČĨ'), + ('ȧ', 'ȧ'), + ('ČŠ', 'ČŠ'), + ('ČĢ', 'ČĢ'), + ('Č', 'Č'), + ('Č¯', 'Č¯'), + ('Čą', 'Čą'), + ('Čŗ', 'Čš'), + ('Čŧ', 'Čŧ'), + ('Čŋ', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'Ę'), + ('Ę', 'Ę¯'), + ('Íą', 'Íą'), + ('Íŗ', 'Íŗ'), + ('͡', '͡'), + ('Íģ', 'ÍŊ'), + ('Î', 'Î'), + ('ÎŦ', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('ĪĄ', 'ĪĄ'), + ('ĪŖ', 'ĪŖ'), + ('ĪĨ', 'ĪĨ'), + ('Ī§', 'Ī§'), + ('ĪŠ', 'ĪŠ'), + ('ĪĢ', 'ĪĢ'), + ('Ī', 'Ī'), + ('Ī¯', 'Īŗ'), + ('Īĩ', 'Īĩ'), + ('Ī¸', 'Ī¸'), + ('Īģ', 'Īŧ'), + ('Đ°', 'Ņ'), + ('ŅĄ', 'ŅĄ'), + ('ŅŖ', 'ŅŖ'), + ('ŅĨ', 'ŅĨ'), + ('Ņ§', 'Ņ§'), + ('ŅŠ', 'ŅŠ'), + ('ŅĢ', 'ŅĢ'), + ('Ņ', 'Ņ'), + ('Ņ¯', 'Ņ¯'), + ('Ņą', 'Ņą'), + ('Ņŗ', 'Ņŗ'), + ('Ņĩ', 'Ņĩ'), + ('Ņˇ', 'Ņˇ'), + ('Ņš', 'Ņš'), + ('Ņģ', 'Ņģ'), + ('ŅŊ', 'ŅŊ'), + ('Ņŋ', 'Ņŋ'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('ŌĄ', 'ŌĄ'), + ('ŌŖ', 'ŌŖ'), + ('ŌĨ', 'ŌĨ'), + ('Ō§', 'Ō§'), + ('ŌŠ', 'ŌŠ'), + ('ŌĢ', 'ŌĢ'), + ('Ō', 'Ō'), + ('Ō¯', 'Ō¯'), + ('Ōą', 'Ōą'), + ('Ōŗ', 'Ōŗ'), + ('Ōĩ', 'Ōĩ'), + ('Ōˇ', 'Ōˇ'), + ('Ōš', 'Ōš'), + ('Ōģ', 'Ōģ'), + ('ŌŊ', 'ŌŊ'), + ('Ōŋ', 'Ōŋ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('ĶĄ', 'ĶĄ'), + ('ĶŖ', 'ĶŖ'), + ('ĶĨ', 'ĶĨ'), + ('Ķ§', 'Ķ§'), + ('ĶŠ', 'ĶŠ'), + ('ĶĢ', 'ĶĢ'), + ('Ķ', 'Ķ'), + ('Ķ¯', 'Ķ¯'), + ('Ķą', 'Ķą'), + ('Ķŗ', 'Ķŗ'), + ('Ķĩ', 'Ķĩ'), + ('Ķˇ', 'Ķˇ'), + ('Ķš', 'Ķš'), + ('Ķģ', 'Ķģ'), + ('ĶŊ', 'ĶŊ'), + ('Ķŋ', 'Ķŋ'), + ('Ô', 'Ô'), + ('Ô', 
'Ô'), + ('Ô
', 'Ô
'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('ÔĄ', 'ÔĄ'), + ('ÔŖ', 'ÔŖ'), + ('ÔĨ', 'ÔĨ'), + ('Ô§', 'Ô§'), + ('ÔŠ', 'ÔŠ'), + ('ÔĢ', 'ÔĢ'), + ('Ô', 'Ô'), + ('Ô¯', 'Ô¯'), + ('Õ ', 'Ö'), + ('á', 'áē'), + ('áŊ', 'áŋ'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á´', 'á´Ģ'), + ('áĩĢ', 'áĩˇ'), + ('áĩš', 'áļ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸
', 'á¸
'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('ḥ', 'ḥ'), + ('á¸Ŗ', 'á¸Ŗ'), + ('á¸Ĩ', 'á¸Ĩ'), + ('ḧ', 'ḧ'), + ('Ḋ', 'Ḋ'), + ('á¸Ģ', 'á¸Ģ'), + ('á¸', 'á¸'), + ('ḯ', 'ḯ'), + ('ḹ', 'ḹ'), + ('á¸ŗ', 'á¸ŗ'), + ('á¸ĩ', 'á¸ĩ'), + ('ḡ', 'ḡ'), + ('Ḛ', 'Ḛ'), + ('á¸ģ', 'á¸ģ'), + ('á¸Ŋ', 'á¸Ŋ'), + ('á¸ŋ', 'á¸ŋ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš
', 'áš
'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('ᚥ', 'ᚥ'), + ('ášŖ', 'ášŖ'), + ('ášĨ', 'ášĨ'), + ('ᚧ', 'ᚧ'), + ('ᚊ', 'ᚊ'), + ('ášĢ', 'ášĢ'), + ('áš', 'áš'), + ('ᚯ', 'ᚯ'), + ('ášą', 'ášą'), + ('ášŗ', 'ášŗ'), + ('ášĩ', 'ášĩ'), + ('ᚡ', 'ᚡ'), + ('ášš', 'ášš'), + ('ášģ', 'ášģ'), + ('ášŊ', 'ášŊ'), + ('ášŋ', 'ášŋ'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē
', 'áē
'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áēĄ', 'áēĄ'), + ('áēŖ', 'áēŖ'), + ('áēĨ', 'áēĨ'), + ('áē§', 'áē§'), + ('áēŠ', 'áēŠ'), + ('áēĢ', 'áēĢ'), + ('áē', 'áē'), + ('áē¯', 'áē¯'), + ('áēą', 'áēą'), + ('áēŗ', 'áēŗ'), + ('áēĩ', 'áēĩ'), + ('áēˇ', 'áēˇ'), + ('áēš', 'áēš'), + ('áēģ', 'áēģ'), + ('áēŊ', 'áēŊ'), + ('áēŋ', 'áēŋ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ
', 'áģ
'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģĄ', 'áģĄ'), + ('áģŖ', 'áģŖ'), + ('áģĨ', 'áģĨ'), + ('áģ§', 'áģ§'), + ('áģŠ', 'áģŠ'), + ('áģĢ', 'áģĢ'), + ('áģ', 'áģ'), + ('áģ¯', 'áģ¯'), + ('áģą', 'áģą'), + ('áģŗ', 'áģŗ'), + ('áģĩ', 'áģĩ'), + ('áģˇ', 'áģˇ'), + ('áģš', 'áģš'), + ('áģģ', 'áģģ'), + ('áģŊ', 'áģŊ'), + ('áģŋ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŧ§'), + ('áŧ°', 'áŧˇ'), + ('áŊ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ ', 'áŊ§'), + ('áŊ°', 'áŊŊ'), + ('áž', 'áž'), + ('áž', 'áž'), + ('áž ', 'ឧ'), + ('áž°', 'áž´'), + ('ážļ', 'ឡ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋ§'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋˇ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¯', 'â¯'), + ('â´', 'â´'), + ('âš', 'âš'), + ('âŧ', 'âŊ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â°°', 'âą'), + ('⹥', '⹥'), + ('âąĨ', 'âąĻ'), + ('⹨', '⹨'), + ('âąĒ', 'âąĒ'), + ('âąŦ', 'âąŦ'), + ('âąą', 'âąą'), + ('âąŗ', 'âą´'), + ('âąļ', 'âąģ'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛
', 'â˛
'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('ⲥ', 'ⲥ'), + ('â˛Ŗ', 'â˛Ŗ'), + ('â˛Ĩ', 'â˛Ĩ'), + ('ⲧ', 'ⲧ'), + ('Ⲋ', 'Ⲋ'), + ('â˛Ģ', 'â˛Ģ'), + ('â˛', 'â˛'), + ('â˛¯', 'â˛¯'), + ('ⲹ', 'ⲹ'), + ('â˛ŗ', 'â˛ŗ'), + ('â˛ĩ', 'â˛ĩ'), + ('ⲡ', 'ⲡ'), + ('Ⲛ', 'Ⲛ'), + ('â˛ģ', 'â˛ģ'), + ('â˛Ŋ', 'â˛Ŋ'), + ('â˛ŋ', 'â˛ŋ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ
', 'âŗ
'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗĄ', 'âŗĄ'), + ('âŗŖ', 'âŗ¤'), + ('âŗŦ', 'âŗŦ'), + ('âŗŽ', 'âŗŽ'), + ('âŗŗ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ęą'), + ('ęŗ', 'ęŗ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¯'), + ('ęą', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ę¯', 'ę¯'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('\u{a7c8}', '\u{a7c8}'), + ('\u{a7ca}', '\u{a7ca}'), + ('\u{a7f6}', '\u{a7f6}'), + ('ęē', 'ęē'), + ('ęŦ°', 'ę'), + ('ę ', '\u{ab68}'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŊ', 'īŊ'), + ('đ¨', 'đ'), + ('đ', 'đģ'), + ('đŗ', 'đŗ˛'), + ('đŖ', 'đŖ'), + ('đš ', 'đšŋ'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đ', 'đ'), + ('đļ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ'), + ('đĒ', 'đ'), + ('đ', 'đˇ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đē', 'đ'), + ('đŽ', 'đ'), + ('đĸ', 'đģ'), + ('đ', 'đ¯'), + ('đ', 'đĨ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŧ', 'đ'), + ('đ', 'đ'), + ('đļ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đ'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¤ĸ', 'đĨ'), +]; + +pub const MARK: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', 'ā¤'), + ('\u{93a}', '\u{93c}'), + ('ā¤ž', 'āĨ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'āĻ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ā¨'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('\u{abc}', '\u{abc}'), + ('āĒž', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'āŦ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3e}', 
'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ā˛'), + ('\u{cbc}', '\u{cbc}'), + ('ā˛ž', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', 'ā´'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'āļ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇŗ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŧŋ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('áĢ', '\u{103e}'), + ('á', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('áĸ', 'á¤'), + ('á§', 'á'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('áŠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', 'áŦ'), + ('\u{1b34}', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'áŽ'), + ('Ꭵ', '\u{1bad}'), + ('\u{1be6}', 'á¯ŗ'), + ('á°¤', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('áŗˇ', '\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{20d0}', 
'\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ę Ŗ', 'ę §'), + ('\u{a82c}', '\u{a82c}'), + ('ęĸ', 'ęĸ'), + ('ęĸ´', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', 'ęĨ'), + ('\u{a980}', 'ęĻ'), + ('\u{a9b3}', 'ę§'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ęŠ'), + ('ęŠģ', 'ęŠŊ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ęĢĢ', 'ęĢ¯'), + ('ęĢĩ', '\u{aaf6}'), + ('ę¯Ŗ', 'ę¯Ē'), + ('ę¯Ŧ', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', '\u{10f50}'), + ('đ', 'đ'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', 'đ'), + ('đ°', '\u{110ba}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('đ
', 'đ
'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', 'đ'), + ('đŗ', 'đ'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111ce}', '\u{111cf}'), + ('đŦ', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', 'đ'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đĸ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đĩ', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('đ¸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('đ°', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('đ Ŧ', '\u{1183a}'), + ('\u{11930}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{1193e}'), + ('\u{11940}', '\u{11940}'), + ('\u{11942}', '\u{11943}'), + ('đ§', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('đ§¤', 'đ§¤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', 'đ¨š'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('đ°¯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('đļ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', '\u{11d97}'), + ('\u{11ef3}', 'đģļ'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('đŊ', 'đž'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const MATH_SYMBOL: &'static [(char, char)] = &[ + ('+', '+'), + ('<', '>'), + ('|', '|'), + ('~', '~'), + ('ÂŦ', 'ÂŦ'), + ('Âą', 'Âą'), + ('Ã', 'Ã'), + ('Ãˇ', 'Ãˇ'), + ('Īļ', 'Īļ'), + ('Ø', 'Ø'), + ('â', 'â'), + ('â', 'â'), + ('âē', 'âŧ'), + ('â', 'â'), + ('â', 'â'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â ', 'â '), + ('âŖ', 'âŖ'), + ('âĻ', 'âĻ'), + ('âŽ', 'âŽ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â´', 'âŋ'), + ('â ', 'âĄ'), + ('âŧ', 'âŧ'), + ('â', 'âŗ'), + ('â', 'âĄ'), + ('âˇ', 'âˇ'), + ('â', 'â'), + ('â¸', 'âŋ'), + ('â¯', 'â¯'), + ('â', 'â'), + ('â', 'âĨ'), + ('â°', 'âŋ'), + ('â¤', 'âĻ'), + ('âĻ', 'â§'), + ('â§', 'â§ģ'), + ('⧞', 'âĢŋ'), + ('âŦ°', 'â'), + ('â', 'â'), + ('īŦŠ', 'īŦŠ'), + ('īšĸ', 'īšĸ'), + ('īš¤', 'īšĻ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŋĸ', 'īŋĸ'), + ('īŋŠ', 'īŋŦ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đģ', 'đģ'), + ('đ', 'đ'), + ('đĩ', 'đĩ'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đ', 'đ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('đģ°', 'đģą'), +]; + +pub const MODIFIER_LETTER: &'static [(char, char)] = &[ + ('Ę°', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('Í´', 'Í´'), + ('Íē', 'Íē'), + ('Õ', 'Õ'), + ('Ų', 'Ų'), + ('ÛĨ', 'ÛĻ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('āĨą', 'āĨą'), + ('āš', 'āš'), + ('āģ', 'āģ'), + ('áŧ', 'áŧ'), + ('á', 'á'), + ('áĄ', 'áĄ'), + ('áĒ§', 'áĒ§'), + ('Ṹ', 'áąŊ'), + ('á´Ŧ', 'áĩĒ'), + ('áĩ¸', 'áĩ¸'), + ('áļ', 'áļŋ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('âąŧ', 'âąŊ'), + ('âĩ¯', 'âĩ¯'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã
'), + ('ãą', 'ãĩ'), + ('ãģ', 'ãģ'), + ('ã', 'ã'), + ('ãŧ', 'ãž'), + ('ę', 'ę'), + ('ę¸', 'ęŊ'), + ('ę', 'ę'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę°', 'ę°'), + ('ę', 'ę'), + ('ę¸', 'ęš'), + ('ę§', 'ę§'), + ('ę§Ļ', 'ę§Ļ'), + ('ꊰ', 'ꊰ'), + ('ęĢ', 'ęĢ'), + ('ęĢŗ', 'ęĢ´'), + ('ę', 'ę'), + ('\u{ab69}', '\u{ab69}'), + ('īŊ°', 'īŊ°'), + ('\u{ff9e}', '\u{ff9f}'), + ('đ', 'đ'), + ('đž', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đˇ', 'đŊ'), + ('đĨ', 'đĨ'), +]; + +pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[ + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('¯', '¯'), + ('´', '´'), + ('¸', '¸'), + ('Ë', 'Ë
'), + ('Ë', 'Ë'), + ('ËĨ', 'ËĢ'), + ('Ë', 'Ë'), + ('˯', 'Ëŋ'), + ('Íĩ', 'Íĩ'), + ('Î', 'Î
'), + ('ážŊ', 'ážŊ'), + ('ážŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋŊ', 'áŋž'), + ('ã', 'ã'), + ('ę', 'ę'), + ('ę ', 'ęĄ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('\u{ab6a}', '\u{ab6b}'), + ('īŽ˛', 'ī¯'), + ('īŧž', 'īŧž'), + ('īŊ', 'īŊ'), + ('īŋŖ', 'īŋŖ'), + ('đģ', 'đŋ'), +]; + +pub const NONSPACING_MARK: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{487}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3f}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b56}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + 
('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + 
('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b34}'), + ('\u{1b36}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302d}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + 
('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{11340}', '\u{11340}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', 
'\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('²', 'Âŗ'), + ('š', 'š'), + ('Âŧ', 'ž'), + ('Ų ', 'ŲŠ'), + ('Û°', 'Ûš'), + ('ß', 'ß'), + ('āĨĻ', 'āĨ¯'), + ('ā§Ļ', 'ā§¯'), + ('ā§´', 'ā§š'), + ('āŠĻ', 'āŠ¯'), + ('āĢĻ', 'āĢ¯'), + ('āĻ', 'ā¯'), + ('ā˛', 'āˇ'), + ('ā¯Ļ', 'ā¯˛'), + ('āąĻ', 'āą¯'), + ('āą¸', 'āąž'), + ('āŗĻ', 'āŗ¯'), + ('āĩ', 'āĩ'), + ('āĩĻ', 'āĩ¸'), + ('āˇĻ', 'āˇ¯'), + ('āš', 'āš'), + 
('āģ', 'āģ'), + ('āŧ ', 'āŧŗ'), + ('á', 'á'), + ('á', 'á'), + ('áŠ', 'áŧ'), + ('áŽ', 'á°'), + ('á ', 'áŠ'), + ('á°', 'áš'), + ('á ', 'á '), + ('áĨ', 'áĨ'), + ('á§', 'á§'), + ('áĒ', 'áĒ'), + ('áĒ', 'áĒ'), + ('á', 'á'), + ('Ꮀ', '᎚'), + ('áą', 'áą'), + ('áą', 'áą'), + ('â°', 'â°'), + ('â´', 'âš'), + ('â', 'â'), + ('â
', 'â'), + ('â
', 'â'), + ('â ', 'â'), + ('âĒ', 'âŋ'), + ('âļ', 'â'), + ('âŗŊ', 'âŗŊ'), + ('ã', 'ã'), + ('ãĄ', 'ãŠ'), + ('ã¸', 'ãē'), + ('ã', 'ã'), + ('ã ', 'ãŠ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãą', 'ãŋ'), + ('ę ', 'ęŠ'), + ('ęĻ', 'ę¯'), + ('ę °', 'ę ĩ'), + ('ęŖ', 'ęŖ'), + ('ę¤', 'ę¤'), + ('ę§', 'ę§'), + ('꧰', '꧚'), + ('ęŠ', 'ęŠ'), + ('ę¯°', 'ę¯š'), + ('īŧ', 'īŧ'), + ('đ', 'đŗ'), + ('đ
', 'đ
¸'), + ('đ', 'đ'), + ('đĄ', 'đģ'), + ('đ ', 'đŖ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŠ'), + ('đĄ', 'đĄ'), + ('đĄš', 'đĄŋ'), + ('đĸ§', 'đĸ¯'), + ('đŖģ', 'đŖŋ'), + ('đ¤', 'đ¤'), + ('đĻŧ', 'đĻŊ'), + ('đ§', 'đ§'), + ('đ§', 'đ§ŋ'), + ('đŠ', 'đŠ'), + ('đŠŊ', 'đŠž'), + ('đĒ', 'đĒ'), + ('đĢĢ', 'đĢ¯'), + ('đ', 'đ'), + ('đ¸', 'đŋ'), + ('đŽŠ', 'đŽ¯'), + ('đŗē', 'đŗŋ'), + ('đ´°', 'đ´š'), + ('đš ', 'đšž'), + ('đŧ', 'đŧĻ'), + ('đŊ', 'đŊ'), + ('\u{10fc5}', '\u{10fcb}'), + ('đ', 'đ¯'), + ('đ°', 'đš'), + ('đļ', 'đŋ'), + ('đ', 'đ'), + ('đĄ', 'đ´'), + ('đ°', 'đš'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đģ'), + ('đŖ ', 'đŖ˛'), + ('\u{11950}', '\u{11959}'), + ('đą', 'đąŦ'), + ('đĩ', 'đĩ'), + ('đļ ', 'đļŠ'), + ('đŋ', 'đŋ'), + ('đ', 'đŽ'), + ('đŠ ', 'đŠŠ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đē', 'đē'), + ('đ ', 'đŗ'), + ('đ ', 'đ¸'), + ('đ', 'đŋ'), + ('đ
', 'đ
'), + ('đ°', 'đš'), + ('đŖ', 'đŖ'), + ('đĨ', 'đĨ'), + ('đąą', 'đ˛Ģ'), + ('đ˛', 'đ˛¯'), + ('đ˛ą', 'đ˛´'), + ('đ´', 'đ´'), + ('đ´¯', 'đ´Ŋ'), + ('đ', 'đ'), + ('\u{1fbf0}', '\u{1fbf9}'), +]; + +pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[ + ('(', '('), + ('[', '['), + ('{', '{'), + ('āŧē', 'āŧē'), + ('āŧŧ', 'āŧŧ'), + ('á', 'á'), + ('â', 'â'), + ('â', 'â'), + ('â
', 'â
'), + ('âŊ', 'âŊ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âŠ', 'âŠ'), + ('â¨', 'â¨'), + ('âĒ', 'âĒ'), + ('âŦ', 'âŦ'), + ('âŽ', 'âŽ'), + ('â°', 'â°'), + ('â˛', 'â˛'), + ('â´', 'â´'), + ('â
', 'â
'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'âĒ'), + ('âŦ', 'âŦ'), + ('âŽ', 'âŽ'), + ('âĻ', 'âĻ'), + ('âĻ
', 'âĻ
'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ'), + ('â§', 'â§'), + ('â§', 'â§'), + ('â§ŧ', 'â§ŧ'), + ('â¸ĸ', 'â¸ĸ'), + ('⸤', '⸤'), + ('â¸Ļ', 'â¸Ļ'), + ('⸨', '⸨'), + ('âš', 'âš'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ī´ŋ', 'ī´ŋ'), + ('ī¸', 'ī¸'), + ('ī¸ĩ', 'ī¸ĩ'), + ('ī¸ˇ', 'ī¸ˇ'), + ('ī¸š', 'ī¸š'), + ('ī¸ģ', 'ī¸ģ'), + ('ī¸Ŋ', 'ī¸Ŋ'), + ('ī¸ŋ', 'ī¸ŋ'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧģ', 'īŧģ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŊĸ', 'īŊĸ'), +]; + +pub const OTHER: &'static [(char, char)] = &[ + ('\u{0}', '\u{1f}'), + ('\u{7f}', '\u{9f}'), + ('\u{ad}', '\u{ad}'), + ('\u{378}', '\u{379}'), + ('\u{380}', '\u{383}'), + ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), + ('\u{3a2}', '\u{3a2}'), + ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), + ('\u{58b}', '\u{58c}'), + ('\u{590}', '\u{590}'), + ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ee}'), + ('\u{5f5}', '\u{605}'), + ('\u{61c}', '\u{61d}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70e}', '\u{70f}'), + ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7fc}'), + ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), + ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{89f}'), + ('\u{8b5}', '\u{8b5}'), + ('\u{8c8}', '\u{8d2}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), + ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), + ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), + ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), + ('\u{9e4}', '\u{9e5}'), + ('\u{9ff}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), + ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), + ('\u{a29}', 
'\u{a29}'), + ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), + ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), + ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), + ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), + ('\u{a77}', '\u{a80}'), + ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), + ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), + ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), + ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), + ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), + ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), + ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), + ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b54}'), + ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), + ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), + ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), + ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), + ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), + ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), + ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), + ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), + ('\u{c3a}', '\u{c3c}'), + ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), + ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5f}'), + ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c76}'), + ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), + ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), + ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdd}'), + 
('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), + ('\u{cf3}', '\u{cff}'), + ('\u{d0d}', '\u{d0d}'), + ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), + ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d80}'), + ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), + ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), + ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), + ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), + ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), + ('\u{e85}', '\u{e85}'), + ('\u{e8b}', '\u{e8b}'), + ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), + ('\u{ebe}', '\u{ebf}'), + ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), + ('\u{ece}', '\u{ecf}'), + ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), + ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), + ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), + ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), + ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), + ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), + ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), + ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), + ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), + ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), + ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), + ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), + ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), + ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), + ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), + ('\u{170d}', '\u{170d}'), + ('\u{1715}', '\u{171f}'), + ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), + ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), + ('\u{1774}', '\u{177f}'), + 
('\u{17de}', '\u{17df}'), + ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), + ('\u{180e}', '\u{180f}'), + ('\u{181a}', '\u{181f}'), + ('\u{1879}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), + ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), + ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), + ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), + ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), + ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), + ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), + ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), + ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), + ('\u{1ac1}', '\u{1aff}'), + ('\u{1b4c}', '\u{1b4f}'), + ('\u{1b7d}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), + ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), + ('\u{1c89}', '\u{1c8f}'), + ('\u{1cbb}', '\u{1cbc}'), + ('\u{1cc8}', '\u{1ccf}'), + ('\u{1cfb}', '\u{1cff}'), + ('\u{1dfa}', '\u{1dfa}'), + ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), + ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), + ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), + ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), + ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), + ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), + ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), + ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('\u{2072}', '\u{2073}'), + ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), + ('\u{20c0}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), + ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), + ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), + ('\u{2b96}', '\u{2b96}'), + ('\u{2c2f}', '\u{2c2f}'), + ('\u{2c5f}', '\u{2c5f}'), + ('\u{2cf4}', '\u{2cf8}'), + ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), + ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), + ('\u{2d71}', '\u{2d7e}'), + 
('\u{2d97}', '\u{2d9f}'), + ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), + ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), + ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), + ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), + ('\u{2e53}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), + ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), + ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), + ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), + ('\u{3130}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), + ('\u{31e4}', '\u{31ef}'), + ('\u{321f}', '\u{321f}'), + ('\u{9ffd}', '\u{9fff}'), + ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), + ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), + ('\u{a7c0}', '\u{a7c1}'), + ('\u{a7cb}', '\u{a7f4}'), + ('\u{a82d}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), + ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), + ('\u{a8da}', '\u{a8df}'), + ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), + ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), + ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), + ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), + ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), + ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), + ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), + ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab6c}', '\u{ab6f}'), + ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), + ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), + ('\u{d7fc}', '\u{f8ff}'), + ('\u{fa6e}', '\u{fa6f}'), + ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), + ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), + ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), + ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), + ('\u{fbc2}', '\u{fbd2}'), + ('\u{fd40}', '\u{fd4f}'), + ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdef}'), + ('\u{fdfe}', '\u{fdff}'), + ('\u{fe1a}', '\u{fe1f}'), + ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), + 
('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), + ('\u{fefd}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fffb}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), + ('\u{1019d}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), + ('\u{10570}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), + ('\u{10a36}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a49}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), + 
('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d28}', '\u{10d2f}'), + ('\u{10d3a}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10e7f}'), + ('\u{10eaa}', '\u{10eaa}'), + ('\u{10eae}', '\u{10eaf}'), + ('\u{10eb2}', '\u{10eff}'), + ('\u{10f28}', '\u{10f2f}'), + ('\u{10f5a}', '\u{10faf}'), + ('\u{10fcc}', '\u{10fdf}'), + ('\u{10ff7}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), + ('\u{11070}', '\u{1107e}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110c2}', '\u{110cf}'), + ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), + ('\u{11135}', '\u{11135}'), + ('\u{11148}', '\u{1114f}'), + ('\u{11177}', '\u{1117f}'), + ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), + ('\u{11212}', '\u{11212}'), + ('\u{1123f}', '\u{1127f}'), + ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), + ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), + ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), + ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), + ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), + ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), + ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133a}'), + ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), + ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), + ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), + ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), + ('\u{1145c}', '\u{1145c}'), + ('\u{11462}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), + ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), + ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), + ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), + ('\u{116b9}', '\u{116bf}'), + ('\u{116ca}', 
'\u{116ff}'), + ('\u{1171b}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), + ('\u{11740}', '\u{117ff}'), + ('\u{1183c}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), + ('\u{11907}', '\u{11908}'), + ('\u{1190a}', '\u{1190b}'), + ('\u{11914}', '\u{11914}'), + ('\u{11917}', '\u{11917}'), + ('\u{11936}', '\u{11936}'), + ('\u{11939}', '\u{1193a}'), + ('\u{11947}', '\u{1194f}'), + ('\u{1195a}', '\u{1199f}'), + ('\u{119a8}', '\u{119a9}'), + ('\u{119d8}', '\u{119d9}'), + ('\u{119e5}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), + ('\u{11aa3}', '\u{11abf}'), + ('\u{11af9}', '\u{11bff}'), + ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), + ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), + ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), + ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), + ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), + ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), + ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11d5f}'), + ('\u{11d66}', '\u{11d66}'), + ('\u{11d69}', '\u{11d69}'), + ('\u{11d8f}', '\u{11d8f}'), + ('\u{11d92}', '\u{11d92}'), + ('\u{11d99}', '\u{11d9f}'), + ('\u{11daa}', '\u{11edf}'), + ('\u{11ef9}', '\u{11faf}'), + ('\u{11fb1}', '\u{11fbf}'), + ('\u{11ff2}', '\u{11ffe}'), + ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), + ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12fff}'), + ('\u{1342f}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), + ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), + ('\u{16a6a}', '\u{16a6d}'), + ('\u{16a70}', '\u{16acf}'), + ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), + ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), + ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), + ('\u{16b90}', '\u{16e3f}'), + ('\u{16e9b}', '\u{16eff}'), + ('\u{16f4b}', '\u{16f4e}'), + ('\u{16f88}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), + ('\u{16fe5}', '\u{16fef}'), + ('\u{16ff2}', '\u{16fff}'), + ('\u{187f8}', '\u{187ff}'), + 
('\u{18cd6}', '\u{18cff}'), + ('\u{18d09}', '\u{1afff}'), + ('\u{1b11f}', '\u{1b14f}'), + ('\u{1b153}', '\u{1b163}'), + ('\u{1b168}', '\u{1b16f}'), + ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), + ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), + ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca0}', '\u{1cfff}'), + ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{1d1e9}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2df}'), + ('\u{1d2f4}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), + ('\u{1d379}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e0ff}'), + ('\u{1e12d}', '\u{1e12f}'), + ('\u{1e13e}', '\u{1e13f}'), + ('\u{1e14a}', '\u{1e14d}'), + ('\u{1e150}', '\u{1e2bf}'), + ('\u{1e2fa}', '\u{1e2fe}'), + ('\u{1e300}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94c}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1ec70}'), + ('\u{1ecb5}', '\u{1ed00}'), + ('\u{1ed3e}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', 
'\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f1ae}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d8}', '\u{1f6df}'), + ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6fd}', '\u{1f6ff}'), + ('\u{1f774}', '\u{1f77f}'), + ('\u{1f7d9}', '\u{1f7df}'), + ('\u{1f7ec}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8af}'), + ('\u{1f8b2}', '\u{1f8ff}'), + ('\u{1f979}', '\u{1f979}'), + ('\u{1f9cc}', '\u{1f9cc}'), + ('\u{1fa54}', '\u{1fa5f}'), + ('\u{1fa6e}', '\u{1fa6f}'), + ('\u{1fa75}', '\u{1fa77}'), + ('\u{1fa7b}', '\u{1fa7f}'), + ('\u{1fa87}', '\u{1fa8f}'), + ('\u{1faa9}', '\u{1faaf}'), + ('\u{1fab7}', '\u{1fabf}'), + ('\u{1fac3}', '\u{1facf}'), + ('\u{1fad7}', '\u{1faff}'), + ('\u{1fb93}', '\u{1fb93}'), + ('\u{1fbcb}', '\u{1fbef}'), + ('\u{1fbfa}', '\u{1ffff}'), + ('\u{2a6de}', '\u{2a6ff}'), + 
('\u{2b735}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{2ffff}'), + ('\u{3134b}', '\u{e00ff}'), + ('\u{e01f0}', '\u{10ffff}'), +]; + +pub const OTHER_LETTER: &'static [(char, char)] = &[ + ('ÂĒ', 'ÂĒ'), + ('Âē', 'Âē'), + ('Æģ', 'Æģ'), + ('Į', 'Į'), + ('Ę', 'Ę'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('Ø ', 'Øŋ'), + ('Ų', 'Ų'), + ('ŲŽ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛŽ', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ā ', 'ā '), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤Ŋ', 'ā¤Ŋ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨ˛', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ˛', 'āŠ´'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĒŊ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢš', 'āĢš'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āą', 'āą'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('ā¯', 'ā¯'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛Ŋ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗą', 'āŗ˛'), + ('\u{d04}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩē', 'āĩŋ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('ā¸', 'ā¸°'), + ('ā¸˛', 'ā¸ŗ'), + ('āš', 'āš
'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē°'), + ('āē˛', 'āēŗ'), + ('āēŊ', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āž', 'āž'), + ('á', 'áĒ'), + ('áŋ', 'áŋ'), + ('á', 'á'), + ('á', 'á'), + ('áĄ', 'áĄ'), + ('áĨ', 'áĻ'), + ('áŽ', 'á°'), + ('áĩ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áą', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á', 'áŗ'), + ('á', 'á'), + ('á ', 'áĄ'), + ('áĄ', '᥸'), + ('áĸ', 'áĸ'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á¨', 'á¨'), + ('ᨠ', 'áŠ'), + ('áŦ
', 'áŦŗ'), + ('á
', 'á'), + ('áŽ', 'Ꭰ'), + ('ᎎ', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('á°', 'á°Ŗ'), + ('áą', 'áą'), + ('áą', 'ṡ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('âĩ', 'â¸'), + ('â´°', 'âĩ§'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ã', 'ã'), + ('ãŧ', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŋ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ęˇ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ęŽ', 'ęŽ'), + ('ę ', 'ęĨ'), + ('ę', 'ę'), + ('ęˇ', 'ęˇ'), + ('ęģ', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ĸ'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęĸŗ'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ę§ ', 'ꧤ'), + ('ꧧ', 'ę§¯'), + ('ę§ē', '꧞'), + ('ę¨', 'ꨨ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠ¯'), + ('ꊹ', 'ęŠļ'), + ('ęŠē', 'ęŠē'), + ('ꊞ', 'ęĒ¯'), + ('ęĒą', 'ęĒą'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒŊ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢĒ'), + ('ęĢ˛', 'ęĢ˛'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ę¯', 'ę¯ĸ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŊĻ', 'īŊ¯'), + ('īŊą', 'īž'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ´', 'đ´Ŗ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đˇ'), + ('đ', 'đ¯'), + ('đ', 'đ¨'), + ('đ', 'đĻ'), + ('đ
', 'đ
'), + ('\u{11147}', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('đ', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ', 'đ´'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ'), + ('đ', 'đĒ'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('đ ', 'đ Ģ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§Ŗ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨ē', 'đ¨ē'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°Ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģ˛'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊ'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đŦ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const OTHER_NUMBER: &'static [(char, char)] = &[ + ('²', 'Âŗ'), + ('š', 'š'), + ('Âŧ', 'ž'), + ('ā§´', 'ā§š'), + ('ā˛', 'āˇ'), + ('ā¯°', 'ā¯˛'), + ('āą¸', 'āąž'), + ('āĩ', 'āĩ'), + ('āĩ°', 'āĩ¸'), + ('āŧĒ', 'āŧŗ'), + ('áŠ', 'áŧ'), + ('á°', 'áš'), + ('á§', 'á§'), + ('â°', 'â°'), + ('â´', 'âš'), + ('â', 'â'), + ('â
', 'â
'), + ('â', 'â'), + ('â ', 'â'), + ('âĒ', 'âŋ'), + ('âļ', 'â'), + ('âŗŊ', 'âŗŊ'), + ('ã', 'ã'), + ('ã ', 'ãŠ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãą', 'ãŋ'), + ('ę °', 'ę ĩ'), + ('đ', 'đŗ'), + ('đ
ĩ', 'đ
¸'), + ('đ', 'đ'), + ('đĄ', 'đģ'), + ('đ ', 'đŖ'), + ('đĄ', 'đĄ'), + ('đĄš', 'đĄŋ'), + ('đĸ§', 'đĸ¯'), + ('đŖģ', 'đŖŋ'), + ('đ¤', 'đ¤'), + ('đĻŧ', 'đĻŊ'), + ('đ§', 'đ§'), + ('đ§', 'đ§ŋ'), + ('đŠ', 'đŠ'), + ('đŠŊ', 'đŠž'), + ('đĒ', 'đĒ'), + ('đĢĢ', 'đĢ¯'), + ('đ', 'đ'), + ('đ¸', 'đŋ'), + ('đŽŠ', 'đŽ¯'), + ('đŗē', 'đŗŋ'), + ('đš ', 'đšž'), + ('đŧ', 'đŧĻ'), + ('đŊ', 'đŊ'), + ('\u{10fc5}', '\u{10fcb}'), + ('đ', 'đĨ'), + ('đĄ', 'đ´'), + ('đē', 'đģ'), + ('đŖĒ', 'đŖ˛'), + ('đą', 'đąŦ'), + ('đŋ', 'đŋ'), + ('đ', 'đĄ'), + ('đē', 'đē'), + ('đ ', 'đŗ'), + ('đ ', 'đ¸'), + ('đŖ', 'đŖ'), + ('đąą', 'đ˛Ģ'), + ('đ˛', 'đ˛¯'), + ('đ˛ą', 'đ˛´'), + ('đ´', 'đ´'), + ('đ´¯', 'đ´Ŋ'), + ('đ', 'đ'), +]; + +pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), + ('%', '\''), + ('*', '*'), + (',', ','), + ('.', '/'), + (':', ';'), + ('?', '@'), + ('\\', '\\'), + ('ÂĄ', 'ÂĄ'), + ('§', '§'), + ('Âļ', '¡'), + ('Âŋ', 'Âŋ'), + ('Íž', 'Íž'), + ('Î', 'Î'), + ('Õ', 'Õ'), + ('Ö', 'Ö'), + ('×', '×'), + ('×', '×'), + ('×', '×'), + ('×ŗ', '×´'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('ŲĒ', 'Ų'), + ('Û', 'Û'), + ('Ü', 'Ü'), + ('ߡ', 'ßš'), + ('ā °', 'ā ž'), + ('āĄ', 'āĄ'), + ('āĨ¤', 'āĨĨ'), + ('āĨ°', 'āĨ°'), + ('ā§Ŋ', 'ā§Ŋ'), + ('āŠļ', 'āŠļ'), + ('āĢ°', 'āĢ°'), + ('āąˇ', 'āąˇ'), + ('ā˛', 'ā˛'), + ('āˇ´', 'āˇ´'), + ('āš', 'āš'), + ('āš', 'āš'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āž
', 'āž
'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('á', 'á'), + ('áģ', 'áģ'), + ('á ', 'á¨'), + ('áŽ', 'áŽ'), + ('áĢ', 'á'), + ('áĩ', 'áļ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á
'), + ('á ', 'á '), + ('áĨ', 'áĨ
'), + ('á¨', 'á¨'), + ('áĒ ', 'áĒĻ'), + ('áĒ¨', 'áĒ'), + ('á', 'á '), + ('á¯ŧ', 'á¯ŋ'), + ('á°ģ', 'á°ŋ'), + ('áąž', 'áąŋ'), + ('áŗ', 'áŗ'), + ('áŗ', 'áŗ'), + ('â', 'â'), + ('â ', 'â§'), + ('â°', 'â¸'), + ('âģ', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âŗš', 'âŗŧ'), + ('âŗž', 'âŗŋ'), + ('âĩ°', 'âĩ°'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸Ē', '⸎'), + ('⸰', '⸚'), + ('â¸ŧ', 'â¸ŋ'), + ('âš', 'âš'), + ('âš', 'âš'), + ('\u{2e52}', '\u{2e52}'), + ('ã', 'ã'), + ('ãŊ', 'ãŊ'), + ('ãģ', 'ãģ'), + ('ęž', 'ęŋ'), + ('ę', 'ę'), + ('ęŗ', 'ęŗ'), + ('ęž', 'ęž'), + ('ę˛', 'ęˇ'), + ('ꥴ', 'ꥡ'), + ('ęŖ', 'ęŖ'), + ('ęŖ¸', 'ęŖē'), + ('ęŖŧ', 'ęŖŧ'), + ('ꤎ', 'ę¤¯'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ęĢ°', 'ęĢą'), + ('ę¯Ģ', 'ę¯Ģ'), + ('ī¸', 'ī¸'), + ('ī¸', 'ī¸'), + ('ī¸°', 'ī¸°'), + ('īš
', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īšĄ'), + ('īš¨', 'īš¨'), + ('īšĒ', 'īšĢ'), + ('īŧ', 'īŧ'), + ('īŧ
', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ '), + ('īŧŧ', 'īŧŧ'), + ('īŊĄ', 'īŊĄ'), + ('īŊ¤', 'īŊĨ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đĄ', 'đĄ'), + ('đ¤', 'đ¤'), + ('đ¤ŋ', 'đ¤ŋ'), + ('đŠ', 'đŠ'), + ('đŠŋ', 'đŠŋ'), + ('đĢ°', 'đĢļ'), + ('đŦš', 'đŦŋ'), + ('đŽ', 'đŽ'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đģ', 'đŧ'), + ('đž', 'đ'), + ('đ
', 'đ
'), + ('đ
´', 'đ
ĩ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đŊ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('\u{1145a}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŦ'), + ('đŧ', 'đž'), + ('đ ģ', 'đ ģ'), + ('\u{11944}', '\u{11946}'), + ('đ§ĸ', 'đ§ĸ'), + ('đ¨ŋ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đĒ', 'đĒĸ'), + ('đą', 'đą
'), + ('đą°', 'đąą'), + ('đģˇ', 'đģ¸'), + ('đŋŋ', 'đŋŋ'), + ('đ°', 'đ´'), + ('đŠŽ', 'đŠ¯'), + ('đĢĩ', 'đĢĩ'), + ('đŦˇ', 'đŦģ'), + ('đ', 'đ'), + ('đē', 'đē'), + ('đŋĸ', 'đŋĸ'), + ('đ˛', 'đ˛'), + ('đĒ', 'đĒ'), + ('đĨ', 'đĨ'), +]; + +pub const OTHER_SYMBOL: &'static [(char, char)] = &[ + ('ÂĻ', 'ÂĻ'), + ('Š', 'Š'), + ('ÂŽ', 'ÂŽ'), + ('°', '°'), + ('Ō', 'Ō'), + ('Ö', 'Ö'), + ('Ø', 'Ø'), + ('Û', 'Û'), + ('ÛŠ', 'ÛŠ'), + ('ÛŊ', 'Ûž'), + ('ßļ', 'ßļ'), + ('ā§ē', 'ā§ē'), + ('ā°', 'ā°'), + ('ā¯ŗ', 'ā¯¸'), + ('ā¯ē', 'ā¯ē'), + ('āąŋ', 'āąŋ'), + ('āĩ', 'āĩ'), + ('āĩš', 'āĩš'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧ´', 'āŧ´'), + ('āŧļ', 'āŧļ'), + ('āŧ¸', 'āŧ¸'), + ('āžž', 'āŋ
'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('áĨ', 'áĨ'), + ('á§', 'á§ŋ'), + ('áĄ', 'áĒ'), + ('á´', 'áŧ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'âŖ'), + ('âĨ', 'âĨ'), + ('â§', 'â§'), + ('âŠ', 'âŠ'), + ('âŽ', 'âŽ'), + ('âē', 'âģ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĄ', 'âĸ'), + ('â¤', 'âĨ'), + ('â§', 'â'), + ('â¯', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'âŗ'), + ('â', 'â'), + ('â', 'â'), + ('âĸ', 'â¨'), + ('âĢ', 'âģ'), + ('âŊ', 'â'), + ('â´', 'â'), + ('âĸ', 'âĻ'), + ('â', 'â'), + ('â', 'âŠ'), + ('â', 'âļ'), + ('â¸', 'â'), + ('â', 'âˇ'), + ('â', 'âŽ'), + ('â°', 'â§'), + ('â', 'âŋ'), + ('â ', 'âŖŋ'), + ('âŦ', 'âŦ¯'), + ('â
', 'â'), + ('â', 'âŗ'), + ('âļ', 'âŽ'), + ('\u{2b97}', 'â¯ŋ'), + ('âŗĨ', 'âŗĒ'), + ('\u{2e50}', '\u{2e51}'), + ('âē', 'âē'), + ('âē', 'âģŗ'), + ('âŧ', 'âŋ'), + ('âŋ°', 'âŋģ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã ', 'ã '), + ('ãļ', 'ãˇ'), + ('ãž', 'ãŋ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ãŖ'), + ('ã', 'ã'), + ('ãĒ', 'ã'), + ('ã', 'ã'), + ('ã ', 'ãŋ'), + ('ã', 'ã°'), + ('ã', 'ãŋ'), + ('äˇ', 'äˇŋ'), + ('ę', 'ę'), + ('ę ¨', 'ę Ģ'), + ('ę ļ', 'ę ˇ'), + ('ę š', 'ę š'), + ('ꊡ', 'ꊚ'), + ('īˇŊ', 'īˇŊ'), + ('īŋ¤', 'īŋ¤'), + ('īŋ¨', 'īŋ¨'), + ('īŋ', 'īŋŽ'), + ('īŋŧ', 'īŋŊ'), + ('đˇ', 'đŋ'), + ('đ
š', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1019c}'), + ('đ ', 'đ '), + ('đ', 'đŧ'), + ('đĄˇ', 'đĄ¸'), + ('đĢ', 'đĢ'), + ('đŋ', 'đŋ'), + ('đŋ', 'đŋ'), + ('đŋĄ', 'đŋą'), + ('đŦŧ', 'đŦŋ'), + ('đ
', 'đ
'), + ('đ˛', 'đ˛'), + ('đ', 'đĩ'), + ('đ', 'đĻ'), + ('đŠ', 'đ
¤'), + ('đ
Ē', 'đ
Ŧ'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('đŽ', 'đ¨'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ ', 'đ§ŋ'), + ('đ¨ˇ', 'đ¨ē'), + ('đŠ', 'đŠ´'), + ('đŠļ', 'đĒ'), + ('đĒ
', 'đĒ'), + ('đ
', 'đ
'), + ('đ˛Ŧ', 'đ˛Ŧ'), + ('đ´Ž', 'đ´Ž'), + ('đ', 'đĢ'), + ('đ°', 'đ'), + ('đ ', 'đŽ'), + ('đą', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('\u{1f10d}', '\u{1f1ad}'), + ('đĻ', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đĨ'), + ('đ', 'đē'), + ('đ', '\u{1f6d7}'), + ('đ ', 'đŦ'), + ('đ°', '\u{1f6fc}'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đĢ'), + ('đ ', 'đ '), + ('đ ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đĄ ', 'đĸ'), + ('đĸ', 'đĸ'), + ('\u{1f8b0}', '\u{1f8b1}'), + ('đ¤', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đŠ'), + ('đŠ ', 'đŠ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), + ('\u{1fb00}', '\u{1fb92}'), + ('\u{1fb94}', '\u{1fbca}'), +]; + +pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] = + &[('\u{2029}', '\u{2029}')]; + +pub const PRIVATE_USE: &'static [(char, char)] = &[ + ('\u{e000}', '\u{f8ff}'), + ('\u{f0000}', '\u{ffffd}'), + ('\u{100000}', '\u{10fffd}'), +]; + +pub const PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), + ('%', '*'), + (',', '/'), + (':', ';'), + ('?', '@'), + ('[', ']'), + ('_', '_'), + ('{', '{'), + ('}', '}'), + ('ÂĄ', 'ÂĄ'), + ('§', '§'), + ('ÂĢ', 'ÂĢ'), + ('Âļ', '¡'), + ('Âģ', 'Âģ'), + ('Âŋ', 'Âŋ'), + ('Íž', 'Íž'), + ('Î', 'Î'), + ('Õ', 'Õ'), + ('Ö', 'Ö'), + ('Öž', 'Öž'), + ('×', '×'), + ('×', '×'), + ('×', '×'), + ('×ŗ', '×´'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('ŲĒ', 'Ų'), + ('Û', 'Û'), + ('Ü', 'Ü'), + ('ߡ', 'ßš'), + ('ā °', 'ā ž'), + ('āĄ', 'āĄ'), + ('āĨ¤', 'āĨĨ'), + ('āĨ°', 'āĨ°'), + ('ā§Ŋ', 'ā§Ŋ'), + ('āŠļ', 'āŠļ'), + ('āĢ°', 'āĢ°'), + ('āąˇ', 'āąˇ'), + ('ā˛', 'ā˛'), + ('āˇ´', 'āˇ´'), + ('āš', 'āš'), + ('āš', 'āš'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧē', 'āŧŊ'), + ('āž
', 'āž
'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('á', 'á'), + ('áģ', 'áģ'), + ('á ', 'á¨'), + ('á', 'á'), + ('áŽ', 'áŽ'), + ('á', 'á'), + ('áĢ', 'á'), + ('áĩ', 'áļ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á '), + ('áĨ', 'áĨ
'), + ('á¨', 'á¨'), + ('áĒ ', 'áĒĻ'), + ('áĒ¨', 'áĒ'), + ('á', 'á '), + ('á¯ŧ', 'á¯ŋ'), + ('á°ģ', 'á°ŋ'), + ('áąž', 'áąŋ'), + ('áŗ', 'áŗ'), + ('áŗ', 'áŗ'), + ('â', 'â§'), + ('â°', 'â'), + ('â
', 'â'), + ('â', 'â'), + ('âŊ', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('âŠ', 'âĒ'), + ('â¨', 'âĩ'), + ('â
', 'â'), + ('âĻ', 'â¯'), + ('âĻ', 'âĻ'), + ('â§', 'â§'), + ('â§ŧ', 'â§Ŋ'), + ('âŗš', 'âŗŧ'), + ('âŗž', 'âŗŋ'), + ('âĩ°', 'âĩ°'), + ('â¸', '⸎'), + ('⸰', 'âš'), + ('\u{2e52}', '\u{2e52}'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã°', 'ã°'), + ('ãŊ', 'ãŊ'), + ('ã ', 'ã '), + ('ãģ', 'ãģ'), + ('ęž', 'ęŋ'), + ('ę', 'ę'), + ('ęŗ', 'ęŗ'), + ('ęž', 'ęž'), + ('ę˛', 'ęˇ'), + ('ꥴ', 'ꥡ'), + ('ęŖ', 'ęŖ'), + ('ęŖ¸', 'ęŖē'), + ('ęŖŧ', 'ęŖŧ'), + ('ꤎ', 'ę¤¯'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ęĢ°', 'ęĢą'), + ('ę¯Ģ', 'ę¯Ģ'), + ('ī´ž', 'ī´ŋ'), + ('ī¸', 'ī¸'), + ('ī¸°', 'īš'), + ('īš', 'īšĄ'), + ('īšŖ', 'īšŖ'), + ('īš¨', 'īš¨'), + ('īšĒ', 'īšĢ'), + ('īŧ', 'īŧ'), + ('īŧ
', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ '), + ('īŧģ', 'īŧŊ'), + ('īŧŋ', 'īŧŋ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊĨ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đĄ', 'đĄ'), + ('đ¤', 'đ¤'), + ('đ¤ŋ', 'đ¤ŋ'), + ('đŠ', 'đŠ'), + ('đŠŋ', 'đŠŋ'), + ('đĢ°', 'đĢļ'), + ('đŦš', 'đŦŋ'), + ('đŽ', 'đŽ'), + ('\u{10ead}', '\u{10ead}'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đģ', 'đŧ'), + ('đž', 'đ'), + ('đ
', 'đ
'), + ('đ
´', 'đ
ĩ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đŊ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('\u{1145a}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŦ'), + ('đŧ', 'đž'), + ('đ ģ', 'đ ģ'), + ('\u{11944}', '\u{11946}'), + ('đ§ĸ', 'đ§ĸ'), + ('đ¨ŋ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đĒ', 'đĒĸ'), + ('đą', 'đą
'), + ('đą°', 'đąą'), + ('đģˇ', 'đģ¸'), + ('đŋŋ', 'đŋŋ'), + ('đ°', 'đ´'), + ('đŠŽ', 'đŠ¯'), + ('đĢĩ', 'đĢĩ'), + ('đŦˇ', 'đŦģ'), + ('đ', 'đ'), + ('đē', 'đē'), + ('đŋĸ', 'đŋĸ'), + ('đ˛', 'đ˛'), + ('đĒ', 'đĒ'), + ('đĨ', 'đĨ'), +]; + +pub const SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACE_SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACING_MARK: &'static [(char, char)] = &[ + ('ā¤', 'ā¤'), + ('ā¤ģ', 'ā¤ģ'), + ('ā¤ž', 'āĨ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨ'), + ('āĻ', 'āĻ'), + ('\u{9be}', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā¨', 'ā¨'), + ('ā¨ž', 'āŠ'), + ('āĒ', 'āĒ'), + ('āĒž', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ', 'āĢ'), + ('āŦ', 'āŦ'), + ('\u{b3e}', '\u{b3e}'), + ('ā', 'ā'), + ('ā', 'ā'), + ('ā', 'ā'), + ('\u{b57}', '\u{b57}'), + ('\u{bbe}', 'āŽŋ'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('ā°', 'ā°'), + ('āą', 'āą'), + ('ā˛', 'ā˛'), + ('ā˛ž', 'ā˛ž'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('\u{cd5}', '\u{cd6}'), + ('ā´', 'ā´'), + ('\u{d3e}', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('\u{d57}', '\u{d57}'), + ('āļ', 'āļ'), + ('\u{dcf}', 'āˇ'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇŗ'), + ('āŧž', 'āŧŋ'), + ('āŊŋ', 'āŊŋ'), + ('áĢ', 'áŦ'), + ('áą', 'áą'), + ('á¸', 'á¸'), + ('áģ', 'áŧ'), + ('á', 'á'), + ('áĸ', 'á¤'), + ('á§', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('áļ', 'áļ'), + ('áž', 'á
'), + ('á', 'á'), + ('á¤Ŗ', 'á¤Ļ'), + ('ᤊ', 'á¤Ģ'), + ('ᤰ', '᤹'), + ('á¤ŗ', 'ᤸ'), + ('á¨', 'á¨'), + ('áŠ', 'áŠ'), + ('áŠ', 'áŠ'), + ('እ', 'እ'), + ('áŠŖ', 'ኤ'), + ('áŠ', 'ኲ'), + ('áŦ', 'áŦ'), + ('\u{1b35}', '\u{1b35}'), + ('áŦģ', 'áŦģ'), + ('áŦŊ', 'á'), + ('á', 'á'), + ('áŽ', 'áŽ'), + ('Ꭵ', 'Ꭵ'), + ('áŽĻ', 'Ꭷ'), + ('áŽĒ', 'áŽĒ'), + ('ᯧ', 'ᯧ'), + ('á¯Ē', 'á¯Ŧ'), + ('ᯎ', 'ᯎ'), + ('á¯˛', 'á¯ŗ'), + ('á°¤', 'á°Ģ'), + ('á°´', 'á°ĩ'), + ('áŗĄ', 'áŗĄ'), + ('áŗˇ', 'áŗˇ'), + ('\u{302e}', '\u{302f}'), + ('ę Ŗ', 'ę ¤'), + ('ę §', 'ę §'), + ('ęĸ', 'ęĸ'), + ('ęĸ´', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ęĻ', 'ęĻ'), + ('ęĻ´', 'ęĻĩ'), + ('ęĻē', 'ęĻģ'), + ('ęĻž', 'ę§'), + ('ę¨¯', 'ꨰ'), + ('ę¨ŗ', 'ꨴ'), + ('ęŠ', 'ęŠ'), + ('ęŠģ', 'ęŠģ'), + ('ęŠŊ', 'ęŠŊ'), + ('ęĢĢ', 'ęĢĢ'), + ('ęĢŽ', 'ęĢ¯'), + ('ęĢĩ', 'ęĢĩ'), + ('ę¯Ŗ', 'ę¯¤'), + ('ę¯Ļ', 'ę¯§'), + ('ę¯Š', 'ę¯Ē'), + ('ę¯Ŧ', 'ę¯Ŧ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ˛'), + ('đˇ', 'đ¸'), + ('đŦ', 'đŦ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đŗ', 'đĩ'), + ('đŋ', 'đ'), + ('\u{111ce}', '\u{111ce}'), + ('đŦ', 'đŽ'), + ('đ˛', 'đŗ'), + ('đĩ', 'đĩ'), + ('đ ', 'đĸ'), + ('đ', 'đ'), + ('\u{1133e}', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đĸ', 'đŖ'), + ('đĩ', 'đˇ'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('\u{114b0}', 'đ˛'), + ('đš', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('\u{115af}', 'đą'), + ('đ¸', 'đģ'), + ('đž', 'đž'), + ('đ°', 'đ˛'), + ('đģ', 'đŧ'), + ('đž', 'đž'), + ('đŦ', 'đŦ'), + ('đŽ', 'đ¯'), + ('đļ', 'đļ'), + ('đ ', 'đĄ'), + ('đĻ', 'đĻ'), + ('đ Ŧ', 'đ Ž'), + ('đ ¸', 'đ ¸'), + ('\u{11930}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193d}', '\u{1193d}'), + ('\u{11940}', '\u{11940}'), + ('\u{11942}', '\u{11942}'), + ('đ§', 'đ§'), + ('đ§', 'đ§'), + ('đ§¤', 'đ§¤'), + ('đ¨š', 'đ¨š'), + ('đŠ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đ°¯', 'đ°¯'), + ('đ°ž', 'đ°ž'), + ('đ˛Š', 'đ˛Š'), + ('đ˛ą', 'đ˛ą'), + ('đ˛´', 'đ˛´'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đģĩ', 'đģļ'), + ('đŊ', 'đž'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{1d165}', 'đ
Ļ'), + ('đ
', '\u{1d172}'), +]; + +pub const SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('`', '`'), + ('|', '|'), + ('~', '~'), + ('Âĸ', 'ÂĻ'), + ('¨', 'Š'), + ('ÂŦ', 'ÂŦ'), + ('ÂŽ', 'Âą'), + ('´', '´'), + ('¸', '¸'), + ('Ã', 'Ã'), + ('Ãˇ', 'Ãˇ'), + ('Ë', 'Ë
'), + ('Ë', 'Ë'), + ('ËĨ', 'ËĢ'), + ('Ë', 'Ë'), + ('˯', 'Ëŋ'), + ('Íĩ', 'Íĩ'), + ('Î', 'Î
'), + ('Īļ', 'Īļ'), + ('Ō', 'Ō'), + ('Ö', 'Ö'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Û', 'Û'), + ('ÛŠ', 'ÛŠ'), + ('ÛŊ', 'Ûž'), + ('ßļ', 'ßļ'), + ('ßž', 'ßŋ'), + ('ā§˛', 'ā§ŗ'), + ('ā§ē', 'ā§ģ'), + ('āĢą', 'āĢą'), + ('ā°', 'ā°'), + ('ā¯ŗ', 'ā¯ē'), + ('āąŋ', 'āąŋ'), + ('āĩ', 'āĩ'), + ('āĩš', 'āĩš'), + ('ā¸ŋ', 'ā¸ŋ'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('āŧ´', 'āŧ´'), + ('āŧļ', 'āŧļ'), + ('āŧ¸', 'āŧ¸'), + ('āžž', 'āŋ
'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('áĨ', 'áĨ'), + ('á§', 'á§ŋ'), + ('áĄ', 'áĒ'), + ('á´', 'áŧ'), + ('ážŊ', 'ážŊ'), + ('ážŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋŊ', 'áŋž'), + ('â', 'â'), + ('â', 'â'), + ('âē', 'âŧ'), + ('â', 'â'), + ('â ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'âŖ'), + ('âĨ', 'âĨ'), + ('â§', 'â§'), + ('âŠ', 'âŠ'), + ('âŽ', 'âŽ'), + ('âē', 'âģ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â¨'), + ('âĢ', 'âĻ'), + ('â', 'â'), + ('â', 'âŠ'), + ('â', 'â§'), + ('â', 'â'), + ('â', 'âĨ'), + ('â°', 'âĻ'), + ('âĻ', 'â§'), + ('â§', 'â§ģ'), + ('⧞', 'âŗ'), + ('âļ', 'âŽ'), + ('\u{2b97}', 'â¯ŋ'), + ('âŗĨ', 'âŗĒ'), + ('\u{2e50}', '\u{2e51}'), + ('âē', 'âē'), + ('âē', 'âģŗ'), + ('âŧ', 'âŋ'), + ('âŋ°', 'âŋģ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã ', 'ã '), + ('ãļ', 'ãˇ'), + ('ãž', 'ãŋ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ãŖ'), + ('ã', 'ã'), + ('ãĒ', 'ã'), + ('ã', 'ã'), + ('ã ', 'ãŋ'), + ('ã', 'ã°'), + ('ã', 'ãŋ'), + ('äˇ', 'äˇŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ęĄ'), + ('ę', 'ę'), + ('ę ¨', 'ę Ģ'), + ('ę ļ', 'ę š'), + ('ꊡ', 'ꊚ'), + ('ę', 'ę'), + ('\u{ab6a}', '\u{ab6b}'), + ('īŦŠ', 'īŦŠ'), + ('īŽ˛', 'ī¯'), + ('īˇŧ', 'īˇŊ'), + ('īšĸ', 'īšĸ'), + ('īš¤', 'īšĻ'), + ('īšŠ', 'īšŠ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧž', 'īŧž'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŋ ', 'īŋĻ'), + ('īŋ¨', 'īŋŽ'), + ('īŋŧ', 'īŋŊ'), + ('đˇ', 'đŋ'), + ('đ
š', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1019c}'), + ('đ ', 'đ '), + ('đ', 'đŧ'), + ('đĄˇ', 'đĄ¸'), + ('đĢ', 'đĢ'), + ('đŋ', 'đŋ'), + ('đŋ', 'đŋą'), + ('đŦŧ', 'đŦŋ'), + ('đ
', 'đ
'), + ('đ˛', 'đ˛'), + ('đ', 'đĩ'), + ('đ', 'đĻ'), + ('đŠ', 'đ
¤'), + ('đ
Ē', 'đ
Ŧ'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('đŽ', 'đ¨'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đģ', 'đģ'), + ('đ', 'đ'), + ('đĩ', 'đĩ'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đ', 'đ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('đ ', 'đ§ŋ'), + ('đ¨ˇ', 'đ¨ē'), + ('đŠ', 'đŠ´'), + ('đŠļ', 'đĒ'), + ('đĒ
', 'đĒ'), + ('đ
', 'đ
'), + ('đŋ', 'đŋ'), + ('đ˛Ŧ', 'đ˛Ŧ'), + ('đ˛°', 'đ˛°'), + ('đ´Ž', 'đ´Ž'), + ('đģ°', 'đģą'), + ('đ', 'đĢ'), + ('đ°', 'đ'), + ('đ ', 'đŽ'), + ('đą', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('\u{1f10d}', '\u{1f1ad}'), + ('đĻ', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đĨ'), + ('đ', '\u{1f6d7}'), + ('đ ', 'đŦ'), + ('đ°', '\u{1f6fc}'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đĢ'), + ('đ ', 'đ '), + ('đ ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đĄ ', 'đĸ'), + ('đĸ', 'đĸ'), + ('\u{1f8b0}', '\u{1f8b1}'), + ('đ¤', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đŠ'), + ('đŠ ', 'đŠ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), + ('\u{1fb00}', '\u{1fb92}'), + ('\u{1fb94}', '\u{1fbca}'), +]; + +pub const TITLECASE_LETTER: &'static [(char, char)] = &[ + ('Į
', 'Į
'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į˛', 'Į˛'), + ('áž', 'áž'), + ('áž', 'áž'), + ('ឨ', 'ឯ'), + ('ážŧ', 'ážŧ'), + ('áŋ', 'áŋ'), + ('áŋŧ', 'áŋŧ'), +]; + +pub const UNASSIGNED: &'static [(char, char)] = &[ + ('\u{378}', '\u{379}'), + ('\u{380}', '\u{383}'), + ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), + ('\u{3a2}', '\u{3a2}'), + ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), + ('\u{58b}', '\u{58c}'), + ('\u{590}', '\u{590}'), + ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ee}'), + ('\u{5f5}', '\u{5ff}'), + ('\u{61d}', '\u{61d}'), + ('\u{70e}', '\u{70e}'), + ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7fc}'), + ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), + ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{89f}'), + ('\u{8b5}', '\u{8b5}'), + ('\u{8c8}', '\u{8d2}'), + ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), + ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), + ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), + ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), + ('\u{9e4}', '\u{9e5}'), + ('\u{9ff}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), + ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), + ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), + ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), + ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), + ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), + ('\u{a77}', '\u{a80}'), + ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), + ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), + ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), + ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), + ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), + 
('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), + ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), + ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b54}'), + ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), + ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), + ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), + ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), + ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), + ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), + ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), + ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), + ('\u{c3a}', '\u{c3c}'), + ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), + ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5f}'), + ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c76}'), + ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), + ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), + ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdd}'), + ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), + ('\u{cf3}', '\u{cff}'), + ('\u{d0d}', '\u{d0d}'), + ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), + ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d80}'), + ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), + ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), + ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), + ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), + ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), + ('\u{e85}', '\u{e85}'), + ('\u{e8b}', '\u{e8b}'), 
+ ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), + ('\u{ebe}', '\u{ebf}'), + ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), + ('\u{ece}', '\u{ecf}'), + ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), + ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), + ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), + ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), + ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), + ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), + ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), + ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), + ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), + ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), + ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), + ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), + ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), + ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), + ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), + ('\u{170d}', '\u{170d}'), + ('\u{1715}', '\u{171f}'), + ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), + ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), + ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), + ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), + ('\u{180f}', '\u{180f}'), + ('\u{181a}', '\u{181f}'), + ('\u{1879}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), + ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), + ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), + ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), + ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), + ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), + ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), + ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), + ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), + ('\u{1ac1}', '\u{1aff}'), + ('\u{1b4c}', '\u{1b4f}'), + ('\u{1b7d}', '\u{1b7f}'), + 
('\u{1bf4}', '\u{1bfb}'), + ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), + ('\u{1c89}', '\u{1c8f}'), + ('\u{1cbb}', '\u{1cbc}'), + ('\u{1cc8}', '\u{1ccf}'), + ('\u{1cfb}', '\u{1cff}'), + ('\u{1dfa}', '\u{1dfa}'), + ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), + ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), + ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), + ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), + ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), + ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), + ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), + ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), + ('\u{2065}', '\u{2065}'), + ('\u{2072}', '\u{2073}'), + ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), + ('\u{20c0}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), + ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), + ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), + ('\u{2b96}', '\u{2b96}'), + ('\u{2c2f}', '\u{2c2f}'), + ('\u{2c5f}', '\u{2c5f}'), + ('\u{2cf4}', '\u{2cf8}'), + ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), + ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), + ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), + ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), + ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), + ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), + ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), + ('\u{2e53}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), + ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), + ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), + ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), + ('\u{3130}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), + ('\u{31e4}', '\u{31ef}'), + ('\u{321f}', '\u{321f}'), + ('\u{9ffd}', '\u{9fff}'), + ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), + ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), + ('\u{a7c0}', '\u{a7c1}'), + ('\u{a7cb}', '\u{a7f4}'), + 
('\u{a82d}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), + ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), + ('\u{a8da}', '\u{a8df}'), + ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), + ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), + ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), + ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), + ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), + ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), + ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), + ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab6c}', '\u{ab6f}'), + ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), + ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), + ('\u{d7fc}', '\u{d7ff}'), + ('\u{fa6e}', '\u{fa6f}'), + ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), + ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), + ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), + ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), + ('\u{fbc2}', '\u{fbd2}'), + ('\u{fd40}', '\u{fd4f}'), + ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdef}'), + ('\u{fdfe}', '\u{fdff}'), + ('\u{fe1a}', '\u{fe1f}'), + ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), + ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), + ('\u{fefd}', '\u{fefe}'), + ('\u{ff00}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fff8}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), + ('\u{1019d}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), + 
('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), + ('\u{10570}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), + ('\u{10a36}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a49}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d28}', '\u{10d2f}'), + ('\u{10d3a}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10e7f}'), + ('\u{10eaa}', '\u{10eaa}'), + ('\u{10eae}', '\u{10eaf}'), + ('\u{10eb2}', '\u{10eff}'), + ('\u{10f28}', '\u{10f2f}'), + ('\u{10f5a}', '\u{10faf}'), + ('\u{10fcc}', '\u{10fdf}'), + ('\u{10ff7}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), + ('\u{11070}', '\u{1107e}'), + ('\u{110c2}', '\u{110cc}'), + ('\u{110ce}', '\u{110cf}'), + ('\u{110e9}', 
'\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), + ('\u{11135}', '\u{11135}'), + ('\u{11148}', '\u{1114f}'), + ('\u{11177}', '\u{1117f}'), + ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), + ('\u{11212}', '\u{11212}'), + ('\u{1123f}', '\u{1127f}'), + ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), + ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), + ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), + ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), + ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), + ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), + ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133a}'), + ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), + ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), + ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), + ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), + ('\u{1145c}', '\u{1145c}'), + ('\u{11462}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), + ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), + ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), + ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), + ('\u{116b9}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), + ('\u{1171b}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), + ('\u{11740}', '\u{117ff}'), + ('\u{1183c}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), + ('\u{11907}', '\u{11908}'), + ('\u{1190a}', '\u{1190b}'), + ('\u{11914}', '\u{11914}'), + ('\u{11917}', '\u{11917}'), + ('\u{11936}', '\u{11936}'), + ('\u{11939}', '\u{1193a}'), + ('\u{11947}', '\u{1194f}'), + ('\u{1195a}', '\u{1199f}'), + ('\u{119a8}', '\u{119a9}'), + ('\u{119d8}', '\u{119d9}'), + ('\u{119e5}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), + ('\u{11aa3}', '\u{11abf}'), + ('\u{11af9}', '\u{11bff}'), + ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), + ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), + ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), + 
('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), + ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), + ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), + ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11d5f}'), + ('\u{11d66}', '\u{11d66}'), + ('\u{11d69}', '\u{11d69}'), + ('\u{11d8f}', '\u{11d8f}'), + ('\u{11d92}', '\u{11d92}'), + ('\u{11d99}', '\u{11d9f}'), + ('\u{11daa}', '\u{11edf}'), + ('\u{11ef9}', '\u{11faf}'), + ('\u{11fb1}', '\u{11fbf}'), + ('\u{11ff2}', '\u{11ffe}'), + ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), + ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12fff}'), + ('\u{1342f}', '\u{1342f}'), + ('\u{13439}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), + ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), + ('\u{16a6a}', '\u{16a6d}'), + ('\u{16a70}', '\u{16acf}'), + ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), + ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), + ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), + ('\u{16b90}', '\u{16e3f}'), + ('\u{16e9b}', '\u{16eff}'), + ('\u{16f4b}', '\u{16f4e}'), + ('\u{16f88}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), + ('\u{16fe5}', '\u{16fef}'), + ('\u{16ff2}', '\u{16fff}'), + ('\u{187f8}', '\u{187ff}'), + ('\u{18cd6}', '\u{18cff}'), + ('\u{18d09}', '\u{1afff}'), + ('\u{1b11f}', '\u{1b14f}'), + ('\u{1b153}', '\u{1b163}'), + ('\u{1b168}', '\u{1b16f}'), + ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), + ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), + ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca4}', '\u{1cfff}'), + ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), + ('\u{1d1e9}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2df}'), + ('\u{1d2f4}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), + ('\u{1d379}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', 
'\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e0ff}'), + ('\u{1e12d}', '\u{1e12f}'), + ('\u{1e13e}', '\u{1e13f}'), + ('\u{1e14a}', '\u{1e14d}'), + ('\u{1e150}', '\u{1e2bf}'), + ('\u{1e2fa}', '\u{1e2fe}'), + ('\u{1e300}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94c}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1ec70}'), + ('\u{1ecb5}', '\u{1ed00}'), + ('\u{1ed3e}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), + 
('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f1ae}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d8}', '\u{1f6df}'), + ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6fd}', '\u{1f6ff}'), + ('\u{1f774}', '\u{1f77f}'), + ('\u{1f7d9}', '\u{1f7df}'), + ('\u{1f7ec}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8af}'), + ('\u{1f8b2}', '\u{1f8ff}'), + ('\u{1f979}', '\u{1f979}'), + ('\u{1f9cc}', '\u{1f9cc}'), + ('\u{1fa54}', '\u{1fa5f}'), + ('\u{1fa6e}', '\u{1fa6f}'), + ('\u{1fa75}', '\u{1fa77}'), + ('\u{1fa7b}', '\u{1fa7f}'), + ('\u{1fa87}', '\u{1fa8f}'), + ('\u{1faa9}', '\u{1faaf}'), + ('\u{1fab7}', '\u{1fabf}'), + ('\u{1fac3}', '\u{1facf}'), + ('\u{1fad7}', '\u{1faff}'), + ('\u{1fb93}', '\u{1fb93}'), + ('\u{1fbcb}', '\u{1fbef}'), + ('\u{1fbfa}', '\u{1ffff}'), + ('\u{2a6de}', '\u{2a6ff}'), + ('\u{2b735}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{2ffff}'), + ('\u{3134b}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const UPPERCASE_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('Ã', 'Ã'), + ('Ã', 'Ã'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä ', 'Ä '), + ('Äĸ', 'Äĸ'), + ('Ĥ', 'Ĥ'), + ('ÄĻ', 'ÄĻ'), + ('Ĩ', 'Ĩ'), + ('ÄĒ', 'ÄĒ'), + ('ÄŦ', 'ÄŦ'), + 
('ÄŽ', 'ÄŽ'), + ('Ä°', 'Ä°'), + ('IJ', 'IJ'), + ('Ä´', 'Ä´'), + ('Äļ', 'Äļ'), + ('Äš', 'Äš'), + ('Äģ', 'Äģ'), + ('ÄŊ', 'ÄŊ'), + ('Äŋ', 'Äŋ'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å
', 'Å
'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å ', 'Å '), + ('Åĸ', 'Åĸ'), + ('Ť', 'Ť'), + ('ÅĻ', 'ÅĻ'), + ('Ũ', 'Ũ'), + ('ÅĒ', 'ÅĒ'), + ('ÅŦ', 'ÅŦ'), + ('ÅŽ', 'ÅŽ'), + ('Å°', 'Å°'), + ('Å˛', 'Å˛'), + ('Å´', 'Å´'), + ('Åļ', 'Åļ'), + ('Ÿ', 'Åš'), + ('Åģ', 'Åģ'), + ('ÅŊ', 'ÅŊ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ '), + ('Æĸ', 'Æĸ'), + ('Ƥ', 'Ƥ'), + ('ÆĻ', 'Ƨ'), + ('ÆŠ', 'ÆŠ'), + ('ÆŦ', 'ÆŦ'), + ('ÆŽ', 'Ư'), + ('Æą', 'Æŗ'), + ('Æĩ', 'Æĩ'), + ('Æˇ', 'Ƹ'), + ('Æŧ', 'Æŧ'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į ', 'Į '), + ('Įĸ', 'Įĸ'), + ('Į¤', 'Į¤'), + ('ĮĻ', 'ĮĻ'), + ('Į¨', 'Į¨'), + ('ĮĒ', 'ĮĒ'), + ('ĮŦ', 'ĮŦ'), + ('ĮŽ', 'ĮŽ'), + ('Įą', 'Įą'), + ('Į´', 'Į´'), + ('Įļ', 'Į¸'), + ('Įē', 'Įē'), + ('Įŧ', 'Įŧ'), + ('Įž', 'Įž'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č ', 'Č '), + ('Čĸ', 'Čĸ'), + ('Ȥ', 'Ȥ'), + ('ČĻ', 'ČĻ'), + ('Ȩ', 'Ȩ'), + ('ČĒ', 'ČĒ'), + ('ČŦ', 'ČŦ'), + ('ČŽ', 'ČŽ'), + ('Č°', 'Č°'), + ('Ȳ', 'Ȳ'), + ('Čē', 'Čģ'), + ('ČŊ', 'Čž'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('Í°', 'Í°'), + ('Ͳ', 'Ͳ'), + ('Íļ', 'Íļ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ÎĢ'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī ', 'Ī '), + ('Īĸ', 'Īĸ'), + ('Ī¤', 'Ī¤'), + ('ĪĻ', 'ĪĻ'), + ('Ī¨', 'Ī¨'), + ('ĪĒ', 'ĪĒ'), + ('ĪŦ', 'ĪŦ'), + ('ĪŽ', 'ĪŽ'), + ('Ī´', 'Ī´'), + ('Īˇ', 'Īˇ'), + ('Īš', 'Īē'), + ('ĪŊ', 'Đ¯'), 
+ ('Ņ ', 'Ņ '), + ('Ņĸ', 'Ņĸ'), + ('Ņ¤', 'Ņ¤'), + ('ŅĻ', 'ŅĻ'), + ('Ņ¨', 'Ņ¨'), + ('ŅĒ', 'ŅĒ'), + ('ŅŦ', 'ŅŦ'), + ('ŅŽ', 'ŅŽ'), + ('Ņ°', 'Ņ°'), + ('Ņ˛', 'Ņ˛'), + ('Ņ´', 'Ņ´'), + ('Ņļ', 'Ņļ'), + ('Ņ¸', 'Ņ¸'), + ('Ņē', 'Ņē'), + ('Ņŧ', 'Ņŧ'), + ('Ņž', 'Ņž'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō ', 'Ō '), + ('Ōĸ', 'Ōĸ'), + ('Ō¤', 'Ō¤'), + ('ŌĻ', 'ŌĻ'), + ('Ō¨', 'Ō¨'), + ('ŌĒ', 'ŌĒ'), + ('ŌŦ', 'ŌŦ'), + ('ŌŽ', 'ŌŽ'), + ('Ō°', 'Ō°'), + ('Ō˛', 'Ō˛'), + ('Ō´', 'Ō´'), + ('Ōļ', 'Ōļ'), + ('Ō¸', 'Ō¸'), + ('Ōē', 'Ōē'), + ('Ōŧ', 'Ōŧ'), + ('Ōž', 'Ōž'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ
', 'Ķ
'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ ', 'Ķ '), + ('Ķĸ', 'Ķĸ'), + ('Ķ¤', 'Ķ¤'), + ('ĶĻ', 'ĶĻ'), + ('Ķ¨', 'Ķ¨'), + ('ĶĒ', 'ĶĒ'), + ('ĶŦ', 'ĶŦ'), + ('ĶŽ', 'ĶŽ'), + ('Ķ°', 'Ķ°'), + ('Ķ˛', 'Ķ˛'), + ('Ķ´', 'Ķ´'), + ('Ķļ', 'Ķļ'), + ('Ķ¸', 'Ķ¸'), + ('Ķē', 'Ķē'), + ('Ķŧ', 'Ķŧ'), + ('Ķž', 'Ķž'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô ', 'Ô '), + ('Ôĸ', 'Ôĸ'), + ('Ô¤', 'Ô¤'), + ('ÔĻ', 'ÔĻ'), + ('Ô¨', 'Ô¨'), + ('ÔĒ', 'ÔĒ'), + ('ÔŦ', 'ÔŦ'), + ('ÔŽ', 'ÔŽ'), + ('Ôą', 'Õ'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('Ḡ', 'Ḡ'), + ('á¸ĸ', 'á¸ĸ'), + ('Ḥ', 'Ḥ'), + ('á¸Ļ', 'á¸Ļ'), + ('Ḩ', 'Ḩ'), + ('á¸Ē', 'á¸Ē'), + ('á¸Ŧ', 'á¸Ŧ'), + ('Ḏ', 'Ḏ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('á¸ļ', 'á¸ļ'), + ('Ḹ', 'Ḹ'), + ('á¸ē', 'á¸ē'), + ('á¸ŧ', 'á¸ŧ'), + ('Ḟ', 'Ḟ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš ', 'áš '), + ('ášĸ', 'ášĸ'), + ('ᚤ', 'ᚤ'), + ('ášĻ', 'ášĻ'), + ('ᚨ', 'ᚨ'), + ('ášĒ', 'ášĒ'), + ('ášŦ', 'ášŦ'), + ('ᚎ', 'ᚎ'), + ('áš°', 'áš°'), + ('ᚲ', 'ᚲ'), + ('áš´', 'áš´'), + ('ášļ', 'ášļ'), + ('ᚸ', 'ᚸ'), + ('ášē', 'ášē'), + ('ášŧ', 'ášŧ'), + ('ášž', 'ášž'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē ', 'áē '), + ('áēĸ', 'áēĸ'), + ('áē¤', 'áē¤'), + ('áēĻ', 'áēĻ'), + ('áē¨', 'áē¨'), + ('áēĒ', 'áēĒ'), + ('áēŦ', 'áēŦ'), + ('áēŽ', 'áēŽ'), + ('áē°', 'áē°'), + ('áē˛', 'áē˛'), + ('áē´', 'áē´'), + ('áēļ', 'áēļ'), + ('áē¸', 'áē¸'), + ('áēē', 'áēē'), + ('áēŧ', 'áēŧ'), + ('áēž', 'áēž'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ ', 'áģ '), + ('áģĸ', 'áģĸ'), + ('áģ¤', 'áģ¤'), + ('áģĻ', 'áģĻ'), + ('áģ¨', 'áģ¨'), + ('áģĒ', 'áģĒ'), + ('áģŦ', 'áģŦ'), + ('áģŽ', 'áģŽ'), + 
('áģ°', 'áģ°'), + ('áģ˛', 'áģ˛'), + ('áģ´', 'áģ´'), + ('áģļ', 'áģļ'), + ('áģ¸', 'áģ¸'), + ('áģē', 'áģē'), + ('áģŧ', 'áģŧ'), + ('áģž', 'áģž'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ¨', 'áŧ¯'), + ('áŧ¸', 'áŧŋ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ¨', 'áŊ¯'), + ('ី', 'ážģ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ¨', 'áŋŦ'), + ('áŋ¸', 'áŋģ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â°', 'âŗ'), + ('âž', 'âŋ'), + ('â
', 'â
'), + ('â', 'â'), + ('â°', 'â°Ž'), + ('âą ', 'âą '), + ('âąĸ', '⹤'), + ('⹧', '⹧'), + ('⹊', '⹊'), + ('âąĢ', 'âąĢ'), + ('âą', 'âą°'), + ('⹲', '⹲'), + ('âąĩ', 'âąĩ'), + ('âąž', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('Ⲡ', 'Ⲡ'), + ('â˛ĸ', 'â˛ĸ'), + ('Ⲥ', 'Ⲥ'), + ('â˛Ļ', 'â˛Ļ'), + ('Ⲩ', 'Ⲩ'), + ('â˛Ē', 'â˛Ē'), + ('â˛Ŧ', 'â˛Ŧ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('â˛ļ', 'â˛ļ'), + ('Ⲹ', 'Ⲹ'), + ('â˛ē', 'â˛ē'), + ('â˛ŧ', 'â˛ŧ'), + ('Ⲟ', 'Ⲟ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ ', 'âŗ '), + ('âŗĸ', 'âŗĸ'), + ('âŗĢ', 'âŗĢ'), + ('âŗ', 'âŗ'), + ('âŗ˛', 'âŗ˛'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ę˛', 'ę˛'), + ('ę´', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + 
('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęŽ'), + ('ę°', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', '\u{a7c7}'), + ('\u{a7c9}', '\u{a7c9}'), + ('\u{a7f5}', '\u{a7f5}'), + ('īŧĄ', 'īŧē'), + ('đ', 'đ§'), + ('đ°', 'đ'), + ('đ˛', 'đ˛˛'), + ('đĸ ', 'đĸŋ'), + ('đš', 'đš'), + ('đ', 'đ'), + ('đ´', 'đ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đĩ'), + ('đ', 'đŠ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŦ', 'đ
'), + ('đ ', 'đš'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŧ', 'đ'), + ('đ°', 'đ'), + ('đ¨', 'đ'), + ('đĸ', 'đē'), + ('đ', 'đ´'), + ('đ', 'đŽ'), + ('đ', 'đ¨'), + ('đ', 'đ'), + ('đ¤', 'đ¤Ą'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs b/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs new file mode 100644 index 000000000..7df9d2b93 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs @@ -0,0 +1,1389 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate grapheme-cluster-break ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("CR", CR), + ("Control", CONTROL), + ("Extend", EXTEND), + ("L", L), + ("LF", LF), + ("LV", LV), + ("LVT", LVT), + ("Prepend", PREPEND), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("SpacingMark", SPACINGMARK), + ("T", T), + ("V", V), + ("ZWJ", ZWJ), +]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const CONTROL: &'static [(char, char)] = &[ + ('\u{0}', '\t'), + ('\u{b}', '\u{c}'), + ('\u{e}', '\u{1f}'), + ('\u{7f}', '\u{9f}'), + ('\u{ad}', '\u{ad}'), + ('\u{61c}', '\u{61c}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200b}'), + ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff0}', '\u{fffb}'), + ('\u{13430}', '\u{13438}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0000}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', 
'\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9be}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + 
('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + 
('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{11001}', 
'\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11340}', '\u{11340}'), + ('\u{11357}', '\u{11357}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115af}', '\u{115af}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{11930}', '\u{11930}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + 
('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('đģ', 'đŋ'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const L: &'static [(char, char)] = &[('á', 'á
'), ('ęĨ ', 'ęĨŧ')]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const LV: &'static [(char, char)] = &[ + ('ę°', 'ę°'), + ('ę°', 'ę°'), + ('ę°¸', 'ę°¸'), + ('ęą', 'ęą'), + ('ęą°', 'ęą°'), + ('ę˛', 'ę˛'), + ('겨', '겨'), + ('ęŗ', 'ęŗ'), + ('ęŗ ', 'ęŗ '), + ('ęŗŧ', 'ęŗŧ'), + ('ę´', 'ę´'), + ('ę´´', 'ę´´'), + ('ęĩ', 'ęĩ'), + ('ęĩŦ', 'ęĩŦ'), + ('ęļ', 'ęļ'), + ('ęļ¤', 'ęļ¤'), + ('ęˇ', 'ęˇ'), + ('ęˇ', 'ęˇ'), + ('꡸', '꡸'), + ('ę¸', 'ę¸'), + ('기', '기'), + ('ęš', 'ęš'), + ('ꚨ', 'ꚨ'), + ('ęē', 'ęē'), + ('ęē ', 'ęē '), + ('ęēŧ', 'ęēŧ'), + ('ęģ', 'ęģ'), + ('ęģ´', 'ęģ´'), + ('ęŧ', 'ęŧ'), + ('ęŧŦ', 'ęŧŦ'), + ('ęŊ', 'ęŊ'), + ('ęŊ¤', 'ęŊ¤'), + ('ęž', 'ęž'), + ('ęž', 'ęž'), + ('Ꞹ', 'Ꞹ'), + ('ęŋ', 'ęŋ'), + ('ęŋ°', 'ęŋ°'), + ('ë', 'ë'), + ('ë¨', 'ë¨'), + ('ë', 'ë'), + ('ë ', 'ë '), + ('ëŧ', 'ëŧ'), + ('ë', 'ë'), + ('ë´', 'ë´'), + ('ë', 'ë'), + ('ëŦ', 'ëŦ'), + ('ë', 'ë'), + ('ë¤', 'ë¤'), + ('ë
', 'ë
'), + ('ë
', 'ë
'), + ('ë
¸', 'ë
¸'), + ('ë', 'ë'), + ('ë°', 'ë°'), + ('ë', 'ë'), + ('ë¨', 'ë¨'), + ('ë', 'ë'), + ('ë ', 'ë '), + ('ëŧ', 'ëŧ'), + ('ë', 'ë'), + ('ë´', 'ë´'), + ('ë', 'ë'), + ('ëŦ', 'ëŦ'), + ('ë', 'ë'), + ('ë¤', 'ë¤'), + ('ë', 'ë'), + ('ë', 'ë'), + ('ë¸', 'ë¸'), + ('ë', 'ë'), + ('ë°', 'ë°'), + ('ë', 'ë'), + ('ë¨', 'ë¨'), + ('ë', 'ë'), + ('ë ', 'ë '), + ('ëŧ', 'ëŧ'), + ('ë', 'ë'), + ('ë´', 'ë´'), + ('ë', 'ë'), + ('ëŦ', 'ëŦ'), + ('ë', 'ë'), + ('ë¤', 'ë¤'), + ('ë', 'ë'), + ('ë', 'ë'), + ('ë¸', 'ë¸'), + ('ë', 'ë'), + ('ë°', 'ë°'), + ('ë', 'ë'), + ('ë¨', 'ë¨'), + ('ë', 'ë'), + ('ë ', 'ë '), + ('ëŧ', 'ëŧ'), + ('ë', 'ë'), + ('ë´', 'ë´'), + ('ë', 'ë'), + ('ëŦ', 'ëŦ'), + ('ë', 'ë'), + ('ë¤', 'ë¤'), + ('ë', 'ë'), + ('ë', 'ë'), + ('ë¸', 'ë¸'), + ('ë', 'ë'), + ('ë°', 'ë°'), + ('ë', 'ë'), + ('ë¨', 'ë¨'), + ('ë', 'ë'), + ('ë ', 'ë '), + ('ëŧ', 'ëŧ'), + ('ë', 'ë'), + ('ë´', 'ë´'), + ('ë', 'ë'), + ('ëŦ', 'ëŦ'), + ('ë ', 'ë '), + ('ë ¤', 'ë ¤'), + ('ëĄ', 'ëĄ'), + ('ëĄ', 'ëĄ'), + ('른', '른'), + ('ëĸ', 'ëĸ'), + ('ëĸ°', 'ëĸ°'), + ('ëŖ', 'ëŖ'), + ('ëŖ¨', 'ëŖ¨'), + ('ë¤', 'ë¤'), + ('ë¤ ', 'ë¤ '), + ('ë¤ŧ', 'ë¤ŧ'), + ('ëĨ', 'ëĨ'), + ('ëĨ´', 'ëĨ´'), + ('ëĻ', 'ëĻ'), + ('ëĻŦ', 'ëĻŦ'), + ('ë§', 'ë§'), + ('매', '매'), + ('ë¨', 'ë¨'), + ('ë¨', 'ë¨'), + ('머', '머'), + ('ëŠ', 'ëŠ'), + ('늰', '늰'), + ('ëĒ', 'ëĒ'), + ('ëĒ¨', 'ëĒ¨'), + ('ëĢ', 'ëĢ'), + ('ëĢ ', 'ëĢ '), + ('ëĢŧ', 'ëĢŧ'), + ('ëŦ', 'ëŦ'), + ('ëŦ´', 'ëŦ´'), + ('ë', 'ë'), + ('ëŦ', 'ëŦ'), + ('ëŽ', 'ëŽ'), + ('뎤', '뎤'), + ('ë¯', 'ë¯'), + ('ë¯', 'ë¯'), + ('미', '미'), + ('ë°', 'ë°'), + ('ë°°', 'ë°°'), + ('ëą', 'ëą'), + ('빨', '빨'), + ('ë˛', 'ë˛'), + ('ë˛ ', 'ë˛ '), + ('ë˛ŧ', 'ë˛ŧ'), + ('ëŗ', 'ëŗ'), + ('ëŗ´', 'ëŗ´'), + ('ë´', 'ë´'), + ('ë´Ŧ', 'ë´Ŧ'), + ('ëĩ', 'ëĩ'), + ('ëĩ¤', 'ëĩ¤'), + ('ëļ', 'ëļ'), + ('ëļ', 'ëļ'), + ('ëļ¸', 'ëļ¸'), + ('ëˇ', 'ëˇ'), + ('롰', '롰'), + ('ë¸', 'ë¸'), + ('븨', '븨'), + ('ëš', 'ëš'), + ('ëš ', 'ëš '), + ('ëšŧ', 'ëšŧ'), + ('ëē', 'ëē'), + ('ëē´', 'ëē´'), + ('ëģ', 'ëģ'), + ('ëģŦ', 'ëģŦ'), + ('ëŧ', 'ëŧ'), + ('ëŧ¤', 'ëŧ¤'), + ('ëŊ', 'ëŊ'), + ('ëŊ', 
'ëŊ'), + ('ëŊ¸', 'ëŊ¸'), + ('ëž', 'ëž'), + ('ëž°', 'ëž°'), + ('ëŋ', 'ëŋ'), + ('ëŋ¨', 'ëŋ¨'), + ('ė', 'ė'), + ('ė ', 'ė '), + ('ėŧ', 'ėŧ'), + ('ė', 'ė'), + ('ė´', 'ė´'), + ('ė', 'ė'), + ('ėŦ', 'ėŦ'), + ('ė', 'ė'), + ('ė¤', 'ė¤'), + ('ė', 'ė'), + ('ė', 'ė'), + ('ė¸', 'ė¸'), + ('ė
', 'ė
'), + ('ė
°', 'ė
°'), + ('ė', 'ė'), + ('ė¨', 'ė¨'), + ('ė', 'ė'), + ('ė ', 'ė '), + ('ėŧ', 'ėŧ'), + ('ė', 'ė'), + ('ė´', 'ė´'), + ('ė', 'ė'), + ('ėŦ', 'ėŦ'), + ('ė', 'ė'), + ('ė¤', 'ė¤'), + ('ė', 'ė'), + ('ė', 'ė'), + ('ė¸', 'ė¸'), + ('ė', 'ė'), + ('ė°', 'ė°'), + ('ė', 'ė'), + ('ė¨', 'ė¨'), + ('ė', 'ė'), + ('ė ', 'ė '), + ('ėŧ', 'ėŧ'), + ('ė', 'ė'), + ('ė´', 'ė´'), + ('ė', 'ė'), + ('ėŦ', 'ėŦ'), + ('ė', 'ė'), + ('ė¤', 'ė¤'), + ('ė', 'ė'), + ('ė', 'ė'), + ('ė¸', 'ė¸'), + ('ė', 'ė'), + ('ė°', 'ė°'), + ('ė', 'ė'), + ('ė¨', 'ė¨'), + ('ė', 'ė'), + ('ė ', 'ė '), + ('ėŧ', 'ėŧ'), + ('ė', 'ė'), + ('ė´', 'ė´'), + ('ė', 'ė'), + ('ėŦ', 'ėŦ'), + ('ė', 'ė'), + ('ė¤', 'ė¤'), + ('ė', 'ė'), + ('ė', 'ė'), + ('ė¸', 'ė¸'), + ('ė', 'ė'), + ('ė°', 'ė°'), + ('ė', 'ė'), + ('ė¨', 'ė¨'), + ('ė', 'ė'), + ('ė ', 'ė '), + ('ėŧ', 'ėŧ'), + ('ė', 'ė'), + ('ė´', 'ė´'), + ('ė', 'ė'), + ('ėŦ', 'ėŦ'), + ('ė', 'ė'), + ('ė¤', 'ė¤'), + ('ė ', 'ė '), + ('ė ', 'ė '), + ('ė ¸', 'ė ¸'), + ('ėĄ', 'ėĄ'), + ('ėĄ°', 'ėĄ°'), + ('ėĸ', 'ėĸ'), + ('ėĸ¨', 'ėĸ¨'), + ('ėŖ', 'ėŖ'), + ('ėŖ ', 'ėŖ '), + ('ėŖŧ', 'ėŖŧ'), + ('ė¤', 'ė¤'), + ('ė¤´', 'ė¤´'), + ('ėĨ', 'ėĨ'), + ('ėĨŦ', 'ėĨŦ'), + ('ėĻ', 'ėĻ'), + ('ėĻ¤', 'ėĻ¤'), + ('ė§', 'ė§'), + ('ė§', 'ė§'), + ('ė§¸', 'ė§¸'), + ('ė¨', 'ė¨'), + ('ė¨°', 'ė¨°'), + ('ėŠ', 'ėŠ'), + ('ėŠ¨', 'ėŠ¨'), + ('ėĒ', 'ėĒ'), + ('ėĒ ', 'ėĒ '), + ('ėĒŧ', 'ėĒŧ'), + ('ėĢ', 'ėĢ'), + ('ėĢ´', 'ėĢ´'), + ('ėŦ', 'ėŦ'), + ('ėŦŦ', 'ėŦŦ'), + ('ė', 'ė'), + ('ė¤', 'ė¤'), + ('ėŽ', 'ėŽ'), + ('ėŽ', 'ėŽ'), + ('ėŽ¸', 'ėŽ¸'), + ('ė¯', 'ė¯'), + ('ė¯°', 'ė¯°'), + ('ė°', 'ė°'), + ('ė°¨', 'ė°¨'), + ('ėą', 'ėą'), + ('ėą ', 'ėą '), + ('ėąŧ', 'ėąŧ'), + ('ė˛', 'ė˛'), + ('ė˛´', 'ė˛´'), + ('ėŗ', 'ėŗ'), + ('ėŗŦ', 'ėŗŦ'), + ('ė´', 'ė´'), + ('ė´¤', 'ė´¤'), + ('ėĩ', 'ėĩ'), + ('ėĩ', 'ėĩ'), + ('ėĩ¸', 'ėĩ¸'), + ('ėļ', 'ėļ'), + ('ėļ°', 'ėļ°'), + ('ėˇ', 'ėˇ'), + ('ėˇ¨', 'ėˇ¨'), + ('ė¸', 'ė¸'), + ('ė¸ ', 'ė¸ '), + ('ė¸ŧ', 'ė¸ŧ'), + ('ėš', 'ėš'), + ('ėš´', 'ėš´'), + ('ėē', 'ėē'), + ('ėēŦ', 'ėēŦ'), + ('ėģ', 'ėģ'), + ('ėģ¤', 'ėģ¤'), + ('ėŧ', 'ėŧ'), + ('ėŧ', 
'ėŧ'), + ('ėŧ¸', 'ėŧ¸'), + ('ėŊ', 'ėŊ'), + ('ėŊ°', 'ėŊ°'), + ('ėž', 'ėž'), + ('ėž¨', 'ėž¨'), + ('ėŋ', 'ėŋ'), + ('ėŋ ', 'ėŋ '), + ('ėŋŧ', 'ėŋŧ'), + ('í', 'í'), + ('í´', 'í´'), + ('í', 'í'), + ('íŦ', 'íŦ'), + ('í', 'í'), + ('í¤', 'í¤'), + ('í', 'í'), + ('í', 'í'), + ('í¸', 'í¸'), + ('í', 'í'), + ('í°', 'í°'), + ('í
', 'í
'), + ('í
¨', 'í
¨'), + ('í', 'í'), + ('í ', 'í '), + ('íŧ', 'íŧ'), + ('í', 'í'), + ('í´', 'í´'), + ('í', 'í'), + ('íŦ', 'íŦ'), + ('í', 'í'), + ('í¤', 'í¤'), + ('í', 'í'), + ('í', 'í'), + ('í¸', 'í¸'), + ('í', 'í'), + ('í°', 'í°'), + ('í', 'í'), + ('í¨', 'í¨'), + ('í', 'í'), + ('í ', 'í '), + ('íŧ', 'íŧ'), + ('í', 'í'), + ('í´', 'í´'), + ('í', 'í'), + ('íŦ', 'íŦ'), + ('í', 'í'), + ('í¤', 'í¤'), + ('í', 'í'), + ('í', 'í'), + ('í¸', 'í¸'), + ('í', 'í'), + ('í°', 'í°'), + ('í', 'í'), + ('í¨', 'í¨'), + ('í', 'í'), + ('í ', 'í '), + ('íŧ', 'íŧ'), + ('í', 'í'), + ('í´', 'í´'), + ('í', 'í'), + ('íŦ', 'íŦ'), + ('í', 'í'), + ('í¤', 'í¤'), + ('í', 'í'), + ('í', 'í'), + ('í¸', 'í¸'), + ('í', 'í'), + ('í°', 'í°'), + ('í', 'í'), + ('í¨', 'í¨'), + ('í', 'í'), + ('í ', 'í '), + ('íŧ', 'íŧ'), + ('í', 'í'), + ('í´', 'í´'), + ('í', 'í'), + ('íŦ', 'íŦ'), + ('í', 'í'), +]; + +pub const LVT: &'static [(char, char)] = &[ + ('ę°', 'ę°'), + ('ę°', 'ę°ˇ'), + ('ę°š', 'ęą'), + ('ęą', 'ęą¯'), + ('ęąą', 'ę˛'), + ('ę˛', '겧'), + ('겊', 'ęŗ'), + ('ęŗ
', 'ęŗ'), + ('ęŗĄ', 'ęŗģ'), + ('ęŗŊ', 'ę´'), + ('ę´', 'ę´ŗ'), + ('ę´ĩ', 'ęĩ'), + ('ęĩ', 'ęĩĢ'), + ('ęĩ', 'ęļ'), + ('ęļ', 'ęļŖ'), + ('ęļĨ', 'ęļŋ'), + ('ęˇ', 'ęˇ'), + ('ęˇ', 'ꡡ'), + ('ꡚ', 'ę¸'), + ('ę¸', 'ę¸¯'), + ('긹', 'ęš'), + ('ęš', 'ꚧ'), + ('Ꚋ', 'ęē'), + ('ęē
', 'ęē'), + ('ęēĄ', 'ęēģ'), + ('ęēŊ', 'ęģ'), + ('ęģ', 'ęģŗ'), + ('ęģĩ', 'ęŧ'), + ('ęŧ', 'ęŧĢ'), + ('ęŧ', 'ęŊ'), + ('ęŊ', 'ęŊŖ'), + ('ęŊĨ', 'ęŊŋ'), + ('ęž', 'ęž'), + ('ęž', 'ꞡ'), + ('ęžš', 'ęŋ'), + ('ęŋ', 'ęŋ¯'), + ('ęŋą', 'ë'), + ('ë', 'ë§'), + ('ëŠ', 'ë'), + ('ë
', 'ë'), + ('ëĄ', 'ëģ'), + ('ëŊ', 'ë'), + ('ë', 'ëŗ'), + ('ëĩ', 'ë'), + ('ë', 'ëĢ'), + ('ë', 'ë'), + ('ë', 'ëŖ'), + ('ëĨ', 'ëŋ'), + ('ë
', 'ë
'), + ('ë
', 'ë
ˇ'), + ('ë
š', 'ë'), + ('ë', 'ë¯'), + ('ëą', 'ë'), + ('ë', 'ë§'), + ('ëŠ', 'ë'), + ('ë
', 'ë'), + ('ëĄ', 'ëģ'), + ('ëŊ', 'ë'), + ('ë', 'ëŗ'), + ('ëĩ', 'ë'), + ('ë', 'ëĢ'), + ('ë', 'ë'), + ('ë', 'ëŖ'), + ('ëĨ', 'ëŋ'), + ('ë', 'ë'), + ('ë', 'ëˇ'), + ('ëš', 'ë'), + ('ë', 'ë¯'), + ('ëą', 'ë'), + ('ë', 'ë§'), + ('ëŠ', 'ë'), + ('ë
', 'ë'), + ('ëĄ', 'ëģ'), + ('ëŊ', 'ë'), + ('ë', 'ëŗ'), + ('ëĩ', 'ë'), + ('ë', 'ëĢ'), + ('ë', 'ë'), + ('ë', 'ëŖ'), + ('ëĨ', 'ëŋ'), + ('ë', 'ë'), + ('ë', 'ëˇ'), + ('ëš', 'ë'), + ('ë', 'ë¯'), + ('ëą', 'ë'), + ('ë', 'ë§'), + ('ëŠ', 'ë'), + ('ë
', 'ë'), + ('ëĄ', 'ëģ'), + ('ëŊ', 'ë'), + ('ë', 'ëŗ'), + ('ëĩ', 'ë'), + ('ë', 'ëĢ'), + ('ë', 'ë'), + ('ë', 'ëŖ'), + ('ëĨ', 'ëŋ'), + ('ë', 'ë'), + ('ë', 'ëˇ'), + ('ëš', 'ë'), + ('ë', 'ë¯'), + ('ëą', 'ë'), + ('ë', 'ë§'), + ('ëŠ', 'ë'), + ('ë
', 'ë'), + ('ëĄ', 'ëģ'), + ('ëŊ', 'ë'), + ('ë', 'ëŗ'), + ('ëĩ', 'ë'), + ('ë', 'ëĢ'), + ('ë', 'ë '), + ('ë ', 'ë Ŗ'), + ('ë Ĩ', 'ë ŋ'), + ('ëĄ', 'ëĄ'), + ('ëĄ', '륡'), + ('륚', 'ëĸ'), + ('ëĸ', 'ëĸ¯'), + ('ëĸą', 'ëŖ'), + ('ëŖ', 'ëŖ§'), + ('ëŖŠ', 'ë¤'), + ('ë¤
', 'ë¤'), + ('뤥', 'ë¤ģ'), + ('ë¤Ŋ', 'ëĨ'), + ('ëĨ', 'ëĨŗ'), + ('ëĨĩ', 'ëĻ'), + ('ëĻ', 'ëĻĢ'), + ('ëĻ', 'ë§'), + ('ë§', 'ë§Ŗ'), + ('ë§Ĩ', 'ë§ŋ'), + ('ë¨', 'ë¨'), + ('ë¨', '먡'), + ('먚', 'ëŠ'), + ('ëŠ', '늯'), + ('늹', 'ëĒ'), + ('ëĒ', 'ëĒ§'), + ('ëĒŠ', 'ëĢ'), + ('ëĢ
', 'ëĢ'), + ('ëĢĄ', 'ëĢģ'), + ('ëĢŊ', 'ëŦ'), + ('ëŦ', 'ëŦŗ'), + ('ëŦĩ', 'ë'), + ('ë', 'ëĢ'), + ('ë', 'ëŽ'), + ('ëŽ', 'ëŽŖ'), + ('ëŽĨ', 'ëŽŋ'), + ('ë¯', 'ë¯'), + ('ë¯', 'ë¯ˇ'), + ('믚', 'ë°'), + ('ë°', 'ë°¯'), + ('ë°ą', 'ëą'), + ('ëą', '빧'), + ('빊', 'ë˛'), + ('ë˛
', 'ë˛'), + ('벥', 'ë˛ģ'), + ('ë˛Ŋ', 'ëŗ'), + ('ëŗ', 'ëŗŗ'), + ('ëŗĩ', 'ë´'), + ('ë´', 'ë´Ģ'), + ('ë´', 'ëĩ'), + ('ëĩ', 'ëĩŖ'), + ('ëĩĨ', 'ëĩŋ'), + ('ëļ', 'ëļ'), + ('ëļ', 'ëļˇ'), + ('ëļš', 'ëˇ'), + ('ëˇ', 'ëˇ¯'), + ('롹', 'ë¸'), + ('ë¸', '븧'), + ('븊', 'ëš'), + ('ëš
', 'ëš'), + ('뚥', 'ëšģ'), + ('ëšŊ', 'ëē'), + ('ëē', 'ëēŗ'), + ('ëēĩ', 'ëģ'), + ('ëģ', 'ëģĢ'), + ('ëģ', 'ëŧ'), + ('ëŧ', 'ëŧŖ'), + ('ëŧĨ', 'ëŧŋ'), + ('ëŊ', 'ëŊ'), + ('ëŊ', 'ëŊˇ'), + ('ëŊš', 'ëž'), + ('ëž', '랯'), + ('ëžą', 'ëŋ'), + ('ëŋ', 'ëŋ§'), + ('ëŋŠ', 'ė'), + ('ė
', 'ė'), + ('ėĄ', 'ėģ'), + ('ėŊ', 'ė'), + ('ė', 'ėŗ'), + ('ėĩ', 'ė'), + ('ė', 'ėĢ'), + ('ė', 'ė'), + ('ė', 'ėŖ'), + ('ėĨ', 'ėŋ'), + ('ė', 'ė'), + ('ė', 'ėˇ'), + ('ėš', 'ė
'), + ('ė
', 'ė
¯'), + ('ė
ą', 'ė'), + ('ė', 'ė§'), + ('ėŠ', 'ė'), + ('ė
', 'ė'), + ('ėĄ', 'ėģ'), + ('ėŊ', 'ė'), + ('ė', 'ėŗ'), + ('ėĩ', 'ė'), + ('ė', 'ėĢ'), + ('ė', 'ė'), + ('ė', 'ėŖ'), + ('ėĨ', 'ėŋ'), + ('ė', 'ė'), + ('ė', 'ėˇ'), + ('ėš', 'ė'), + ('ė', 'ė¯'), + ('ėą', 'ė'), + ('ė', 'ė§'), + ('ėŠ', 'ė'), + ('ė
', 'ė'), + ('ėĄ', 'ėģ'), + ('ėŊ', 'ė'), + ('ė', 'ėŗ'), + ('ėĩ', 'ė'), + ('ė', 'ėĢ'), + ('ė', 'ė'), + ('ė', 'ėŖ'), + ('ėĨ', 'ėŋ'), + ('ė', 'ė'), + ('ė', 'ėˇ'), + ('ėš', 'ė'), + ('ė', 'ė¯'), + ('ėą', 'ė'), + ('ė', 'ė§'), + ('ėŠ', 'ė'), + ('ė
', 'ė'), + ('ėĄ', 'ėģ'), + ('ėŊ', 'ė'), + ('ė', 'ėŗ'), + ('ėĩ', 'ė'), + ('ė', 'ėĢ'), + ('ė', 'ė'), + ('ė', 'ėŖ'), + ('ėĨ', 'ėŋ'), + ('ė', 'ė'), + ('ė', 'ėˇ'), + ('ėš', 'ė'), + ('ė', 'ė¯'), + ('ėą', 'ė'), + ('ė', 'ė§'), + ('ėŠ', 'ė'), + ('ė
', 'ė'), + ('ėĄ', 'ėģ'), + ('ėŊ', 'ė'), + ('ė', 'ėŗ'), + ('ėĩ', 'ė'), + ('ė', 'ėĢ'), + ('ė', 'ė'), + ('ė', 'ėŖ'), + ('ėĨ', 'ėŋ'), + ('ė ', 'ė '), + ('ė ', 'ė ˇ'), + ('ė š', 'ėĄ'), + ('ėĄ', 'ėĄ¯'), + ('ėĄą', 'ėĸ'), + ('ėĸ', 'ėĸ§'), + ('ėĸŠ', 'ėŖ'), + ('ėŖ
', 'ėŖ'), + ('ėŖĄ', 'ėŖģ'), + ('ėŖŊ', 'ė¤'), + ('ė¤', 'ė¤ŗ'), + ('ė¤ĩ', 'ėĨ'), + ('ėĨ', 'ėĨĢ'), + ('ėĨ', 'ėĻ'), + ('ėĻ', 'ėĻŖ'), + ('ėĻĨ', 'ėĻŋ'), + ('ė§', 'ė§'), + ('ė§', 'ė§ˇ'), + ('ė§š', 'ė¨'), + ('ė¨', 'ė¨¯'), + ('ė¨ą', 'ėŠ'), + ('ėŠ', 'ėŠ§'), + ('ėŠŠ', 'ėĒ'), + ('ėĒ
', 'ėĒ'), + ('ėĒĄ', 'ėĒģ'), + ('ėĒŊ', 'ėĢ'), + ('ėĢ', 'ėĢŗ'), + ('ėĢĩ', 'ėŦ'), + ('ėŦ', 'ėŦĢ'), + ('ėŦ', 'ė'), + ('ė', 'ėŖ'), + ('ėĨ', 'ėŋ'), + ('ėŽ', 'ėŽ'), + ('ėŽ', 'ėŽˇ'), + ('ėŽš', 'ė¯'), + ('ė¯', 'ė¯¯'), + ('ė¯ą', 'ė°'), + ('ė°', 'ė°§'), + ('ė°Š', 'ėą'), + ('ėą
', 'ėą'), + ('ėąĄ', 'ėąģ'), + ('ėąŊ', 'ė˛'), + ('ė˛', 'ė˛ŗ'), + ('ė˛ĩ', 'ėŗ'), + ('ėŗ', 'ėŗĢ'), + ('ėŗ', 'ė´'), + ('ė´', 'ė´Ŗ'), + ('ė´Ĩ', 'ė´ŋ'), + ('ėĩ', 'ėĩ'), + ('ėĩ', 'ėĩˇ'), + ('ėĩš', 'ėļ'), + ('ėļ', 'ėļ¯'), + ('ėļą', 'ėˇ'), + ('ėˇ', 'ėˇ§'), + ('ėˇŠ', 'ė¸'), + ('ė¸
', 'ė¸'), + ('ė¸Ą', 'ė¸ģ'), + ('ė¸Ŋ', 'ėš'), + ('ėš', 'ėšŗ'), + ('ėšĩ', 'ėē'), + ('ėē', 'ėēĢ'), + ('ėē', 'ėģ'), + ('ėģ', 'ėģŖ'), + ('ėģĨ', 'ėģŋ'), + ('ėŧ', 'ėŧ'), + ('ėŧ', 'ėŧˇ'), + ('ėŧš', 'ėŊ'), + ('ėŊ', 'ėŊ¯'), + ('ėŊą', 'ėž'), + ('ėž', 'ėž§'), + ('ėžŠ', 'ėŋ'), + ('ėŋ
', 'ėŋ'), + ('ėŋĄ', 'ėŋģ'), + ('ėŋŊ', 'í'), + ('í', 'íŗ'), + ('íĩ', 'í'), + ('í', 'íĢ'), + ('í', 'í'), + ('í', 'íŖ'), + ('íĨ', 'íŋ'), + ('í', 'í'), + ('í', 'íˇ'), + ('íš', 'í'), + ('í', 'í¯'), + ('íą', 'í
'), + ('í
', 'í
§'), + ('í
Š', 'í'), + ('í
', 'í'), + ('íĄ', 'íģ'), + ('íŊ', 'í'), + ('í', 'íŗ'), + ('íĩ', 'í'), + ('í', 'íĢ'), + ('í', 'í'), + ('í', 'íŖ'), + ('íĨ', 'íŋ'), + ('í', 'í'), + ('í', 'íˇ'), + ('íš', 'í'), + ('í', 'í¯'), + ('íą', 'í'), + ('í', 'í§'), + ('íŠ', 'í'), + ('í
', 'í'), + ('íĄ', 'íģ'), + ('íŊ', 'í'), + ('í', 'íŗ'), + ('íĩ', 'í'), + ('í', 'íĢ'), + ('í', 'í'), + ('í', 'íŖ'), + ('íĨ', 'íŋ'), + ('í', 'í'), + ('í', 'íˇ'), + ('íš', 'í'), + ('í', 'í¯'), + ('íą', 'í'), + ('í', 'í§'), + ('íŠ', 'í'), + ('í
', 'í'), + ('íĄ', 'íģ'), + ('íŊ', 'í'), + ('í', 'íŗ'), + ('íĩ', 'í'), + ('í', 'íĢ'), + ('í', 'í'), + ('í', 'íŖ'), + ('íĨ', 'íŋ'), + ('í', 'í'), + ('í', 'íˇ'), + ('íš', 'í'), + ('í', 'í¯'), + ('íą', 'í'), + ('í', 'í§'), + ('íŠ', 'í'), + ('í
', 'í'), + ('íĄ', 'íģ'), + ('íŊ', 'í'), + ('í', 'íŗ'), + ('íĩ', 'í'), + ('í', 'íĢ'), + ('í', 'í'), + ('í', 'íŖ'), +]; + +pub const PREPEND: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), + ('āĩ', 'āĩ'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('đ', 'đ'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đ¨ē', 'đ¨ē'), + ('đĒ', 'đĒ'), + ('đĩ', 'đĩ'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('đĻ', 'đŋ')]; + +pub const SPACINGMARK: &'static [(char, char)] = &[ + ('ā¤', 'ā¤'), + ('ā¤ģ', 'ā¤ģ'), + ('ā¤ž', 'āĨ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨ'), + ('āĻ', 'āĻ'), + ('āĻŋ', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā¨', 'ā¨'), + ('ā¨ž', 'āŠ'), + ('āĒ', 'āĒ'), + ('āĒž', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ', 'āĢ'), + ('āŦ', 'āŦ'), + ('ā', 'ā'), + ('ā', 'ā'), + ('ā', 'ā'), + ('āŽŋ', 'āŽŋ'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā°', 'ā°'), + ('āą', 'āą'), + ('ā˛', 'ā˛'), + ('ā˛ž', 'ā˛ž'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('ā´', 'ā´'), + ('ā´ŋ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āļ', 'āļ'), + ('āˇ', 'āˇ'), + ('āˇ', 'āˇ'), + ('āˇ˛', 'āˇŗ'), + ('ā¸ŗ', 'ā¸ŗ'), + ('āēŗ', 'āēŗ'), + ('āŧž', 'āŧŋ'), + ('āŊŋ', 'āŊŋ'), + ('áą', 'áą'), + ('áģ', 'áŧ'), + ('á', 'á'), + ('á', 'á'), + ('áļ', 'áļ'), + ('áž', 'á
'), + ('á', 'á'), + ('á¤Ŗ', 'á¤Ļ'), + ('ᤊ', 'á¤Ģ'), + ('ᤰ', '᤹'), + ('á¤ŗ', 'ᤸ'), + ('á¨', 'á¨'), + ('áŠ', 'áŠ'), + ('áŠ', 'áŠ'), + ('áŠ', 'ኲ'), + ('áŦ', 'áŦ'), + ('áŦģ', 'áŦģ'), + ('áŦŊ', 'á'), + ('á', 'á'), + ('áŽ', 'áŽ'), + ('Ꭵ', 'Ꭵ'), + ('áŽĻ', 'Ꭷ'), + ('áŽĒ', 'áŽĒ'), + ('ᯧ', 'ᯧ'), + ('á¯Ē', 'á¯Ŧ'), + ('ᯎ', 'ᯎ'), + ('á¯˛', 'á¯ŗ'), + ('á°¤', 'á°Ģ'), + ('á°´', 'á°ĩ'), + ('áŗĄ', 'áŗĄ'), + ('áŗˇ', 'áŗˇ'), + ('ę Ŗ', 'ę ¤'), + ('ę §', 'ę §'), + ('ęĸ', 'ęĸ'), + ('ęĸ´', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ęĻ', 'ęĻ'), + ('ęĻ´', 'ęĻĩ'), + ('ęĻē', 'ęĻģ'), + ('ęĻž', 'ę§'), + ('ę¨¯', 'ꨰ'), + ('ę¨ŗ', 'ꨴ'), + ('ęŠ', 'ęŠ'), + ('ęĢĢ', 'ęĢĢ'), + ('ęĢŽ', 'ęĢ¯'), + ('ęĢĩ', 'ęĢĩ'), + ('ę¯Ŗ', 'ę¯¤'), + ('ę¯Ļ', 'ę¯§'), + ('ę¯Š', 'ę¯Ē'), + ('ę¯Ŧ', 'ę¯Ŧ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ˛'), + ('đˇ', 'đ¸'), + ('đŦ', 'đŦ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đŗ', 'đĩ'), + ('đŋ', 'đ'), + ('\u{111ce}', '\u{111ce}'), + ('đŦ', 'đŽ'), + ('đ˛', 'đŗ'), + ('đĩ', 'đĩ'), + ('đ ', 'đĸ'), + ('đ', 'đ'), + ('đŋ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đŖ'), + ('đĩ', 'đˇ'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đą', 'đ˛'), + ('đš', 'đš'), + ('đģ', 'đŧ'), + ('đž', 'đž'), + ('đ', 'đ'), + ('đ°', 'đą'), + ('đ¸', 'đģ'), + ('đž', 'đž'), + ('đ°', 'đ˛'), + ('đģ', 'đŧ'), + ('đž', 'đž'), + ('đŦ', 'đŦ'), + ('đŽ', 'đ¯'), + ('đļ', 'đļ'), + ('đ ', 'đĄ'), + ('đĻ', 'đĻ'), + ('đ Ŧ', 'đ Ž'), + ('đ ¸', 'đ ¸'), + ('\u{11931}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193d}', '\u{1193d}'), + ('\u{11940}', '\u{11940}'), + ('\u{11942}', '\u{11942}'), + ('đ§', 'đ§'), + ('đ§', 'đ§'), + ('đ§¤', 'đ§¤'), + ('đ¨š', 'đ¨š'), + ('đŠ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đ°¯', 'đ°¯'), + ('đ°ž', 'đ°ž'), + ('đ˛Š', 'đ˛Š'), + ('đ˛ą', 'đ˛ą'), + ('đ˛´', 'đ˛´'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đģĩ', 'đģļ'), + ('đŊ', 'đž'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ
Ļ', 'đ
Ļ'), + ('đ
', 'đ
'), +]; + +pub const T: &'static [(char, char)] = &[('á¨', 'áŋ'), ('í', 'íģ')]; + +pub const V: &'static [(char, char)] = &[('á
', 'á§'), ('í°', 'í')]; + +pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/mod.rs b/vendor/regex-syntax/src/unicode_tables/mod.rs new file mode 100644 index 000000000..20736c7ac --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/mod.rs @@ -0,0 +1,57 @@ +#[cfg(feature = "unicode-age")] +pub mod age; + +#[cfg(feature = "unicode-case")] +pub mod case_folding_simple; + +#[cfg(feature = "unicode-gencat")] +pub mod general_category; + +#[cfg(feature = "unicode-segment")] +pub mod grapheme_cluster_break; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] +#[allow(dead_code)] +pub mod perl_decimal; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] +#[allow(dead_code)] +pub mod perl_space; + +#[cfg(feature = "unicode-perl")] +pub mod perl_word; + +#[cfg(feature = "unicode-bool")] +pub mod property_bool; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] +pub mod property_names; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] +pub mod property_values; + +#[cfg(feature = "unicode-script")] +pub mod script; + +#[cfg(feature = "unicode-script")] +pub mod script_extension; + +#[cfg(feature = "unicode-segment")] +pub mod sentence_break; + +#[cfg(feature = "unicode-segment")] +pub mod word_break; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs b/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs new file mode 100644 index 000000000..2a09259fc --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs @@ -0,0 +1,74 @@ +// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category ucd-13.0.0 --chars --include decimalnumber +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("Decimal_Number", DECIMAL_NUMBER)]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('Ų ', 'ŲŠ'), + ('Û°', 'Ûš'), + ('ß', 'ß'), + ('āĨĻ', 'āĨ¯'), + ('ā§Ļ', 'ā§¯'), + ('āŠĻ', 'āŠ¯'), + ('āĢĻ', 'āĢ¯'), + ('āĻ', 'ā¯'), + ('ā¯Ļ', 'ā¯¯'), + ('āąĻ', 'āą¯'), + ('āŗĻ', 'āŗ¯'), + ('āĩĻ', 'āĩ¯'), + ('āˇĻ', 'āˇ¯'), + ('āš', 'āš'), + ('āģ', 'āģ'), + ('āŧ ', 'āŧŠ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áŠ'), + ('á ', 'á '), + ('áĨ', 'áĨ'), + ('á§', 'á§'), + ('áĒ', 'áĒ'), + ('áĒ', 'áĒ'), + ('á', 'á'), + ('Ꮀ', '᎚'), + ('áą', 'áą'), + ('áą', 'áą'), + ('ę ', 'ęŠ'), + ('ęŖ', 'ęŖ'), + ('ę¤', 'ę¤'), + ('ę§', 'ę§'), + ('꧰', '꧚'), + ('ęŠ', 'ęŠ'), + ('ę¯°', 'ę¯š'), + ('īŧ', 'īŧ'), + ('đ ', 'đŠ'), + ('đ´°', 'đ´š'), + ('đĻ', 'đ¯'), + ('đ°', 'đš'), + ('đļ', 'đŋ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đŖ ', 'đŖŠ'), + ('\u{11950}', '\u{11959}'), + ('đą', 'đą'), + ('đĩ', 'đĩ'), + ('đļ ', 'đļŠ'), + ('đŠ ', 'đŠŠ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('đ
', 'đ
'), + ('đ°', 'đš'), + ('đĨ', 'đĨ'), + ('\u{1fbf0}', '\u{1fbf9}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_space.rs b/vendor/regex-syntax/src/unicode_tables/perl_space.rs new file mode 100644 index 000000000..c112dd126 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_space.rs @@ -0,0 +1,23 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool ucd-13.0.0 --chars --include whitespace +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("White_Space", WHITE_SPACE)]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_word.rs b/vendor/regex-syntax/src/unicode_tables/perl_word.rs new file mode 100644 index 000000000..df9eac7d7 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_word.rs @@ -0,0 +1,743 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. 
+ +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('\u{300}', 'Í´'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('\u{483}', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('Ø ', 'ŲŠ'), + ('ŲŽ', 'Û'), + ('Û', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', '\u{74a}'), + ('Ũ', 'Ūą'), + ('ß', 'ßĩ'), + ('ßē', 'ßē'), + ('\u{7fd}', '\u{7fd}'), + ('ā ', '\u{82d}'), + ('āĄ', '\u{85b}'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('āĨĻ', 'āĨ¯'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('\u{9bc}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§Ļ', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('āĢ', 'āĢ'), + ('āĢ ', '\u{ae3}'), + ('āĢĻ', 'āĢ¯'), + ('āĢš', '\u{aff}'), + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('\u{b3c}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ā', 'ā'), + ('ā', '\u{b63}'), + ('āĻ', 'ā¯'), + ('āą', 'āą'), + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('ā¯Ļ', 'ā¯¯'), + ('\u{c00}', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('āą', 'āą'), + ('āą ', '\u{c63}'), + ('āąĻ', 'āą¯'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('\u{cbc}', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', '\u{ce3}'), + ('āŗĻ', 'āŗ¯'), + ('āŗą', 'āŗ˛'), + ('\u{d00}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d57}'), + ('āĩ', '\u{d63}'), + ('āĩĻ', 'āĩ¯'), + ('āĩē', 'āĩŋ'), + ('\u{d81}', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇĻ', 'āˇ¯'), + ('āˇ˛', 'āˇŗ'), + ('ā¸', '\u{e3a}'), + ('āš', '\u{e4e}'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('\u{f18}', '\u{f19}'), + ('āŧ ', 'āŧŠ'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('\u{135d}', '\u{135f}'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', '\u{1714}'), + ('á ', '\u{1734}'), + ('á', '\u{1753}'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('\u{1772}', '\u{1773}'), + ('á', '\u{17d3}'), + ('á', 'á'), + ('á', '\u{17dd}'), + ('á ', 'áŠ'), + ('\u{180b}', '\u{180d}'), + ('á ', 'á '), + ('á ', '᥸'), + ('áĸ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á§', 'á§'), + ('á¨', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ§', 'áĒ§'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', 'á'), + ('á', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'á¯ŗ'), + ('á°', '\u{1c37}'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'áŗē'), + ('á´', '\u{1df9}'), + ('\u{1dfb}', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('\u{200c}', '\u{200d}'), + ('âŋ', 'â'), + ('â', 'â'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('\u{20d0}', '\u{20f0}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â¯', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('âļ', 'âŠ'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('\u{2d7f}', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã'), + ('ãĄ', '\u{302f}'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('\u{3099}', '\u{309a}'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ęĢ'), + ('ę', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ęŋ', '\u{a6f1}'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę §'), + ('\u{a82c}', '\u{a82c}'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', '\u{a8c5}'), + ('ęŖ', 'ęŖ'), + ('\u{a8e0}', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', '\u{a92d}'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('\u{a980}', 'ę§'), + ('ę§', 'ę§'), + ('ę§ ', '꧞'), + ('ę¨', '\u{aa36}'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢ¯'), + ('ęĢ˛', '\u{aaf6}'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯Ē'), + ('ę¯Ŧ', '\u{abed}'), + ('ę¯°', 'ę¯š'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('ī¸ŗ', 'ī¸´'), + ('īš', 'īš'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧ', 'īŧ'), + ('īŧĄ', 'īŧē'), + ('īŧŋ', 'īŧŋ'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('\u{101fd}', '\u{101fd}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('\u{102e0}', '\u{102e0}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1037a}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŠ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', '\u{10ae6}'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', '\u{10d27}'), + ('đ´°', 'đ´š'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', '\u{10f50}'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', '\u{11046}'), + ('đĻ', 'đ¯'), + ('\u{1107f}', '\u{110ba}'), + ('đ', 'đ¨'), + ('đ°', 'đš'), + ('\u{11100}', '\u{11134}'), + ('đļ', 'đŋ'), + ('đ
', '\u{11147}'), + ('đ
', '\u{11173}'), + ('đ
ļ', 'đ
ļ'), + ('\u{11180}', 'đ'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111ce}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', '\u{112ea}'), + ('đ°', 'đš'), + ('\u{11300}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('\u{1133b}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1145e}', '\u{11461}'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{115b5}'), + ('đ¸', '\u{115c0}'), + ('đ', '\u{115dd}'), + ('đ', '\u{11640}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1171d}', '\u{1172b}'), + ('đ°', 'đš'), + ('đ ', '\u{1183a}'), + ('đĸ ', 'đŖŠ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{11943}'), + ('\u{11950}', '\u{11959}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', '\u{119d7}'), + ('\u{119da}', 'đ§Ą'), + ('đ§Ŗ', 'đ§¤'), + ('đ¨', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('đŠ', '\u{11a99}'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', '\u{11c36}'), + ('\u{11c38}', 'đą'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), + ('đģ ', 'đģļ'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đŠ ', 'đŠŠ'), + ('đĢ', 'đĢ'), + ('\u{16af0}', '\u{16af4}'), + ('đŦ', '\u{16b36}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('\u{16f4f}', 'đž'), + ('\u{16f8f}', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('đ', 'đŦ'), + ('\u{1e130}', 'đŊ'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ', 'đš'), + ('đ ', 'đŖ'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), + ('\u{e0100}', '\u{e01ef}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_bool.rs b/vendor/regex-syntax/src/unicode_tables/property_bool.rs new file mode 100644 index 000000000..21cbaf9ae --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_bool.rs @@ -0,0 +1,10953 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ASCII_Hex_Digit", ASCII_HEX_DIGIT), + ("Alphabetic", ALPHABETIC), + ("Bidi_Control", BIDI_CONTROL), + ("Bidi_Mirrored", BIDI_MIRRORED), + ("Case_Ignorable", CASE_IGNORABLE), + ("Cased", CASED), + ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED), + ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED), + ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED), + ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED), + ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED), + ("Dash", DASH), + ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT), + ("Deprecated", DEPRECATED), + ("Diacritic", DIACRITIC), + ("Emoji", EMOJI), + ("Emoji_Component", EMOJI_COMPONENT), + ("Emoji_Modifier", EMOJI_MODIFIER), + ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE), + ("Emoji_Presentation", EMOJI_PRESENTATION), + ("Extended_Pictographic", EXTENDED_PICTOGRAPHIC), + ("Extender", EXTENDER), + ("Grapheme_Base", GRAPHEME_BASE), + ("Grapheme_Extend", GRAPHEME_EXTEND), + ("Grapheme_Link", GRAPHEME_LINK), + ("Hex_Digit", HEX_DIGIT), + ("Hyphen", HYPHEN), + ("IDS_Binary_Operator", IDS_BINARY_OPERATOR), + ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR), + ("ID_Continue", ID_CONTINUE), + ("ID_Start", ID_START), + ("Ideographic", 
IDEOGRAPHIC), + ("Join_Control", JOIN_CONTROL), + ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION), + ("Lowercase", LOWERCASE), + ("Math", MATH), + ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT), + ("Other_Alphabetic", OTHER_ALPHABETIC), + ("Other_Default_Ignorable_Code_Point", OTHER_DEFAULT_IGNORABLE_CODE_POINT), + ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND), + ("Other_ID_Continue", OTHER_ID_CONTINUE), + ("Other_ID_Start", OTHER_ID_START), + ("Other_Lowercase", OTHER_LOWERCASE), + ("Other_Math", OTHER_MATH), + ("Other_Uppercase", OTHER_UPPERCASE), + ("Pattern_Syntax", PATTERN_SYNTAX), + ("Pattern_White_Space", PATTERN_WHITE_SPACE), + ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK), + ("Quotation_Mark", QUOTATION_MARK), + ("Radical", RADICAL), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Sentence_Terminal", SENTENCE_TERMINAL), + ("Soft_Dotted", SOFT_DOTTED), + ("Terminal_Punctuation", TERMINAL_PUNCTUATION), + ("Unified_Ideograph", UNIFIED_IDEOGRAPH), + ("Uppercase", UPPERCASE), + ("Variation_Selector", VARIATION_SELECTOR), + ("White_Space", WHITE_SPACE), + ("XID_Continue", XID_CONTINUE), + ("XID_Start", XID_START), +]; + +pub const ASCII_HEX_DIGIT: &'static [(char, char)] = + &[('0', '9'), ('A', 'F'), ('a', 'f')]; + +pub const ALPHABETIC: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('\u{345}', '\u{345}'), + ('Í°', 'Í´'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('\u{5b0}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('Ø ', '\u{657}'), + ('\u{659}', '\u{65f}'), + ('ŲŽ', 'Û'), + ('Û', 
'\u{6dc}'), + ('\u{6e1}', '\u{6e8}'), + ('\u{6ed}', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', '\u{73f}'), + ('Ũ', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', '\u{817}'), + ('ā ', '\u{82c}'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('\u{8d4}', '\u{8df}'), + ('\u{8e3}', '\u{8e9}'), + ('\u{8f0}', 'ā¤ģ'), + ('ā¤Ŋ', 'āĨ'), + ('āĨ', 'āĨ'), + ('\u{955}', '\u{963}'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('\u{a01}', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4c}'), + ('\u{a51}', '\u{a51}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('\u{a70}', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ ', '\u{ae3}'), + ('āĢš', '\u{afc}'), + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', '\u{b44}'), + ('ā', 'ā'), + ('ā', 'ā'), + ('\u{b56}', '\u{b57}'), + ('ā', 'ā'), + ('ā', '\u{b63}'), + ('āą', 'āą'), + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', 'ā°'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4c}'), + ('\u{c55}', '\u{c56}'), + ('āą', 'āą'), + ('āą ', '\u{c63}'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccc}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', '\u{ce3}'), + ('āŗą', 'āŗ˛'), + ('\u{d00}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d57}'), + ('āĩ', '\u{d63}'), + ('āĩē', 'āĩŋ'), + ('\u{d81}', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇŗ'), + ('ā¸', '\u{e3a}'), + ('āš', 'āš'), + ('\u{e4d}', '\u{e4d}'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', '\u{eb9}'), + ('\u{ebb}', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ecd}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('\u{f71}', '\u{f81}'), + ('āž', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('á', '\u{1036}'), + ('á¸', 'á¸'), + ('áģ', 'áŋ'), + ('á', 'á'), + ('á', '\u{109d}'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', '\u{1713}'), + ('á ', '\u{1733}'), + ('á', '\u{1753}'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('\u{1772}', '\u{1773}'), + ('á', 'áŗ'), + ('áļ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', '᥸'), + ('áĸ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', 'ᤸ'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á¨', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('እ', '\u{1a74}'), + ('áĒ§', 'áĒ§'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1b00}', 'áŦŗ'), + ('\u{1b35}', 'á'), + ('á
', 'á'), + ('\u{1b80}', '\u{1ba9}'), + ('\u{1bac}', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('ᯧ', '\u{1bf1}'), + ('á°', '\u{1c36}'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('á´', 'áļŋ'), + ('\u{1de7}', '\u{1df4}'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â¯', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('âļ', 'âŠ'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã'), + ('ãĄ', 'ãŠ'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ę', 'ęŽ'), + ('\u{a674}', '\u{a67b}'), + ('ęŋ', 'ę¯'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę
'), + ('ę ', 'ę §'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęŖ'), + ('\u{a8c5}', '\u{a8c5}'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', '\u{a8ff}'), + ('ę¤', '\u{a92a}'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('\u{a980}', 'ęĻ˛'), + ('ęĻ´', 'ęĻŋ'), + ('ę§', 'ę§'), + ('ę§ ', 'ę§¯'), + ('ę§ē', '꧞'), + ('ę¨', '\u{aa36}'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', '\u{aabe}'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢ¯'), + ('ęĢ˛', 'ęĢĩ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯Ē'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1037a}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', '\u{10d27}'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', '\u{11045}'), + ('đ', 'đ¸'), + ('đ', 'đ¨'), + ('\u{11100}', '\u{11132}'), + ('đ
', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('\u{11180}', 'đŋ'), + ('đ', 'đ'), + ('\u{111ce}', '\u{111cf}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{11234}'), + ('\u{11237}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', '\u{112e8}'), + ('\u{11300}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('đ', 'đ'), + ('\u{11443}', 'đ
'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', '\u{115b5}'), + ('đ¸', 'đž'), + ('đ', '\u{115dd}'), + ('đ', 'đž'), + ('\u{11640}', '\u{11640}'), + ('đ', 'đ'), + ('đ', '\u{116b5}'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('\u{1171d}', '\u{1172a}'), + ('đ ', 'đ ¸'), + ('đĸ ', 'đŖ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193f}', '\u{11942}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', '\u{119d7}'), + ('\u{119da}', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§¤'), + ('đ¨', 'đ¨˛'), + ('\u{11a35}', '\u{11a3e}'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', '\u{11c36}'), + ('\u{11c38}', 'đ°ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d41}'), + ('\u{11d43}', '\u{11d43}'), + ('đĩ', '\u{11d47}'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģļ'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('\u{16f4f}', 'đž'), + ('\u{16f8f}', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('\u{1bc9e}', '\u{1bc9e}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đ¤', 'đĨ'), + ('\u{1e947}', '\u{1e947}'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const BIDI_CONTROL: &'static [(char, char)] = &[ + ('\u{61c}', '\u{61c}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2066}', '\u{2069}'), +]; + +pub const BIDI_MIRRORED: &'static [(char, char)] = &[ + ('(', ')'), + ('<', '<'), + ('>', '>'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('ÂĢ', 'ÂĢ'), + ('Âģ', 'Âģ'), + ('āŧē', 'āŧŊ'), + ('á', 'á'), + ('âš', 'âē'), + ('â
', 'â'), + ('âŊ', 'âž'), + ('â', 'â'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'âĸ'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('âĢ', 'âŗ'), + ('âš', 'âš'), + ('âģ', 'â'), + ('â', 'â'), + ('â', 'â '), + ('âĸ', 'âĸ'), + ('â¤', 'âĢ'), + ('âŽ', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĸ', 'âŖ'), + ('âĻ', 'â¸'), + ('âž', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â°', 'âŋ'), + ('â', 'â'), + ('â ', 'âĄ'), + ('âŠ', 'âĒ'), + ('â¨', 'âĩ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĸ', 'â¯'), + ('âĻ', 'âĻ'), + ('âĻ', 'âĻ '), + ('âĻĸ', 'âĻ¯'), + ('âĻ¸', 'âĻ¸'), + ('â§', 'â§
'), + ('â§', 'â§'), + ('â§', 'â§'), + ('â§', 'â§'), + ('â§', 'â§'), + ('⧥', '⧥'), + ('â§Ŗ', 'â§Ĩ'), + ('⧨', '⧊'), + ('⧴', '⧚'), + ('â§ŧ', 'â§Ŋ'), + ('â¨', 'â¨'), + ('â¨', '⨥'), + ('⨤', '⨤'), + ('â¨Ļ', 'â¨Ļ'), + ('⨊', '⨊'), + ('â¨Ģ', '⨎'), + ('⨴', 'â¨ĩ'), + ('â¨ŧ', '⨞'), + ('âŠ', 'âŠ'), + ('⊤', 'âŠĨ'), + ('âŠĒ', 'âŠ'), + ('⊯', '⊰'), + ('âŠŗ', '⊴'), + ('⊚', 'âĒŖ'), + ('âĒĻ', 'âĒ'), + ('âĒ¯', 'âĢ'), + ('âĢ', 'âĢ'), + ('âĢ', 'âĢ'), + ('âĢĸ', 'âĢĻ'), + ('âĢŦ', 'âĢŽ'), + ('âĢŗ', 'âĢŗ'), + ('âĢˇ', 'âĢģ'), + ('âĢŊ', 'âĢŊ'), + ('⯞', '⯞'), + ('â¸', 'â¸
'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('⸠', '⸊'), + ('ã', 'ã'), + ('ã', 'ã'), + ('īš', 'īš'), + ('īš¤', 'īšĨ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧģ', 'īŧģ'), + ('īŧŊ', 'īŧŊ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ '), + ('īŊĸ', 'īŊŖ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), +]; + +pub const CASE_IGNORABLE: &'static [(char, char)] = &[ + ('\'', '\''), + ('.', '.'), + (':', ':'), + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('\u{ad}', '\u{ad}'), + ('¯', '¯'), + ('´', '´'), + ('¡', '¸'), + ('Ę°', '\u{36f}'), + ('Í´', 'Íĩ'), + ('Íē', 'Íē'), + ('Î', 'Î
'), + ('Î', 'Î'), + ('\u{483}', '\u{489}'), + ('Õ', 'Õ'), + ('Õ', 'Õ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('×´', '×´'), + ('\u{600}', '\u{605}'), + ('\u{610}', '\u{61a}'), + ('\u{61c}', '\u{61c}'), + ('Ų', 'Ų'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dd}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{70f}', '\u{70f}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', 'ßĩ'), + ('ßē', 'ßē'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('āĨą', 'āĨą'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3f}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b56}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{ce2}', 
'\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('āš', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('áŧ', 'áŧ'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('á', 'á'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180e}'), + ('áĄ', 'áĄ'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('áĒ§', 'áĒ§'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b34}'), + ('\u{1b36}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + 
('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('Ṹ', 'áąŊ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('á´Ŧ', 'áĩĒ'), + ('áĩ¸', 'áĩ¸'), + ('áļ', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('ážŊ', 'ážŊ'), + ('ážŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋŊ', 'áŋž'), + ('\u{200b}', '\u{200f}'), + ('â', 'â'), + ('â¤', 'â¤'), + ('â§', 'â§'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('\u{20d0}', '\u{20f0}'), + ('âąŧ', 'âąŊ'), + ('\u{2cef}', '\u{2cf1}'), + ('âĩ¯', 'âĩ¯'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã
'), + ('\u{302a}', '\u{302d}'), + ('ãą', 'ãĩ'), + ('ãģ', 'ãģ'), + ('\u{3099}', 'ã'), + ('ãŧ', 'ãž'), + ('ę', 'ę'), + ('ę¸', 'ęŊ'), + ('ę', 'ę'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ęŋ', 'ęŋ'), + ('ę', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('ę', 'ęĄ'), + ('ę°', 'ę°'), + ('ę', 'ę'), + ('ę¸', 'ęš'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('ę§', 'ę§'), + ('\u{a9e5}', 'ę§Ļ'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('ꊰ', 'ꊰ'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ęĢ', 'ęĢ'), + ('\u{aaec}', '\u{aaed}'), + ('ęĢŗ', 'ęĢ´'), + ('\u{aaf6}', '\u{aaf6}'), + ('ę', 'ę'), + ('\u{ab69}', '\u{ab6b}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('īŽ˛', 'ī¯'), + ('\u{fe00}', '\u{fe0f}'), + ('ī¸', 'ī¸'), + ('\u{fe20}', '\u{fe2f}'), + ('īš', 'īš'), + ('īš', 'īš'), + ('\u{feff}', '\u{feff}'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧž', 'īŧž'), + ('īŊ', 'īŊ'), + ('īŊ°', 'īŊ°'), + ('\u{ff9e}', '\u{ff9f}'), + ('īŋŖ', 'īŋŖ'), + ('\u{fff9}', '\u{fffb}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', 
'\u{10f50}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{11340}', '\u{11340}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + 
('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{13430}', '\u{13438}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('đ', 'đ'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d173}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', 'đŊ'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', 'đĨ'), + ('đģ', 'đŋ'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const CASED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Æē'), + ('Æŧ', 'Æŋ'), + ('Į', 'Ę'), + ('Ę', 'ʸ'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('Í°', 'Íŗ'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ ', 'Ö'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŊ', 'áŋ'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('á´', 'áļŋ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â¯', 'â´'), + ('âš', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â
ŋ'), + ('â', 'â'), + ('âļ', 'âŠ'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', '\u{a7f6}'), + ('ę¸', 'ęē'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab68}'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đĸ ', 'đŖ'), + ('đš', 'đšŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ¤', 'đĨ'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), +]; + +pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('Âĩ', 'Âĩ'), + ('Ã', 'Ã'), + ('Ã', 'Ã'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä ', 'Ä '), + ('Äĸ', 'Äĸ'), + ('Ĥ', 'Ĥ'), + ('ÄĻ', 'ÄĻ'), + ('Ĩ', 'Ĩ'), + ('ÄĒ', 'ÄĒ'), + ('ÄŦ', 'ÄŦ'), + ('ÄŽ', 'ÄŽ'), + ('Ä°', 'Ä°'), + ('IJ', 'IJ'), + ('Ä´', 'Ä´'), + ('Äļ', 'Äļ'), + ('Äš', 'Äš'), + ('Äģ', 'Äģ'), + ('ÄŊ', 'ÄŊ'), + ('Äŋ', 'Äŋ'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å
', 'Å
'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å ', 'Å '), + ('Åĸ', 'Åĸ'), + ('Ť', 'Ť'), + ('ÅĻ', 'ÅĻ'), + ('Ũ', 'Ũ'), + ('ÅĒ', 'ÅĒ'), + ('ÅŦ', 'ÅŦ'), + ('ÅŽ', 'ÅŽ'), + ('Å°', 'Å°'), + ('Å˛', 'Å˛'), + ('Å´', 'Å´'), + ('Åļ', 'Åļ'), + ('Ÿ', 'Åš'), + ('Åģ', 'Åģ'), + ('ÅŊ', 'ÅŊ'), + ('Åŋ', 'Åŋ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ '), + ('Æĸ', 'Æĸ'), + ('Ƥ', 'Ƥ'), + ('ÆĻ', 'Ƨ'), + ('ÆŠ', 'ÆŠ'), + ('ÆŦ', 'ÆŦ'), + ('ÆŽ', 'Ư'), + ('Æą', 'Æŗ'), + ('Æĩ', 'Æĩ'), + ('Æˇ', 'Ƹ'), + ('Æŧ', 'Æŧ'), + ('Į', 'Į
'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į ', 'Į '), + ('Įĸ', 'Įĸ'), + ('Į¤', 'Į¤'), + ('ĮĻ', 'ĮĻ'), + ('Į¨', 'Į¨'), + ('ĮĒ', 'ĮĒ'), + ('ĮŦ', 'ĮŦ'), + ('ĮŽ', 'ĮŽ'), + ('Įą', 'Į˛'), + ('Į´', 'Į´'), + ('Įļ', 'Į¸'), + ('Įē', 'Įē'), + ('Įŧ', 'Įŧ'), + ('Įž', 'Įž'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č ', 'Č '), + ('Čĸ', 'Čĸ'), + ('Ȥ', 'Ȥ'), + ('ČĻ', 'ČĻ'), + ('Ȩ', 'Ȩ'), + ('ČĒ', 'ČĒ'), + ('ČŦ', 'ČŦ'), + ('ČŽ', 'ČŽ'), + ('Č°', 'Č°'), + ('Ȳ', 'Ȳ'), + ('Čē', 'Čģ'), + ('ČŊ', 'Čž'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('\u{345}', '\u{345}'), + ('Í°', 'Í°'), + ('Ͳ', 'Ͳ'), + ('Íļ', 'Íļ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ÎĢ'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī ', 'Ī '), + ('Īĸ', 'Īĸ'), + ('Ī¤', 'Ī¤'), + ('ĪĻ', 'ĪĻ'), + ('Ī¨', 'Ī¨'), + ('ĪĒ', 'ĪĒ'), + ('ĪŦ', 'ĪŦ'), + ('ĪŽ', 'ĪŽ'), + ('Ī°', 'Īą'), + ('Ī´', 'Īĩ'), + ('Īˇ', 'Īˇ'), + ('Īš', 'Īē'), + ('ĪŊ', 'Đ¯'), + ('Ņ ', 'Ņ '), + ('Ņĸ', 'Ņĸ'), + ('Ņ¤', 'Ņ¤'), + ('ŅĻ', 'ŅĻ'), + ('Ņ¨', 'Ņ¨'), + ('ŅĒ', 'ŅĒ'), + ('ŅŦ', 'ŅŦ'), + ('ŅŽ', 'ŅŽ'), + ('Ņ°', 'Ņ°'), + ('Ņ˛', 'Ņ˛'), + ('Ņ´', 'Ņ´'), + ('Ņļ', 'Ņļ'), + ('Ņ¸', 'Ņ¸'), + ('Ņē', 'Ņē'), + ('Ņŧ', 'Ņŧ'), + ('Ņž', 'Ņž'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō ', 'Ō '), + ('Ōĸ', 'Ōĸ'), + ('Ō¤', 'Ō¤'), + ('ŌĻ', 'ŌĻ'), + ('Ō¨', 'Ō¨'), + ('ŌĒ', 'ŌĒ'), + ('ŌŦ', 'ŌŦ'), + ('ŌŽ', 'ŌŽ'), + ('Ō°', 'Ō°'), + ('Ō˛', 'Ō˛'), + ('Ō´', 'Ō´'), + ('Ōļ', 'Ōļ'), + ('Ō¸', 'Ō¸'), + ('Ōē', 
'Ōē'), + ('Ōŧ', 'Ōŧ'), + ('Ōž', 'Ōž'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ
', 'Ķ
'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ ', 'Ķ '), + ('Ķĸ', 'Ķĸ'), + ('Ķ¤', 'Ķ¤'), + ('ĶĻ', 'ĶĻ'), + ('Ķ¨', 'Ķ¨'), + ('ĶĒ', 'ĶĒ'), + ('ĶŦ', 'ĶŦ'), + ('ĶŽ', 'ĶŽ'), + ('Ķ°', 'Ķ°'), + ('Ķ˛', 'Ķ˛'), + ('Ķ´', 'Ķ´'), + ('Ķļ', 'Ķļ'), + ('Ķ¸', 'Ķ¸'), + ('Ķē', 'Ķē'), + ('Ķŧ', 'Ķŧ'), + ('Ķž', 'Ķž'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô ', 'Ô '), + ('Ôĸ', 'Ôĸ'), + ('Ô¤', 'Ô¤'), + ('ÔĻ', 'ÔĻ'), + ('Ô¨', 'Ô¨'), + ('ÔĒ', 'ÔĒ'), + ('ÔŦ', 'ÔŦ'), + ('ÔŽ', 'ÔŽ'), + ('Ôą', 'Õ'), + ('Ö', 'Ö'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('Ḡ', 'Ḡ'), + ('á¸ĸ', 'á¸ĸ'), + ('Ḥ', 'Ḥ'), + ('á¸Ļ', 'á¸Ļ'), + ('Ḩ', 'Ḩ'), + ('á¸Ē', 'á¸Ē'), + ('á¸Ŧ', 'á¸Ŧ'), + ('Ḏ', 'Ḏ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('á¸ļ', 'á¸ļ'), + ('Ḹ', 'Ḹ'), + ('á¸ē', 'á¸ē'), + ('á¸ŧ', 'á¸ŧ'), + ('Ḟ', 'Ḟ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš ', 'áš '), + ('ášĸ', 'ášĸ'), + ('ᚤ', 'ᚤ'), + ('ášĻ', 'ášĻ'), + ('ᚨ', 'ᚨ'), + ('ášĒ', 'ášĒ'), + ('ášŦ', 'ášŦ'), + ('ᚎ', 'ᚎ'), + ('áš°', 'áš°'), + ('ᚲ', 'ᚲ'), + ('áš´', 'áš´'), + ('ášļ', 'ášļ'), + ('ᚸ', 'ᚸ'), + ('ášē', 'ášē'), + ('ášŧ', 'ášŧ'), + ('ášž', 'ášž'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē ', 'áē '), + ('áēĸ', 'áēĸ'), + ('áē¤', 'áē¤'), + ('áēĻ', 'áēĻ'), + ('áē¨', 'áē¨'), + ('áēĒ', 'áēĒ'), + ('áēŦ', 'áēŦ'), + ('áēŽ', 'áēŽ'), + ('áē°', 'áē°'), + ('áē˛', 'áē˛'), + ('áē´', 'áē´'), + ('áēļ', 'áēļ'), + ('áē¸', 'áē¸'), + ('áēē', 'áēē'), + ('áēŧ', 'áēŧ'), + ('áēž', 'áēž'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ ', 'áģ '), + ('áģĸ', 'áģĸ'), + ('áģ¤', 'áģ¤'), + ('áģĻ', 'áģĻ'), + ('áģ¨', 'áģ¨'), + ('áģĒ', 'áģĒ'), + 
('áģŦ', 'áģŦ'), + ('áģŽ', 'áģŽ'), + ('áģ°', 'áģ°'), + ('áģ˛', 'áģ˛'), + ('áģ´', 'áģ´'), + ('áģļ', 'áģļ'), + ('áģ¸', 'áģ¸'), + ('áģē', 'áģē'), + ('áģŧ', 'áģŧ'), + ('áģž', 'áģž'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ¨', 'áŧ¯'), + ('áŧ¸', 'áŧŋ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ¨', 'áŊ¯'), + ('áž', 'ឯ'), + ('ឲ', 'áž´'), + ('ឡ', 'ážŧ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ¨', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋˇ', 'áŋŧ'), + ('âĻ', 'âĻ'), + ('âĒ', 'âĢ'), + ('â˛', 'â˛'), + ('â
', 'â
¯'), + ('â', 'â'), + ('âļ', 'â'), + ('â°', 'â°Ž'), + ('âą ', 'âą '), + ('âąĸ', '⹤'), + ('⹧', '⹧'), + ('⹊', '⹊'), + ('âąĢ', 'âąĢ'), + ('âą', 'âą°'), + ('⹲', '⹲'), + ('âąĩ', 'âąĩ'), + ('âąž', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('Ⲡ', 'Ⲡ'), + ('â˛ĸ', 'â˛ĸ'), + ('Ⲥ', 'Ⲥ'), + ('â˛Ļ', 'â˛Ļ'), + ('Ⲩ', 'Ⲩ'), + ('â˛Ē', 'â˛Ē'), + ('â˛Ŧ', 'â˛Ŧ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('â˛ļ', 'â˛ļ'), + ('Ⲹ', 'Ⲹ'), + ('â˛ē', 'â˛ē'), + ('â˛ŧ', 'â˛ŧ'), + ('Ⲟ', 'Ⲟ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ ', 'âŗ '), + ('âŗĸ', 'âŗĸ'), + ('âŗĢ', 'âŗĢ'), + ('âŗ', 'âŗ'), + ('âŗ˛', 'âŗ˛'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ę˛', 'ę˛'), + ('ę´', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), 
+ ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęŽ'), + ('ę°', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', '\u{a7c7}'), + ('\u{a7c9}', '\u{a7c9}'), + ('\u{a7f5}', '\u{a7f5}'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŧĄ', 'īŧē'), + ('đ', 'đ§'), + ('đ°', 'đ'), + ('đ˛', 'đ˛˛'), + ('đĸ ', 'đĸŋ'), + ('đš', 'đš'), + ('đ¤', 'đ¤Ą'), +]; + +pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('Âĩ', 'Âĩ'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'ġ'), + ('Äš', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'ÆŠ'), + ('ÆŦ', 'Æš'), + ('Æŧ', 'ÆŊ'), + ('Æŋ', 'Æŋ'), + ('Į', 'Č '), + ('Čĸ', 'Čŗ'), + ('Čē', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É ', 'ÉĄ'), + ('ÉŖ', 'ÉŖ'), + ('ÉĨ', 'ÉĻ'), + ('ɨ', 'ÉŦ'), + ('ɯ', 'ɯ'), + ('Éą', 'ɲ'), + ('Éĩ', 'Éĩ'), + ('ÉŊ', 'ÉŊ'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('\u{345}', '\u{345}'), + ('Í°', 'Íŗ'), + ('Íļ', '͡'), + ('Íģ', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Ī'), + ('Ī', 'Īĩ'), + ('Īˇ', 'Īģ'), + ('ĪŊ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('ÕĄ', 'Ö'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŊ', 'áŋ'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áĩš', 'áĩš'), + ('áĩŊ', 'áĩŊ'), + ('áļ', 'áļ'), + ('á¸', 'áē'), + ('áē', 'áē'), + ('áē ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âĻ', 'âĻ'), + ('âĒ', 'âĢ'), + ('â˛', 'â˛'), + ('â
', 'â
'), + ('â
', 'â
ŋ'), + ('â', 'â'), + ('âļ', 'âŠ'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âą°'), + ('⹲', 'âąŗ'), + ('âąĩ', 'âąļ'), + ('âąž', 'âŗŖ'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ę¯'), + ('ę˛', 'ę¯'), + ('ęš', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ęŽ'), + ('ę°', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', '\u{a7f6}'), + ('ę', 'ę'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đĸ ', 'đŖ'), + ('đš', 'đšŋ'), + ('đ¤', 'đĨ'), +]; + +pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('Ã', 'Ã'), + ('Ã', 'Ã'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä ', 'Ä '), + ('Äĸ', 'Äĸ'), + ('Ĥ', 'Ĥ'), + ('ÄĻ', 'ÄĻ'), + ('Ĩ', 'Ĩ'), + ('ÄĒ', 'ÄĒ'), + ('ÄŦ', 'ÄŦ'), + ('ÄŽ', 'ÄŽ'), + ('Ä°', 'Ä°'), + ('IJ', 'IJ'), + ('Ä´', 'Ä´'), + ('Äļ', 'Äļ'), + ('Äš', 'Äš'), + ('Äģ', 'Äģ'), + ('ÄŊ', 'ÄŊ'), + ('Äŋ', 'Äŋ'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å
', 'Å
'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å ', 'Å '), + ('Åĸ', 'Åĸ'), + ('Ť', 'Ť'), + ('ÅĻ', 'ÅĻ'), + ('Ũ', 'Ũ'), + ('ÅĒ', 'ÅĒ'), + ('ÅŦ', 'ÅŦ'), + ('ÅŽ', 'ÅŽ'), + ('Å°', 'Å°'), + ('Å˛', 'Å˛'), + ('Å´', 'Å´'), + ('Åļ', 'Åļ'), + ('Ÿ', 'Åš'), + ('Åģ', 'Åģ'), + ('ÅŊ', 'ÅŊ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ '), + ('Æĸ', 'Æĸ'), + ('Ƥ', 'Ƥ'), + ('ÆĻ', 'Ƨ'), + ('ÆŠ', 'ÆŠ'), + ('ÆŦ', 'ÆŦ'), + ('ÆŽ', 'Ư'), + ('Æą', 'Æŗ'), + ('Æĩ', 'Æĩ'), + ('Æˇ', 'Ƹ'), + ('Æŧ', 'Æŧ'), + ('Į', 'Į
'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į ', 'Į '), + ('Įĸ', 'Įĸ'), + ('Į¤', 'Į¤'), + ('ĮĻ', 'ĮĻ'), + ('Į¨', 'Į¨'), + ('ĮĒ', 'ĮĒ'), + ('ĮŦ', 'ĮŦ'), + ('ĮŽ', 'ĮŽ'), + ('Įą', 'Į˛'), + ('Į´', 'Į´'), + ('Įļ', 'Į¸'), + ('Įē', 'Įē'), + ('Įŧ', 'Įŧ'), + ('Įž', 'Įž'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č ', 'Č '), + ('Čĸ', 'Čĸ'), + ('Ȥ', 'Ȥ'), + ('ČĻ', 'ČĻ'), + ('Ȩ', 'Ȩ'), + ('ČĒ', 'ČĒ'), + ('ČŦ', 'ČŦ'), + ('ČŽ', 'ČŽ'), + ('Č°', 'Č°'), + ('Ȳ', 'Ȳ'), + ('Čē', 'Čģ'), + ('ČŊ', 'Čž'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('Í°', 'Í°'), + ('Ͳ', 'Ͳ'), + ('Íļ', 'Íļ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ÎĢ'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī ', 'Ī '), + ('Īĸ', 'Īĸ'), + ('Ī¤', 'Ī¤'), + ('ĪĻ', 'ĪĻ'), + ('Ī¨', 'Ī¨'), + ('ĪĒ', 'ĪĒ'), + ('ĪŦ', 'ĪŦ'), + ('ĪŽ', 'ĪŽ'), + ('Ī´', 'Ī´'), + ('Īˇ', 'Īˇ'), + ('Īš', 'Īē'), + ('ĪŊ', 'Đ¯'), + ('Ņ ', 'Ņ '), + ('Ņĸ', 'Ņĸ'), + ('Ņ¤', 'Ņ¤'), + ('ŅĻ', 'ŅĻ'), + ('Ņ¨', 'Ņ¨'), + ('ŅĒ', 'ŅĒ'), + ('ŅŦ', 'ŅŦ'), + ('ŅŽ', 'ŅŽ'), + ('Ņ°', 'Ņ°'), + ('Ņ˛', 'Ņ˛'), + ('Ņ´', 'Ņ´'), + ('Ņļ', 'Ņļ'), + ('Ņ¸', 'Ņ¸'), + ('Ņē', 'Ņē'), + ('Ņŧ', 'Ņŧ'), + ('Ņž', 'Ņž'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō ', 'Ō '), + ('Ōĸ', 'Ōĸ'), + ('Ō¤', 'Ō¤'), + ('ŌĻ', 'ŌĻ'), + ('Ō¨', 'Ō¨'), + ('ŌĒ', 'ŌĒ'), + ('ŌŦ', 'ŌŦ'), + ('ŌŽ', 'ŌŽ'), + ('Ō°', 'Ō°'), + ('Ō˛', 'Ō˛'), + ('Ō´', 'Ō´'), + ('Ōļ', 'Ōļ'), + ('Ō¸', 'Ō¸'), + ('Ōē', 'Ōē'), + ('Ōŧ', 'Ōŧ'), + ('Ōž', 'Ōž'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ
', 'Ķ
'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ ', 'Ķ '), + ('Ķĸ', 'Ķĸ'), + ('Ķ¤', 'Ķ¤'), + ('ĶĻ', 'ĶĻ'), + ('Ķ¨', 'Ķ¨'), + ('ĶĒ', 'ĶĒ'), + ('ĶŦ', 'ĶŦ'), + ('ĶŽ', 'ĶŽ'), + ('Ķ°', 'Ķ°'), + ('Ķ˛', 'Ķ˛'), + ('Ķ´', 'Ķ´'), + ('Ķļ', 'Ķļ'), + ('Ķ¸', 'Ķ¸'), + ('Ķē', 'Ķē'), + ('Ķŧ', 'Ķŧ'), + ('Ķž', 'Ķž'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô ', 'Ô '), + ('Ôĸ', 'Ôĸ'), + ('Ô¤', 'Ô¤'), + ('ÔĻ', 'ÔĻ'), + ('Ô¨', 'Ô¨'), + ('ÔĒ', 'ÔĒ'), + ('ÔŦ', 'ÔŦ'), + ('ÔŽ', 'ÔŽ'), + ('Ôą', 'Õ'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('Ḡ', 'Ḡ'), + ('á¸ĸ', 'á¸ĸ'), + ('Ḥ', 'Ḥ'), + ('á¸Ļ', 'á¸Ļ'), + ('Ḩ', 'Ḩ'), + ('á¸Ē', 'á¸Ē'), + ('á¸Ŧ', 'á¸Ŧ'), + ('Ḏ', 'Ḏ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('á¸ļ', 'á¸ļ'), + ('Ḹ', 'Ḹ'), + ('á¸ē', 'á¸ē'), + ('á¸ŧ', 'á¸ŧ'), + ('Ḟ', 'Ḟ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš ', 'áš '), + ('ášĸ', 'ášĸ'), + ('ᚤ', 'ᚤ'), + ('ášĻ', 'ášĻ'), + ('ᚨ', 'ᚨ'), + ('ášĒ', 'ášĒ'), + ('ášŦ', 'ášŦ'), + ('ᚎ', 'ᚎ'), + ('áš°', 'áš°'), + ('ᚲ', 'ᚲ'), + ('áš´', 'áš´'), + ('ášļ', 'ášļ'), + ('ᚸ', 'ᚸ'), + ('ášē', 'ášē'), + ('ášŧ', 'ášŧ'), + ('ášž', 'ášž'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē ', 'áē '), + ('áēĸ', 'áēĸ'), + ('áē¤', 'áē¤'), + ('áēĻ', 'áēĻ'), + ('áē¨', 'áē¨'), + ('áēĒ', 'áēĒ'), + ('áēŦ', 'áēŦ'), + ('áēŽ', 'áēŽ'), + ('áē°', 'áē°'), + ('áē˛', 'áē˛'), + ('áē´', 'áē´'), + ('áēļ', 'áēļ'), + ('áē¸', 'áē¸'), + ('áēē', 'áēē'), + ('áēŧ', 'áēŧ'), + ('áēž', 'áēž'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ ', 'áģ '), + ('áģĸ', 'áģĸ'), + ('áģ¤', 'áģ¤'), + ('áģĻ', 'áģĻ'), + ('áģ¨', 'áģ¨'), + ('áģĒ', 'áģĒ'), + ('áģŦ', 'áģŦ'), + ('áģŽ', 'áģŽ'), + 
('áģ°', 'áģ°'), + ('áģ˛', 'áģ˛'), + ('áģ´', 'áģ´'), + ('áģļ', 'áģļ'), + ('áģ¸', 'áģ¸'), + ('áģē', 'áģē'), + ('áģŧ', 'áģŧ'), + ('áģž', 'áģž'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ¨', 'áŧ¯'), + ('áŧ¸', 'áŧŋ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ¨', 'áŊ¯'), + ('áž', 'áž'), + ('áž', 'áž'), + ('ឨ', 'ឯ'), + ('ី', 'ážŧ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ¨', 'áŋŦ'), + ('áŋ¸', 'áŋŧ'), + ('âĻ', 'âĻ'), + ('âĒ', 'âĢ'), + ('â˛', 'â˛'), + ('â
', 'â
¯'), + ('â', 'â'), + ('âļ', 'â'), + ('â°', 'â°Ž'), + ('âą ', 'âą '), + ('âąĸ', '⹤'), + ('⹧', '⹧'), + ('⹊', '⹊'), + ('âąĢ', 'âąĢ'), + ('âą', 'âą°'), + ('⹲', '⹲'), + ('âąĩ', 'âąĩ'), + ('âąž', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('Ⲡ', 'Ⲡ'), + ('â˛ĸ', 'â˛ĸ'), + ('Ⲥ', 'Ⲥ'), + ('â˛Ļ', 'â˛Ļ'), + ('Ⲩ', 'Ⲩ'), + ('â˛Ē', 'â˛Ē'), + ('â˛Ŧ', 'â˛Ŧ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('â˛ļ', 'â˛ļ'), + ('Ⲹ', 'Ⲹ'), + ('â˛ē', 'â˛ē'), + ('â˛ŧ', 'â˛ŧ'), + ('Ⲟ', 'Ⲟ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ ', 'âŗ '), + ('âŗĸ', 'âŗĸ'), + ('âŗĢ', 'âŗĢ'), + ('âŗ', 'âŗ'), + ('âŗ˛', 'âŗ˛'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ę˛', 'ę˛'), + ('ę´', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), 
+ ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęŽ'), + ('ę°', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', '\u{a7c7}'), + ('\u{a7c9}', '\u{a7c9}'), + ('\u{a7f5}', '\u{a7f5}'), + ('īŧĄ', 'īŧē'), + ('đ', 'đ§'), + ('đ°', 'đ'), + ('đ˛', 'đ˛˛'), + ('đĸ ', 'đĸŋ'), + ('đš', 'đš'), + ('đ¤', 'đ¤Ą'), +]; + +pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[ + ('a', 'z'), + ('Âĩ', 'Âĩ'), + ('Ã', 'Ãļ'), + ('ø', 'Ãŋ'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä
', 'Ä
'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('ÄĄ', 'ÄĄ'), + ('ÄŖ', 'ÄŖ'), + ('ÄĨ', 'ÄĨ'), + ('ħ', 'ħ'), + ('ÄŠ', 'ÄŠ'), + ('ÄĢ', 'ÄĢ'), + ('Ä', 'Ä'), + ('į', 'į'), + ('Äą', 'Äą'), + ('Äŗ', 'Äŗ'), + ('Äĩ', 'Äĩ'), + ('ġ', 'ġ'), + ('Äē', 'Äē'), + ('Äŧ', 'Äŧ'), + ('Äž', 'Äž'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('ÅĄ', 'ÅĄ'), + ('ÅŖ', 'ÅŖ'), + ('ÅĨ', 'ÅĨ'), + ('ŧ', 'ŧ'), + ('ÅŠ', 'ÅŠ'), + ('ÅĢ', 'ÅĢ'), + ('Å', 'Å'), + ('ů', 'ů'), + ('Åą', 'Åą'), + ('Åŗ', 'Åŗ'), + ('Åĩ', 'Åĩ'), + ('Åˇ', 'Åˇ'), + ('Åē', 'Åē'), + ('Åŧ', 'Åŧ'), + ('Åž', 'Æ'), + ('Æ', 'Æ'), + ('Æ
', 'Æ
'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('ÆĄ', 'ÆĄ'), + ('ÆŖ', 'ÆŖ'), + ('ÆĨ', 'ÆĨ'), + ('ƨ', 'ƨ'), + ('Æ', 'Æ'), + ('Æ°', 'Æ°'), + ('Æ´', 'Æ´'), + ('Æļ', 'Æļ'), + ('Æš', 'Æš'), + ('ÆŊ', 'ÆŊ'), + ('Æŋ', 'Æŋ'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('ĮĄ', 'ĮĄ'), + ('ĮŖ', 'ĮŖ'), + ('ĮĨ', 'ĮĨ'), + ('Į§', 'Į§'), + ('ĮŠ', 'ĮŠ'), + ('ĮĢ', 'ĮĢ'), + ('Į', 'Į'), + ('Į¯', 'Įą'), + ('Įŗ', 'Įŗ'), + ('Įĩ', 'Įĩ'), + ('Įš', 'Įš'), + ('Įģ', 'Įģ'), + ('ĮŊ', 'ĮŊ'), + ('Įŋ', 'Įŋ'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č
', 'Č
'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('ČŖ', 'ČŖ'), + ('ČĨ', 'ČĨ'), + ('ȧ', 'ȧ'), + ('ČŠ', 'ČŠ'), + ('ČĢ', 'ČĢ'), + ('Č', 'Č'), + ('Č¯', 'Č¯'), + ('Čą', 'Čą'), + ('Čŗ', 'Čŗ'), + ('Čŧ', 'Čŧ'), + ('Čŋ', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É ', 'ÉĄ'), + ('ÉŖ', 'ÉŖ'), + ('ÉĨ', 'ÉĻ'), + ('ɨ', 'ÉŦ'), + ('ɯ', 'ɯ'), + ('Éą', 'ɲ'), + ('Éĩ', 'Éĩ'), + ('ÉŊ', 'ÉŊ'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('\u{345}', '\u{345}'), + ('Íą', 'Íą'), + ('Íŗ', 'Íŗ'), + ('͡', '͡'), + ('Íģ', 'ÍŊ'), + ('Î', 'Î'), + ('ÎŦ', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('ĪĄ', 'ĪĄ'), + ('ĪŖ', 'ĪŖ'), + ('ĪĨ', 'ĪĨ'), + ('Ī§', 'Ī§'), + ('ĪŠ', 'ĪŠ'), + ('ĪĢ', 'ĪĢ'), + ('Ī', 'Ī'), + ('Ī¯', 'Īŗ'), + ('Īĩ', 'Īĩ'), + ('Ī¸', 'Ī¸'), + ('Īģ', 'Īģ'), + ('Đ°', 'Ņ'), + ('ŅĄ', 'ŅĄ'), + ('ŅŖ', 'ŅŖ'), + ('ŅĨ', 'ŅĨ'), + ('Ņ§', 'Ņ§'), + ('ŅŠ', 'ŅŠ'), + ('ŅĢ', 'ŅĢ'), + ('Ņ', 'Ņ'), + ('Ņ¯', 'Ņ¯'), + ('Ņą', 'Ņą'), + ('Ņŗ', 'Ņŗ'), + ('Ņĩ', 'Ņĩ'), + ('Ņˇ', 'Ņˇ'), + ('Ņš', 'Ņš'), + ('Ņģ', 'Ņģ'), + ('ŅŊ', 'ŅŊ'), + ('Ņŋ', 'Ņŋ'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('ŌĄ', 'ŌĄ'), + ('ŌŖ', 'ŌŖ'), + ('ŌĨ', 'ŌĨ'), + ('Ō§', 'Ō§'), + ('ŌŠ', 'ŌŠ'), + ('ŌĢ', 'ŌĢ'), + ('Ō', 'Ō'), + ('Ō¯', 'Ō¯'), + ('Ōą', 'Ōą'), + ('Ōŗ', 'Ōŗ'), + ('Ōĩ', 'Ōĩ'), + ('Ōˇ', 'Ōˇ'), + ('Ōš', 'Ōš'), + ('Ōģ', 'Ōģ'), + ('ŌŊ', 'ŌŊ'), + ('Ōŋ', 'Ōŋ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('ĶĄ', 'ĶĄ'), + ('ĶŖ', 'ĶŖ'), + ('ĶĨ', 
'ĶĨ'), + ('Ķ§', 'Ķ§'), + ('ĶŠ', 'ĶŠ'), + ('ĶĢ', 'ĶĢ'), + ('Ķ', 'Ķ'), + ('Ķ¯', 'Ķ¯'), + ('Ķą', 'Ķą'), + ('Ķŗ', 'Ķŗ'), + ('Ķĩ', 'Ķĩ'), + ('Ķˇ', 'Ķˇ'), + ('Ķš', 'Ķš'), + ('Ķģ', 'Ķģ'), + ('ĶŊ', 'ĶŊ'), + ('Ķŋ', 'Ķŋ'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô
', 'Ô
'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('ÔĄ', 'ÔĄ'), + ('ÔŖ', 'ÔŖ'), + ('ÔĨ', 'ÔĨ'), + ('Ô§', 'Ô§'), + ('ÔŠ', 'ÔŠ'), + ('ÔĢ', 'ÔĢ'), + ('Ô', 'Ô'), + ('Ô¯', 'Ô¯'), + ('ÕĄ', 'Ö'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('áĩš', 'áĩš'), + ('áĩŊ', 'áĩŊ'), + ('áļ', 'áļ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸
', 'á¸
'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('ḥ', 'ḥ'), + ('á¸Ŗ', 'á¸Ŗ'), + ('á¸Ĩ', 'á¸Ĩ'), + ('ḧ', 'ḧ'), + ('Ḋ', 'Ḋ'), + ('á¸Ģ', 'á¸Ģ'), + ('á¸', 'á¸'), + ('ḯ', 'ḯ'), + ('ḹ', 'ḹ'), + ('á¸ŗ', 'á¸ŗ'), + ('á¸ĩ', 'á¸ĩ'), + ('ḡ', 'ḡ'), + ('Ḛ', 'Ḛ'), + ('á¸ģ', 'á¸ģ'), + ('á¸Ŋ', 'á¸Ŋ'), + ('á¸ŋ', 'á¸ŋ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš
', 'áš
'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('ᚥ', 'ᚥ'), + ('ášŖ', 'ášŖ'), + ('ášĨ', 'ášĨ'), + ('ᚧ', 'ᚧ'), + ('ᚊ', 'ᚊ'), + ('ášĢ', 'ášĢ'), + ('áš', 'áš'), + ('ᚯ', 'ᚯ'), + ('ášą', 'ášą'), + ('ášŗ', 'ášŗ'), + ('ášĩ', 'ášĩ'), + ('ᚡ', 'ᚡ'), + ('ášš', 'ášš'), + ('ášģ', 'ášģ'), + ('ášŊ', 'ášŊ'), + ('ášŋ', 'ášŋ'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē
', 'áē
'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áēĄ', 'áēĄ'), + ('áēŖ', 'áēŖ'), + ('áēĨ', 'áēĨ'), + ('áē§', 'áē§'), + ('áēŠ', 'áēŠ'), + ('áēĢ', 'áēĢ'), + ('áē', 'áē'), + ('áē¯', 'áē¯'), + ('áēą', 'áēą'), + ('áēŗ', 'áēŗ'), + ('áēĩ', 'áēĩ'), + ('áēˇ', 'áēˇ'), + ('áēš', 'áēš'), + ('áēģ', 'áēģ'), + ('áēŊ', 'áēŊ'), + ('áēŋ', 'áēŋ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ
', 'áģ
'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģĄ', 'áģĄ'), + ('áģŖ', 'áģŖ'), + ('áģĨ', 'áģĨ'), + ('áģ§', 'áģ§'), + ('áģŠ', 'áģŠ'), + ('áģĢ', 'áģĢ'), + ('áģ', 'áģ'), + ('áģ¯', 'áģ¯'), + ('áģą', 'áģą'), + ('áģŗ', 'áģŗ'), + ('áģĩ', 'áģĩ'), + ('áģˇ', 'áģˇ'), + ('áģš', 'áģš'), + ('áģģ', 'áģģ'), + ('áģŊ', 'áģŊ'), + ('áģŋ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŧ§'), + ('áŧ°', 'áŧˇ'), + ('áŊ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ ', 'áŊ§'), + ('áŊ°', 'áŊŊ'), + ('áž', 'áž'), + ('áž', 'áž'), + ('áž ', 'ឧ'), + ('áž°', 'áž´'), + ('ážļ', 'ឡ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋ§'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋˇ'), + ('â
', 'â
'), + ('â
°', 'â
ŋ'), + ('â', 'â'), + ('â', 'âŠ'), + ('â°°', 'âą'), + ('⹥', '⹥'), + ('âąĨ', 'âąĻ'), + ('⹨', '⹨'), + ('âąĒ', 'âąĒ'), + ('âąŦ', 'âąŦ'), + ('âąŗ', 'âąŗ'), + ('âąļ', 'âąļ'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛
', 'â˛
'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('ⲥ', 'ⲥ'), + ('â˛Ŗ', 'â˛Ŗ'), + ('â˛Ĩ', 'â˛Ĩ'), + ('ⲧ', 'ⲧ'), + ('Ⲋ', 'Ⲋ'), + ('â˛Ģ', 'â˛Ģ'), + ('â˛', 'â˛'), + ('â˛¯', 'â˛¯'), + ('ⲹ', 'ⲹ'), + ('â˛ŗ', 'â˛ŗ'), + ('â˛ĩ', 'â˛ĩ'), + ('ⲡ', 'ⲡ'), + ('Ⲛ', 'Ⲛ'), + ('â˛ģ', 'â˛ģ'), + ('â˛Ŋ', 'â˛Ŋ'), + ('â˛ŋ', 'â˛ŋ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ
', 'âŗ
'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗĄ', 'âŗĄ'), + ('âŗŖ', 'âŗŖ'), + ('âŗŦ', 'âŗŦ'), + ('âŗŽ', 'âŗŽ'), + ('âŗŗ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¯'), + ('ęŗ', 'ęŗ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¯'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('\u{a7c8}', '\u{a7c8}'), + ('\u{a7ca}', '\u{a7ca}'), + ('\u{a7f6}', '\u{a7f6}'), + ('ę', 'ę'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŊ', 'īŊ'), + ('đ¨', 'đ'), + ('đ', 'đģ'), + ('đŗ', 'đŗ˛'), + ('đŖ', 'đŖ'), + ('đš ', 'đšŋ'), + ('đ¤ĸ', 'đĨ'), +]; + +pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[ + ('a', 'z'), + ('Âĩ', 'Âĩ'), + ('Ã', 'Ãļ'), + ('ø', 'Ãŋ'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä
', 'Ä
'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('ÄĄ', 'ÄĄ'), + ('ÄŖ', 'ÄŖ'), + ('ÄĨ', 'ÄĨ'), + ('ħ', 'ħ'), + ('ÄŠ', 'ÄŠ'), + ('ÄĢ', 'ÄĢ'), + ('Ä', 'Ä'), + ('į', 'į'), + ('Äą', 'Äą'), + ('Äŗ', 'Äŗ'), + ('Äĩ', 'Äĩ'), + ('ġ', 'ġ'), + ('Äē', 'Äē'), + ('Äŧ', 'Äŧ'), + ('Äž', 'Äž'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('ÅĄ', 'ÅĄ'), + ('ÅŖ', 'ÅŖ'), + ('ÅĨ', 'ÅĨ'), + ('ŧ', 'ŧ'), + ('ÅŠ', 'ÅŠ'), + ('ÅĢ', 'ÅĢ'), + ('Å', 'Å'), + ('ů', 'ů'), + ('Åą', 'Åą'), + ('Åŗ', 'Åŗ'), + ('Åĩ', 'Åĩ'), + ('Åˇ', 'Åˇ'), + ('Åē', 'Åē'), + ('Åŧ', 'Åŧ'), + ('Åž', 'Æ'), + ('Æ', 'Æ'), + ('Æ
', 'Æ
'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('ÆĄ', 'ÆĄ'), + ('ÆŖ', 'ÆŖ'), + ('ÆĨ', 'ÆĨ'), + ('ƨ', 'ƨ'), + ('Æ', 'Æ'), + ('Æ°', 'Æ°'), + ('Æ´', 'Æ´'), + ('Æļ', 'Æļ'), + ('Æš', 'Æš'), + ('ÆŊ', 'ÆŊ'), + ('Æŋ', 'Æŋ'), + ('Į
', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('ĮĄ', 'ĮĄ'), + ('ĮŖ', 'ĮŖ'), + ('ĮĨ', 'ĮĨ'), + ('Į§', 'Į§'), + ('ĮŠ', 'ĮŠ'), + ('ĮĢ', 'ĮĢ'), + ('Į', 'Į'), + ('Į¯', 'Į°'), + ('Į˛', 'Įŗ'), + ('Įĩ', 'Įĩ'), + ('Įš', 'Įš'), + ('Įģ', 'Įģ'), + ('ĮŊ', 'ĮŊ'), + ('Įŋ', 'Įŋ'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č
', 'Č
'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('ČŖ', 'ČŖ'), + ('ČĨ', 'ČĨ'), + ('ȧ', 'ȧ'), + ('ČŠ', 'ČŠ'), + ('ČĢ', 'ČĢ'), + ('Č', 'Č'), + ('Č¯', 'Č¯'), + ('Čą', 'Čą'), + ('Čŗ', 'Čŗ'), + ('Čŧ', 'Čŧ'), + ('Čŋ', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É ', 'ÉĄ'), + ('ÉŖ', 'ÉŖ'), + ('ÉĨ', 'ÉĻ'), + ('ɨ', 'ÉŦ'), + ('ɯ', 'ɯ'), + ('Éą', 'ɲ'), + ('Éĩ', 'Éĩ'), + ('ÉŊ', 'ÉŊ'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('Ę', 'Ę'), + ('\u{345}', '\u{345}'), + ('Íą', 'Íą'), + ('Íŗ', 'Íŗ'), + ('͡', '͡'), + ('Íģ', 'ÍŊ'), + ('Î', 'Î'), + ('ÎŦ', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('ĪĄ', 'ĪĄ'), + ('ĪŖ', 'ĪŖ'), + ('ĪĨ', 'ĪĨ'), + ('Ī§', 'Ī§'), + ('ĪŠ', 'ĪŠ'), + ('ĪĢ', 'ĪĢ'), + ('Ī', 'Ī'), + ('Ī¯', 'Īŗ'), + ('Īĩ', 'Īĩ'), + ('Ī¸', 'Ī¸'), + ('Īģ', 'Īģ'), + ('Đ°', 'Ņ'), + ('ŅĄ', 'ŅĄ'), + ('ŅŖ', 'ŅŖ'), + ('ŅĨ', 'ŅĨ'), + ('Ņ§', 'Ņ§'), + ('ŅŠ', 'ŅŠ'), + ('ŅĢ', 'ŅĢ'), + ('Ņ', 'Ņ'), + ('Ņ¯', 'Ņ¯'), + ('Ņą', 'Ņą'), + ('Ņŗ', 'Ņŗ'), + ('Ņĩ', 'Ņĩ'), + ('Ņˇ', 'Ņˇ'), + ('Ņš', 'Ņš'), + ('Ņģ', 'Ņģ'), + ('ŅŊ', 'ŅŊ'), + ('Ņŋ', 'Ņŋ'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('ŌĄ', 'ŌĄ'), + ('ŌŖ', 'ŌŖ'), + ('ŌĨ', 'ŌĨ'), + ('Ō§', 'Ō§'), + ('ŌŠ', 'ŌŠ'), + ('ŌĢ', 'ŌĢ'), + ('Ō', 'Ō'), + ('Ō¯', 'Ō¯'), + ('Ōą', 'Ōą'), + ('Ōŗ', 'Ōŗ'), + ('Ōĩ', 'Ōĩ'), + ('Ōˇ', 'Ōˇ'), + ('Ōš', 'Ōš'), + ('Ōģ', 'Ōģ'), + ('ŌŊ', 'ŌŊ'), + ('Ōŋ', 'Ōŋ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('ĶĄ', 'ĶĄ'), + ('ĶŖ', 'ĶŖ'), + ('ĶĨ', 
'ĶĨ'), + ('Ķ§', 'Ķ§'), + ('ĶŠ', 'ĶŠ'), + ('ĶĢ', 'ĶĢ'), + ('Ķ', 'Ķ'), + ('Ķ¯', 'Ķ¯'), + ('Ķą', 'Ķą'), + ('Ķŗ', 'Ķŗ'), + ('Ķĩ', 'Ķĩ'), + ('Ķˇ', 'Ķˇ'), + ('Ķš', 'Ķš'), + ('Ķģ', 'Ķģ'), + ('ĶŊ', 'ĶŊ'), + ('Ķŋ', 'Ķŋ'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô
', 'Ô
'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('ÔĄ', 'ÔĄ'), + ('ÔŖ', 'ÔŖ'), + ('ÔĨ', 'ÔĨ'), + ('Ô§', 'Ô§'), + ('ÔŠ', 'ÔŠ'), + ('ÔĢ', 'ÔĢ'), + ('Ô', 'Ô'), + ('Ô¯', 'Ô¯'), + ('ÕĄ', 'Ö'), + ('á', 'áē'), + ('áŊ', 'áŋ'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('áĩš', 'áĩš'), + ('áĩŊ', 'áĩŊ'), + ('áļ', 'áļ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸
', 'á¸
'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('ḥ', 'ḥ'), + ('á¸Ŗ', 'á¸Ŗ'), + ('á¸Ĩ', 'á¸Ĩ'), + ('ḧ', 'ḧ'), + ('Ḋ', 'Ḋ'), + ('á¸Ģ', 'á¸Ģ'), + ('á¸', 'á¸'), + ('ḯ', 'ḯ'), + ('ḹ', 'ḹ'), + ('á¸ŗ', 'á¸ŗ'), + ('á¸ĩ', 'á¸ĩ'), + ('ḡ', 'ḡ'), + ('Ḛ', 'Ḛ'), + ('á¸ģ', 'á¸ģ'), + ('á¸Ŋ', 'á¸Ŋ'), + ('á¸ŋ', 'á¸ŋ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš
', 'áš
'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('ᚥ', 'ᚥ'), + ('ášŖ', 'ášŖ'), + ('ášĨ', 'ášĨ'), + ('ᚧ', 'ᚧ'), + ('ᚊ', 'ᚊ'), + ('ášĢ', 'ášĢ'), + ('áš', 'áš'), + ('ᚯ', 'ᚯ'), + ('ášą', 'ášą'), + ('ášŗ', 'ášŗ'), + ('ášĩ', 'ášĩ'), + ('ᚡ', 'ᚡ'), + ('ášš', 'ášš'), + ('ášģ', 'ášģ'), + ('ášŊ', 'ášŊ'), + ('ášŋ', 'ášŋ'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē
', 'áē
'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áēĄ', 'áēĄ'), + ('áēŖ', 'áēŖ'), + ('áēĨ', 'áēĨ'), + ('áē§', 'áē§'), + ('áēŠ', 'áēŠ'), + ('áēĢ', 'áēĢ'), + ('áē', 'áē'), + ('áē¯', 'áē¯'), + ('áēą', 'áēą'), + ('áēŗ', 'áēŗ'), + ('áēĩ', 'áēĩ'), + ('áēˇ', 'áēˇ'), + ('áēš', 'áēš'), + ('áēģ', 'áēģ'), + ('áēŊ', 'áēŊ'), + ('áēŋ', 'áēŋ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ
', 'áģ
'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģĄ', 'áģĄ'), + ('áģŖ', 'áģŖ'), + ('áģĨ', 'áģĨ'), + ('áģ§', 'áģ§'), + ('áģŠ', 'áģŠ'), + ('áģĢ', 'áģĢ'), + ('áģ', 'áģ'), + ('áģ¯', 'áģ¯'), + ('áģą', 'áģą'), + ('áģŗ', 'áģŗ'), + ('áģĩ', 'áģĩ'), + ('áģˇ', 'áģˇ'), + ('áģš', 'áģš'), + ('áģģ', 'áģģ'), + ('áģŊ', 'áģŊ'), + ('áģŋ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŧ§'), + ('áŧ°', 'áŧˇ'), + ('áŊ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ ', 'áŊ§'), + ('áŊ°', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ឡ'), + ('ážŧ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋ§'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋˇ'), + ('áŋŧ', 'áŋŧ'), + ('â
', 'â
'), + ('â
°', 'â
ŋ'), + ('â', 'â'), + ('â', 'âŠ'), + ('â°°', 'âą'), + ('⹥', '⹥'), + ('âąĨ', 'âąĻ'), + ('⹨', '⹨'), + ('âąĒ', 'âąĒ'), + ('âąŦ', 'âąŦ'), + ('âąŗ', 'âąŗ'), + ('âąļ', 'âąļ'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛
', 'â˛
'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('ⲥ', 'ⲥ'), + ('â˛Ŗ', 'â˛Ŗ'), + ('â˛Ĩ', 'â˛Ĩ'), + ('ⲧ', 'ⲧ'), + ('Ⲋ', 'Ⲋ'), + ('â˛Ģ', 'â˛Ģ'), + ('â˛', 'â˛'), + ('â˛¯', 'â˛¯'), + ('ⲹ', 'ⲹ'), + ('â˛ŗ', 'â˛ŗ'), + ('â˛ĩ', 'â˛ĩ'), + ('ⲡ', 'ⲡ'), + ('Ⲛ', 'Ⲛ'), + ('â˛ģ', 'â˛ģ'), + ('â˛Ŋ', 'â˛Ŋ'), + ('â˛ŋ', 'â˛ŋ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ
', 'âŗ
'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗĄ', 'âŗĄ'), + ('âŗŖ', 'âŗŖ'), + ('âŗŦ', 'âŗŦ'), + ('âŗŽ', 'âŗŽ'), + ('âŗŗ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¯'), + ('ęŗ', 'ęŗ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¯'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('\u{a7c8}', '\u{a7c8}'), + ('\u{a7ca}', '\u{a7ca}'), + ('\u{a7f6}', '\u{a7f6}'), + ('ę', 'ę'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŊ', 'īŊ'), + ('đ¨', 'đ'), + ('đ', 'đģ'), + ('đŗ', 'đŗ˛'), + ('đŖ', 'đŖ'), + ('đš ', 'đšŋ'), + ('đ¤ĸ', 'đĨ'), +]; + +pub const DASH: &'static [(char, char)] = &[ + ('-', '-'), + ('Ö', 'Ö'), + ('Öž', 'Öž'), + ('á', 'á'), + ('á ', 'á '), + ('â', 'â'), + ('â', 'â'), + ('âģ', 'âģ'), + ('â', 'â'), + ('â', 'â'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('â¸ē', 'â¸ģ'), + ('âš', 'âš'), + ('ã', 'ã'), + ('ã°', 'ã°'), + ('ã ', 'ã '), + ('ī¸ą', 'ī¸˛'), + ('īš', 'īš'), + ('īšŖ', 'īšŖ'), + ('īŧ', 'īŧ'), + ('\u{10ead}', '\u{10ead}'), +]; + +pub const DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{34f}', '\u{34f}'), + ('\u{61c}', '\u{61c}'), + ('á
', 'á
'), + ('\u{17b4}', '\u{17b5}'), + ('\u{180b}', '\u{180e}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('ã
¤', 'ã
¤'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{feff}', '\u{feff}'), + ('īž ', 'īž '), + ('\u{fff0}', '\u{fff8}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0000}', '\u{e0fff}'), +]; + +pub const DEPRECATED: &'static [(char, char)] = &[ + ('Å', 'Å'), + ('Ųŗ', 'Ųŗ'), + ('\u{f77}', '\u{f77}'), + ('\u{f79}', '\u{f79}'), + ('áŖ', 'á¤'), + ('\u{206a}', '\u{206f}'), + ('âŠ', 'âĒ'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const DIACRITIC: &'static [(char, char)] = &[ + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('¯', '¯'), + ('´', '´'), + ('¡', '¸'), + ('Ę°', '\u{34e}'), + ('\u{350}', '\u{357}'), + ('\u{35d}', '\u{362}'), + ('Í´', 'Íĩ'), + ('Íē', 'Íē'), + ('Î', 'Î
'), + ('\u{483}', '\u{487}'), + ('Õ', 'Õ'), + ('\u{591}', '\u{5a1}'), + ('\u{5a3}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c4}'), + ('\u{64b}', '\u{652}'), + ('\u{657}', '\u{658}'), + ('\u{6df}', '\u{6e0}'), + ('ÛĨ', 'ÛĻ'), + ('\u{6ea}', '\u{6ec}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', 'ßĩ'), + ('\u{818}', '\u{819}'), + ('\u{8e3}', '\u{8fe}'), + ('\u{93c}', '\u{93c}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{954}'), + ('āĨą', 'āĨą'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a4d}', '\u{a4d}'), + ('\u{abc}', '\u{abc}'), + ('\u{acd}', '\u{acd}'), + ('\u{afd}', '\u{aff}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b55}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c4d}', '\u{c4d}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{ccd}', '\u{ccd}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{dca}', '\u{dca}'), + ('\u{e47}', '\u{e4c}'), + ('\u{e4e}', '\u{e4e}'), + ('\u{eba}', '\u{eba}'), + ('\u{ec8}', '\u{ecc}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŧŋ'), + ('\u{f82}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{1037}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('áŖ', 'á¤'), + ('áŠ', 'á'), + ('á', '\u{108d}'), + ('á', 'á'), + ('á', 'á'), + ('\u{135d}', '\u{135f}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a75}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1b34}', '\u{1b34}'), + ('á', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('áŽĒ', '\u{1bab}'), + ('\u{1c36}', '\u{1c37}'), + ('Ṹ', 'áąŊ'), + ('\u{1cd0}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('áŗˇ', '\u{1cf9}'), + ('á´Ŧ', 'áĩĒ'), + ('\u{1dc4}', '\u{1dcf}'), + ('\u{1df5}', '\u{1df9}'), + ('\u{1dfd}', '\u{1dff}'), + ('ážŊ', 'ážŊ'), + ('ážŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 
'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋŊ', 'áŋž'), + ('\u{2cef}', '\u{2cf1}'), + ('ⸯ', 'ⸯ'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', 'ã'), + ('ãŧ', 'ãŧ'), + ('\u{a66f}', '\u{a66f}'), + ('\u{a67c}', '\u{a67d}'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('\u{a6f0}', '\u{a6f1}'), + ('ę', 'ęĄ'), + ('ę', 'ę'), + ('ę¸', 'ęš'), + ('\u{a8c4}', '\u{a8c4}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a92b}', 'ꤎ'), + ('ęĨ', 'ęĨ'), + ('\u{a9b3}', '\u{a9b3}'), + ('ę§', 'ę§'), + ('\u{a9e5}', '\u{a9e5}'), + ('ęŠģ', 'ęŠŊ'), + ('\u{aabf}', 'ęĢ'), + ('\u{aaf6}', '\u{aaf6}'), + ('ę', 'ę'), + ('\u{ab69}', '\u{ab6b}'), + ('ę¯Ŧ', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe20}', '\u{fe2f}'), + ('īŧž', 'īŧž'), + ('īŊ', 'īŊ'), + ('īŊ°', 'īŊ°'), + ('\u{ff9e}', '\u{ff9f}'), + ('īŋŖ', 'īŋŖ'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10ae5}', '\u{10ae6}'), + ('đ´ĸ', '\u{10d27}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{11133}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('đ', 'đ'), + ('\u{111ca}', '\u{111cc}'), + ('đĩ', '\u{11236}'), + ('\u{112e9}', '\u{112ea}'), + ('\u{1133c}', '\u{1133c}'), + ('đ', 'đ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11442}', '\u{11442}'), + ('\u{11446}', '\u{11446}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{1163f}', '\u{1163f}'), + ('đļ', '\u{116b7}'), + ('\u{1172b}', '\u{1172b}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193d}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a34}', '\u{11a34}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a99}', '\u{11a99}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11d42}', '\u{11d42}'), + ('\u{11d44}', '\u{11d45}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f8f}', 'đž'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{1d167}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e946}'), + ('\u{1e948}', '\u{1e94a}'), +]; + +pub const EMOJI: &'static [(char, char)] = &[ + ('#', '#'), + ('*', '*'), + ('0', '9'), + ('Š', 'Š'), + ('ÂŽ', 'ÂŽ'), + ('âŧ', 'âŧ'), + ('â', 'â'), + ('âĸ', 'âĸ'), + ('âš', 'âš'), + ('â', 'â'), + ('âŠ', 'âĒ'), + ('â', 'â'), + ('â¨', 'â¨'), + ('â', 'â'), + ('âŠ', 'âŗ'), + ('â¸', 'âē'), + ('â', 'â'), + ('âĒ', 'âĢ'), + ('âļ', 'âļ'), + ('â', 'â'), + ('âģ', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â ', 'â '), + ('âĸ', 'âŖ'), + ('âĻ', 'âĻ'), + ('âĒ', 'âĒ'), + ('âŽ', 'â¯'), + ('â¸', 'âē'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â '), + ('âŖ', 'âŖ'), + ('âĨ', 'âĻ'), + ('â¨', 'â¨'), + ('âģ', 'âģ'), + ('âž', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â ', 'âĄ'), + ('â§', 'â§'), + ('âĒ', 'âĢ'), + ('â°', 'âą'), + ('âŊ', 'âž'), + ('â', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âŠ', 'âĒ'), + ('â°', 'âĩ'), + ('âˇ', 'âē'), + ('âŊ', 'âŊ'), + ('â', 'â'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĄ', 'âĄ'), + ('â¨', 'â¨'), + ('âŗ', 'â´'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âŖ', 'â¤'), + ('â', 'â'), + ('âĄ', 'âĄ'), + ('â°', 'â°'), + ('âŋ', 'âŋ'), + ('⤴', 'â¤ĩ'), + ('âŦ
', 'âŦ'), + ('âŦ', 'âŦ'), + ('â', 'â'), + ('â', 'â'), + ('ã°', 'ã°'), + ('ãŊ', 'ãŊ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ
°', 'đ
ą'), + ('đ
ž', 'đ
ŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĻ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đ˛', 'đē'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ¤', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ°'), + ('đŗ', 'đĩ'), + ('đˇ', 'đŊ'), + ('đŋ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đ¯', 'đ°'), + ('đŗ', 'đē'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¤', 'đĨ'), + ('đ¨', 'đ¨'), + ('đą', 'đ˛'), + ('đŧ', 'đŧ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĄ', 'đĄ'), + ('đŖ', 'đŖ'), + ('đ¨', 'đ¨'), + ('đ¯', 'đ¯'), + ('đŗ', 'đŗ'), + ('đē', 'đ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', '\u{1f6d7}'), + ('đ ', 'đĨ'), + ('đŠ', 'đŠ'), + ('đĢ', 'đŦ'), + ('đ°', 'đ°'), + ('đŗ', '\u{1f6fc}'), + ('đ ', 'đĢ'), + ('\u{1f90c}', 'đ¤ē'), + ('đ¤ŧ', 'đĨ
'), + ('đĨ', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đ§ŋ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), +]; + +pub const EMOJI_COMPONENT: &'static [(char, char)] = &[ + ('#', '#'), + ('*', '*'), + ('0', '9'), + ('\u{200d}', '\u{200d}'), + ('\u{20e3}', '\u{20e3}'), + ('\u{fe0f}', '\u{fe0f}'), + ('đĻ', 'đŋ'), + ('đģ', 'đŋ'), + ('đĻ°', 'đĻŗ'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const EMOJI_MODIFIER: &'static [(char, char)] = &[('đģ', 'đŋ')]; + +pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[ + ('â', 'â'), + ('âš', 'âš'), + ('â', 'â'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĻ', 'đ¸'), + ('đŧ', 'đŧ'), + ('đ', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĒ', 'đĒ'), + ('đ´', 'đĩ'), + ('đē', 'đē'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đŖ', 'đŖ'), + ('đ´', 'đļ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1f90c}', '\u{1f90c}'), + ('đ¤', 'đ¤'), + ('đ¤', 'đ¤'), + ('đ¤Ļ', 'đ¤Ļ'), + ('đ¤°', 'đ¤š'), + ('đ¤ŧ', 'đ¤ž'), + ('\u{1f977}', '\u{1f977}'), + ('đĻĩ', 'đĻļ'), + ('đĻ¸', 'đĻš'), + ('đĻģ', 'đĻģ'), + ('đ§', 'đ§'), + ('đ§', 'đ§'), +]; + +pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[ + ('â', 'â'), + ('âŠ', 'âŦ'), + ('â°', 'â°'), + ('âŗ', 'âŗ'), + ('âŊ', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('âĄ', 'âĄ'), + ('âĒ', 'âĢ'), + ('âŊ', 'âž'), + ('â', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('âĒ', 'âĒ'), + ('â˛', 'âŗ'), + ('âĩ', 'âĩ'), + ('âē', 'âē'), + ('âŊ', 'âŊ'), + ('â
', 'â
'), + ('â', 'â'), + ('â¨', 'â¨'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â°', 'â°'), + ('âŋ', 'âŋ'), + ('âŦ', 'âŦ'), + ('â', 'â'), + ('â', 'â'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĻ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đ˛', 'đļ'), + ('đ¸', 'đē'), + ('đ', 'đ'), + ('đ', 'đ '), + ('đ', 'đĩ'), + ('đˇ', 'đŧ'), + ('đž', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đ°'), + ('đ´', 'đ´'), + ('đ¸', 'đž'), + ('đ', 'đ'), + ('đ', 'đŧ'), + ('đŋ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đē', 'đē'), + ('đ', 'đ'), + ('đ¤', 'đ¤'), + ('đģ', 'đ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1f6d7}'), + ('đĢ', 'đŦ'), + ('đ´', '\u{1f6fc}'), + ('đ ', 'đĢ'), + ('\u{1f90c}', 'đ¤ē'), + ('đ¤ŧ', 'đĨ
'), + ('đĨ', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đ§ŋ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), +]; + +pub const EXTENDED_PICTOGRAPHIC: &'static [(char, char)] = &[ + ('Š', 'Š'), + ('ÂŽ', 'ÂŽ'), + ('âŧ', 'âŧ'), + ('â', 'â'), + ('âĸ', 'âĸ'), + ('âš', 'âš'), + ('â', 'â'), + ('âŠ', 'âĒ'), + ('â', 'â'), + ('â¨', 'â¨'), + ('â', 'â'), + ('â', 'â'), + ('âŠ', 'âŗ'), + ('â¸', 'âē'), + ('â', 'â'), + ('âĒ', 'âĢ'), + ('âļ', 'âļ'), + ('â', 'â'), + ('âģ', 'âž'), + ('â', 'â
'), + ('â', 'â'), + ('â', 'â
'), + ('â', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĄ', 'âĄ'), + ('â¨', 'â¨'), + ('âŗ', 'â´'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âŖ', 'â§'), + ('â', 'â'), + ('âĄ', 'âĄ'), + ('â°', 'â°'), + ('âŋ', 'âŋ'), + ('⤴', 'â¤ĩ'), + ('âŦ
', 'âŦ'), + ('âŦ', 'âŦ'), + ('â', 'â'), + ('â', 'â'), + ('ã°', 'ã°'), + ('ãŊ', 'ãŊ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('đ', '\u{1f0ff}'), + ('\u{1f10d}', '\u{1f10f}'), + ('đ¯', 'đ¯'), + ('đ
Ŧ', 'đ
ą'), + ('đ
ž', 'đ
ŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1f1ad}', '\u{1f1e5}'), + ('đ', '\u{1f20f}'), + ('đ', 'đ'), + ('đ¯', 'đ¯'), + ('đ˛', 'đē'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', 'đē'), + ('đ', 'đŊ'), + ('đ', 'đ'), + ('đ', '\u{1f6ff}'), + ('\u{1f774}', '\u{1f77f}'), + ('đ', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8ff}'), + ('\u{1f90c}', 'đ¤ē'), + ('đ¤ŧ', 'đĨ
'), + ('đĨ', '\u{1faff}'), + ('\u{1fc00}', '\u{1fffd}'), +]; + +pub const EXTENDER: &'static [(char, char)] = &[ + ('¡', '¡'), + ('Ë', 'Ë'), + ('Ų', 'Ų'), + ('ßē', 'ßē'), + ('\u{b55}', '\u{b55}'), + ('āš', 'āš'), + ('āģ', 'āģ'), + ('á ', 'á '), + ('áĄ', 'áĄ'), + ('áĒ§', 'áĒ§'), + ('\u{1c36}', '\u{1c36}'), + ('áąģ', 'áąģ'), + ('ã
', 'ã
'), + ('ãą', 'ãĩ'), + ('ã', 'ã'), + ('ãŧ', 'ãž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę§', 'ę§'), + ('ę§Ļ', 'ę§Ļ'), + ('ꊰ', 'ꊰ'), + ('ęĢ', 'ęĢ'), + ('ęĢŗ', 'ęĢ´'), + ('īŊ°', 'īŊ°'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11a98}', '\u{11a98}'), + ('đ', 'đ'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đŧ', 'đŊ'), + ('\u{1e944}', '\u{1e946}'), +]; + +pub const GRAPHEME_BASE: &'static [(char, char)] = &[ + (' ', '~'), + ('\u{a0}', 'ÂŦ'), + ('ÂŽ', 'Ëŋ'), + ('Í°', '͡'), + ('Íē', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Ö'), + ('Ö', 'Ö'), + ('Öž', 'Öž'), + ('×', '×'), + ('×', '×'), + ('×', '×'), + ('×', '×Ē'), + ('ׯ', '×´'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ų'), + ('Ų ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛĨ', 'ÛĻ'), + ('ÛŠ', 'ÛŠ'), + ('ÛŽ', 'Ü'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßē'), + ('ßž', 'ā '), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('ā °', 'ā ž'), + ('āĄ', 'āĄ'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤ģ', 'ā¤ģ'), + ('ā¤Ŋ', 'āĨ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨ¤', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('āĻŋ', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§Ļ', 'ā§Ŋ'), + ('ā¨', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('ā¨ž', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', 'āŠ¯'), + ('āŠ˛', 'āŠ´'), + ('āŠļ', 'āŠļ'), + ('āĒ', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢĻ', 'āĢą'), + ('āĢš', 'āĢš'), + ('āŦ', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'ā'), + ('ā', 'ā'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āĻ', 'āˇ'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('āŽŋ', 'āŽŋ'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯Ļ', 'ā¯ē'), + ('ā°', 'ā°'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('āąĻ', 'āą¯'), + ('āąˇ', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛ž'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗĻ', 'āŗ¯'), + ('āŗą', 'āŗ˛'), + ('ā´', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('ā´ŋ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩĻ', 'āĩŋ'), + ('āļ', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('āˇ', 'āˇ'), + ('āˇ', 'āˇ'), + ('āˇĻ', 'āˇ¯'), + ('āˇ˛', 'āˇ´'), + ('ā¸', 'ā¸°'), + ('ā¸˛', 'ā¸ŗ'), + ('ā¸ŋ', 'āš'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē°'), + ('āē˛', 'āēŗ'), + ('āēŊ', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ´'), + ('āŧļ', 'āŧļ'), + ('āŧ¸', 'āŧ¸'), + ('āŧē', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āŊŋ', 'āŊŋ'), + ('āž
', 'āž
'), + ('āž', 'āž'), + ('āžž', 'āŋ
'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), + ('á', 'áŦ'), + ('áą', 'áą'), + ('á¸', 'á¸'), + ('áģ', 'áŧ'), + ('áŋ', 'á'), + ('á', 'á'), + ('áĄ', 'á°'), + ('áĩ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áŧ'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'á'), + ('á ', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('áĩ', 'áļ'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á', 'áŗ'), + ('áļ', 'áļ'), + ('áž', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áŠ'), + ('á°', 'áš'), + ('á ', 'á '), + ('á ', 'á '), + ('á ', '᥸'), + ('áĸ', 'áĸ'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('á¤Ŗ', 'á¤Ļ'), + ('ᤊ', 'á¤Ģ'), + ('ᤰ', '᤹'), + ('á¤ŗ', 'ᤸ'), + ('áĨ', 'áĨ'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á§', 'á§'), + ('á§', 'á¨'), + ('á¨', 'á¨'), + ('á¨', 'áŠ'), + ('áŠ', 'áŠ'), + ('እ', 'እ'), + ('áŠŖ', 'ኤ'), + ('áŠ', 'ኲ'), + ('áĒ', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ ', 'áĒ'), + ('áŦ', 'áŦŗ'), + ('áŦģ', 'áŦģ'), + ('áŦŊ', 'á'), + ('á', 'á'), + ('á', 'áĒ'), + ('á´', 'áŧ'), + ('áŽ', 'Ꭵ'), + ('áŽĻ', 'Ꭷ'), + ('áŽĒ', 'áŽĒ'), + ('ᎎ', 'á¯Ĩ'), + ('ᯧ', 'ᯧ'), + ('á¯Ē', 'á¯Ŧ'), + ('ᯎ', 'ᯎ'), + ('á¯˛', 'á¯ŗ'), + ('á¯ŧ', 'á°Ģ'), + ('á°´', 'á°ĩ'), + ('á°ģ', 'áą'), + ('áą', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'áŗ'), + ('áŗ', 'áŗ'), + ('áŗĄ', 'áŗĄ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗˇ'), + ('áŗē', 'áŗē'), + ('á´', 'áļŋ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋž'), + ('\u{2000}', '\u{200a}'), + ('â', 'â§'), + ('\u{202f}', '\u{205f}'), + ('â°', 'âą'), + ('â´', 'â'), + ('â', 'â'), + ('â ', 'âŋ'), + ('â', 'â'), + ('â', 'âĻ'), + ('â', 'â'), + ('â ', 'âŗ'), + ('âļ', 'âŽ'), + ('\u{2b97}', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('âŗš', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ°'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('â¸', '\u{2e52}'), + ('âē', 'âē'), + ('âē', 'âģŗ'), + ('âŧ', 'âŋ'), + ('âŋ°', 'âŋģ'), + ('\u{3000}', 'ãŠ'), + ('ã°', 'ãŋ'), + ('ã', 'ã'), + ('ã', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã', 'ãŖ'), + ('ã°', 'ã'), + ('ã ', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ęĢ'), + ('ę', 'ęŽ'), + ('ęŗ', 'ęŗ'), + ('ęž', 'ę'), + ('ę ', 'ę¯'), + ('ę˛', 'ęˇ'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ¤'), + ('ę §', 'ę Ģ'), + ('ę °', 'ę š'), + ('ęĄ', 'ꥡ'), + ('ęĸ', 'ęŖ'), + ('ęŖ', 'ęŖ'), + ('ęŖ˛', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤎ', 'ęĨ'), + ('ęĨ', 'ęĨ'), + ('ęĨ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ęĻ´', 'ęĻĩ'), + ('ęĻē', 'ęĻģ'), + ('ęĻž', 'ę§'), + ('ę§', 'ę§'), + ('ę§', 'ꧤ'), + ('ę§Ļ', '꧞'), + ('ę¨', 'ꨨ'), + ('ę¨¯', 'ꨰ'), + ('ę¨ŗ', 'ꨴ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠģ'), + ('ęŠŊ', 'ęĒ¯'), + ('ęĒą', 'ęĒą'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒŊ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢĢ'), + ('ęĢŽ', 'ęĢĩ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', '\u{ab6b}'), + ('ę°', 'ę¯¤'), + ('ę¯Ļ', 'ę¯§'), + ('ę¯Š', 'ę¯Ŧ'), + ('ę¯°', 'ę¯š'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'ī¯'), + ('ī¯', 'ī´ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇŊ'), + ('ī¸', 'ī¸'), + ('ī¸°', 'īš'), + ('īš', 'īšĻ'), + ('īš¨', 'īšĢ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧ', 'īž'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ ', 'īŋĻ'), + ('īŋ¨', 'īŋŽ'), + ('īŋŧ', 'īŋŊ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ', 'đ'), + ('đ', 'đŗ'), + ('đˇ', 'đ'), + ('đ', '\u{1019c}'), + ('đ ', 'đ '), + ('đ', 'đŧ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đĄ', 'đģ'), + ('đ', 'đŖ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŠ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ¯', 'đ¯'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ', 'đĸ'), + ('đĸ§', 'đĸ¯'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đŖģ', 'đ¤'), + ('đ¤', 'đ¤š'), + ('đ¤ŋ', 'đ¤ŋ'), + ('đĻ', 'đĻˇ'), + ('đĻŧ', 'đ§'), + ('đ§', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ', 'đŠ'), + ('đŠ', 'đŠ'), + ('đŠ ', 'đĒ'), + ('đĢ', 'đĢ¤'), + ('đĢĢ', 'đĢļ'), + ('đŦ', 'đŦĩ'), + ('đŦš', 'đ'), + ('đ', 'đ˛'), + ('đ¸', 'đŽ'), + ('đŽ', 'đŽ'), + ('đŽŠ', 'đŽ¯'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đŗē', 'đ´Ŗ'), + ('đ´°', 'đ´š'), + ('đš ', 'đšž'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10ead}', '\u{10ead}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('đŊ', 'đŊ'), + ('\u{10fb0}', '\u{10fcb}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đ'), + ('đ', 'đˇ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ˛'), + ('đˇ', 'đ¸'), + ('đģ', 'đŧ'), + ('đž', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đš'), + ('đ', 'đĻ'), + ('đŦ', 'đŦ'), + ('đļ', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
´', 'đ
ļ'), + ('đ', 'đĩ'), + ('đŋ', 'đ'), + ('đ', '\u{111ce}'), + ('đ', 'đ'), + ('đĄ', 'đ´'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ˛', 'đŗ'), + ('đĩ', 'đĩ'), + ('đ¸', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('đ°', 'đ'), + ('đ ', 'đĸ'), + ('đ°', 'đš'), + ('đ', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đŋ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŖ'), + ('đ', 'đˇ'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đą', 'đ˛'), + ('đš', 'đš'), + ('đģ', 'đŧ'), + ('đž', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đą'), + ('đ¸', 'đģ'), + ('đž', 'đž'), + ('đ', 'đ'), + ('đ', 'đ˛'), + ('đģ', 'đŧ'), + ('đž', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŦ'), + ('đ', 'đĒ'), + ('đŦ', 'đŦ'), + ('đŽ', 'đ¯'), + ('đļ', 'đļ'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đĄ'), + ('đĻ', 'đĻ'), + ('đ°', 'đŋ'), + ('đ ', 'đ Ž'), + ('đ ¸', 'đ ¸'), + ('đ ģ', 'đ ģ'), + ('đĸ ', 'đŖ˛'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{11931}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193d}', '\u{1193d}'), + ('\u{1193f}', '\u{11942}'), + ('\u{11944}', '\u{11946}'), + ('\u{11950}', '\u{11959}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§', 'đ§'), + ('đ§Ą', 'đ§¤'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨š', 'đ¨ē'), + ('đ¨ŋ', 'đŠ'), + ('đŠ', 'đŠ'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĒ', 'đĒĸ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°¯'), + ('đ°ž', 'đ°ž'), + ('đą', 'đą
'), + ('đą', 'đąŦ'), + ('đą°', 'đ˛'), + ('đ˛Š', 'đ˛Š'), + ('đ˛ą', 'đ˛ą'), + ('đ˛´', 'đ˛´'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), + ('đģ ', 'đģ˛'), + ('đģĩ', 'đģ¸'), + ('\u{11fb0}', '\u{11fb0}'), + ('đŋ', 'đŋą'), + ('đŋŋ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ´'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đŠ ', 'đŠŠ'), + ('đŠŽ', 'đŠ¯'), + ('đĢ', 'đĢ'), + ('đĢĩ', 'đĢĩ'), + ('đŦ', 'đŦ¯'), + ('đŦˇ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đē'), + ('đŧ', 'đŊ'), + ('đŊ', 'đž'), + ('đž', 'đž'), + ('đŋ ', 'đŋŖ'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đĩ'), + ('đ', 'đĻ'), + ('đŠ', 'đ
¤'), + ('đ
Ļ', 'đ
Ļ'), + ('đ
Ē', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('đŽ', 'đ¨'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ§ŋ'), + ('đ¨ˇ', 'đ¨ē'), + ('đŠ', 'đŠ´'), + ('đŠļ', 'đĒ'), + ('đĒ
', 'đĒ'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ°', 'đš'), + ('đŋ', 'đŋ'), + ('đ ', 'đŖ'), + ('đŖ', 'đŖ'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đĨ', 'đĨ'), + ('đĨ', 'đĨ'), + ('đąą', 'đ˛´'), + ('đ´', 'đ´Ŋ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đģ°', 'đģą'), + ('đ', 'đĢ'), + ('đ°', 'đ'), + ('đ ', 'đŽ'), + ('đą', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', '\u{1f1ad}'), + ('đĻ', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đĨ'), + ('đ', '\u{1f6d7}'), + ('đ ', 'đŦ'), + ('đ°', '\u{1f6fc}'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đĢ'), + ('đ ', 'đ '), + ('đ ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đĄ ', 'đĸ'), + ('đĸ', 'đĸ'), + ('\u{1f8b0}', '\u{1f8b1}'), + ('đ¤', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đŠ'), + ('đŠ ', 'đŠ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), + ('\u{1fb00}', '\u{1fb92}'), + ('\u{1fb94}', '\u{1fbca}'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', 
'\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9be}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{ddf}', '\u{ddf}'), + 
('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', 
'\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', 
'\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11340}', '\u{11340}'), + ('\u{11357}', '\u{11357}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115af}', '\u{115af}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{11930}', '\u{11930}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + 
('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const GRAPHEME_LINK: &'static [(char, char)] = &[ + ('\u{94d}', '\u{94d}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{a4d}', '\u{a4d}'), + ('\u{acd}', '\u{acd}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c4d}', '\u{c4d}'), + ('\u{ccd}', '\u{ccd}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{dca}', '\u{dca}'), + ('\u{e3a}', '\u{e3a}'), + ('\u{eba}', '\u{eba}'), + ('\u{f84}', '\u{f84}'), + ('\u{1039}', '\u{103a}'), + ('\u{1714}', '\u{1714}'), + ('\u{1734}', '\u{1734}'), + ('\u{17d2}', '\u{17d2}'), + ('\u{1a60}', '\u{1a60}'), + ('á', 'á'), + ('áŽĒ', '\u{1bab}'), + ('á¯˛', 'á¯ŗ'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{a806}', '\u{a806}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c4}'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('\u{aaf6}', '\u{aaf6}'), + 
('\u{abed}', '\u{abed}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{11046}', '\u{11046}'), + ('\u{1107f}', '\u{1107f}'), + ('\u{110b9}', '\u{110b9}'), + ('\u{11133}', '\u{11134}'), + ('đ', 'đ'), + ('đĩ', 'đĩ'), + ('\u{112ea}', '\u{112ea}'), + ('đ', 'đ'), + ('\u{11442}', '\u{11442}'), + ('\u{114c2}', '\u{114c2}'), + ('\u{115bf}', '\u{115bf}'), + ('\u{1163f}', '\u{1163f}'), + ('đļ', 'đļ'), + ('\u{1172b}', '\u{1172b}'), + ('\u{11839}', '\u{11839}'), + ('\u{1193d}', '\u{1193e}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a34}', '\u{11a34}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a99}', '\u{11a99}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11d44}', '\u{11d45}'), + ('\u{11d97}', '\u{11d97}'), +]; + +pub const HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'F'), + ('a', 'f'), + ('īŧ', 'īŧ'), + ('īŧĄ', 'īŧĻ'), + ('īŊ', 'īŊ'), +]; + +pub const HYPHEN: &'static [(char, char)] = &[ + ('-', '-'), + ('\u{ad}', '\u{ad}'), + ('Ö', 'Ö'), + ('á ', 'á '), + ('â', 'â'), + ('â¸', 'â¸'), + ('ãģ', 'ãģ'), + ('īšŖ', 'īšŖ'), + ('īŧ', 'īŧ'), + ('īŊĨ', 'īŊĨ'), +]; + +pub const IDS_BINARY_OPERATOR: &'static [(char, char)] = + &[('âŋ°', 'âŋą'), ('âŋ´', 'âŋģ')]; + +pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[('âŋ˛', 'âŋŗ')]; + +pub const ID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('¡', '¡'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('\u{300}', 'Í´'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('\u{483}', '\u{487}'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('Ø ', 'ŲŠ'), + ('ŲŽ', 'Û'), + ('Û', '\u{6dc}'), + ('\u{6df}', 
'\u{6e8}'), + ('\u{6ea}', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', '\u{74a}'), + ('Ũ', 'Ūą'), + ('ß', 'ßĩ'), + ('ßē', 'ßē'), + ('\u{7fd}', '\u{7fd}'), + ('ā ', '\u{82d}'), + ('āĄ', '\u{85b}'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('āĨĻ', 'āĨ¯'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('\u{9bc}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§Ļ', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('āĢ', 'āĢ'), + ('āĢ ', '\u{ae3}'), + ('āĢĻ', 'āĢ¯'), + ('āĢš', '\u{aff}'), + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('\u{b3c}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ā', 'ā'), + ('ā', '\u{b63}'), + ('āĻ', 'ā¯'), + ('āą', 'āą'), + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('ā¯Ļ', 'ā¯¯'), + ('\u{c00}', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('āą', 'āą'), + ('āą ', '\u{c63}'), + ('āąĻ', 'āą¯'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('\u{cbc}', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', '\u{ce3}'), + ('āŗĻ', 'āŗ¯'), + ('āŗą', 'āŗ˛'), + ('\u{d00}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d57}'), + ('āĩ', '\u{d63}'), + ('āĩĻ', 'āĩ¯'), + ('āĩē', 'āĩŋ'), + ('\u{d81}', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇĻ', 'āˇ¯'), + ('āˇ˛', 'āˇŗ'), + ('ā¸', '\u{e3a}'), + ('āš', '\u{e4e}'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('\u{f18}', '\u{f19}'), + ('āŧ ', 'āŧŠ'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('\u{135d}', '\u{135f}'), + ('áŠ', 'áą'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', '\u{1714}'), + ('á ', '\u{1734}'), + ('á', '\u{1753}'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('\u{1772}', '\u{1773}'), + ('á', '\u{17d3}'), + ('á', 'á'), + ('á', '\u{17dd}'), + ('á ', 'áŠ'), + ('\u{180b}', '\u{180d}'), + ('á ', 'á '), + ('á ', '᥸'), + ('áĸ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á§', 'á§'), + ('á¨', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ§', 'áĒ§'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1b00}', 'á'), + ('á', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'á¯ŗ'), + ('á°', '\u{1c37}'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'áŗē'), + ('á´', '\u{1df9}'), + ('\u{1dfb}', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âŋ', 'â'), + ('â', 'â'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('\u{2d7f}', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('\u{2de0}', '\u{2dff}'), + ('ã
', 'ã'), + ('ãĄ', '\u{302f}'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('\u{3099}', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ęĢ'), + ('ę', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('ęŋ', '\u{a6f1}'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę §'), + ('\u{a82c}', '\u{a82c}'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', '\u{a8c5}'), + ('ęŖ', 'ęŖ'), + ('\u{a8e0}', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', '\u{a92d}'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('\u{a980}', 'ę§'), + ('ę§', 'ę§'), + ('ę§ ', '꧞'), + ('ę¨', '\u{aa36}'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢ¯'), + ('ęĢ˛', '\u{aaf6}'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯Ē'), + ('ę¯Ŧ', '\u{abed}'), + ('ę¯°', 'ę¯š'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('ī¸ŗ', 'ī¸´'), + ('īš', 'īš'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧ', 'īŧ'), + ('īŧĄ', 'īŧē'), + ('īŧŋ', 'īŧŋ'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('\u{101fd}', '\u{101fd}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('\u{102e0}', '\u{102e0}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1037a}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŠ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', '\u{10ae6}'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', '\u{10d27}'), + ('đ´°', 'đ´š'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', '\u{10f50}'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', '\u{11046}'), + ('đĻ', 'đ¯'), + ('\u{1107f}', '\u{110ba}'), + ('đ', 'đ¨'), + ('đ°', 'đš'), + ('\u{11100}', '\u{11134}'), + ('đļ', 'đŋ'), + ('đ
', '\u{11147}'), + ('đ
', '\u{11173}'), + ('đ
ļ', 'đ
ļ'), + ('\u{11180}', 'đ'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111ce}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', '\u{112ea}'), + ('đ°', 'đš'), + ('\u{11300}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('\u{1133b}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1145e}', '\u{11461}'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{115b5}'), + ('đ¸', '\u{115c0}'), + ('đ', '\u{115dd}'), + ('đ', '\u{11640}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1171d}', '\u{1172b}'), + ('đ°', 'đš'), + ('đ ', '\u{1183a}'), + ('đĸ ', 'đŖŠ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{11943}'), + ('\u{11950}', '\u{11959}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', '\u{119d7}'), + ('\u{119da}', 'đ§Ą'), + ('đ§Ŗ', 'đ§¤'), + ('đ¨', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('đŠ', '\u{11a99}'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', '\u{11c36}'), + ('\u{11c38}', 'đą'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), + ('đģ ', 'đģļ'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đŠ ', 'đŠŠ'), + ('đĢ', 'đĢ'), + ('\u{16af0}', '\u{16af4}'), + ('đŦ', '\u{16b36}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('\u{16f4f}', 'đž'), + ('\u{16f8f}', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('đ', 'đŦ'), + ('\u{1e130}', 'đŊ'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ', 'đš'), + ('đ ', 'đŖ'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const ID_START: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('Í°', 'Í´'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('Ø ', 'Ų'), + ('ŲŽ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛĨ', 'ÛĻ'), + ('ÛŽ', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', 'ā '), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤Ŋ', 'ā¤Ŋ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ˛', 'āŠ´'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĒŊ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢš', 'āĢš'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āą', 'āą'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('ā¯', 'ā¯'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛Ŋ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗą', 'āŗ˛'), + ('\u{d04}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩē', 'āĩŋ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('ā¸', 'ā¸°'), + ('ā¸˛', 'ā¸ŗ'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē°'), + ('āē˛', 'āēŗ'), + ('āēŊ', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āž', 'āž'), + ('á', 'áĒ'), + ('áŋ', 'áŋ'), + ('á', 'á'), + ('á', 'á'), + ('áĄ', 'áĄ'), + ('áĨ', 'áĻ'), + ('áŽ', 'á°'), + ('áĩ', 'á'), + ('á', 'á'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á', 'áŗ'), + ('á', 'á'), + ('á', 'á'), + ('á ', '᥸'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á¨', 'á¨'), + ('ᨠ', 'áŠ'), + ('áĒ§', 'áĒ§'), + ('áŦ
', 'áŦŗ'), + ('á
', 'á'), + ('áŽ', 'Ꭰ'), + ('ᎎ', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('á°', 'á°Ŗ'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('á´', 'áļŋ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ã
', 'ã'), + ('ãĄ', 'ãŠ'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ę', 'ęŽ'), + ('ęŋ', 'ę'), + ('ę ', 'ę¯'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ĸ'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęĸŗ'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ę§', 'ę§'), + ('ę§ ', 'ꧤ'), + ('ę§Ļ', 'ę§¯'), + ('ę§ē', '꧞'), + ('ę¨', 'ꨨ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęŠē'), + ('ꊞ', 'ęĒ¯'), + ('ęĒą', 'ęĒą'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒŊ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢĒ'), + ('ęĢ˛', 'ęĢ´'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯ĸ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', 'đ´Ŗ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đˇ'), + ('đ', 'đ¯'), + ('đ', 'đ¨'), + ('đ', 'đĻ'), + ('đ
', 'đ
'), + ('\u{11147}', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('đ', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ', 'đ´'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ'), + ('đ', 'đĒ'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('đ ', 'đ Ģ'), + ('đĸ ', 'đŖ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§Ŗ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨ē', 'đ¨ē'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°Ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģ˛'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊ'), + ('đž', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const IDEOGRAPHIC: &'static [(char, char)] = &[ + ('ã', 'ã'), + ('ãĄ', 'ãŠ'), + ('ã¸', 'ãē'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('\u{16fe4}', '\u{16fe4}'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ
°', 'đģ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const JOIN_CONTROL: &'static [(char, char)] = &[('\u{200c}', '\u{200d}')]; + +pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[ + ('āš', 'āš'), + ('āģ', 'āģ'), + ('áĻĩ', 'áĻˇ'), + ('áĻē', 'áĻē'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒš'), + ('ęĒģ', 'ęĒŧ'), +]; + +pub const LOWERCASE: &'static [(char, char)] = &[ + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ãļ'), + ('ø', 'Ãŋ'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä
', 'Ä
'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('ÄĄ', 'ÄĄ'), + ('ÄŖ', 'ÄŖ'), + ('ÄĨ', 'ÄĨ'), + ('ħ', 'ħ'), + ('ÄŠ', 'ÄŠ'), + ('ÄĢ', 'ÄĢ'), + ('Ä', 'Ä'), + ('į', 'į'), + ('Äą', 'Äą'), + ('Äŗ', 'Äŗ'), + ('Äĩ', 'Äĩ'), + ('ġ', 'ĸ'), + ('Äē', 'Äē'), + ('Äŧ', 'Äŧ'), + ('Äž', 'Äž'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('ÅĄ', 'ÅĄ'), + ('ÅŖ', 'ÅŖ'), + ('ÅĨ', 'ÅĨ'), + ('ŧ', 'ŧ'), + ('ÅŠ', 'ÅŠ'), + ('ÅĢ', 'ÅĢ'), + ('Å', 'Å'), + ('ů', 'ů'), + ('Åą', 'Åą'), + ('Åŗ', 'Åŗ'), + ('Åĩ', 'Åĩ'), + ('Åˇ', 'Åˇ'), + ('Åē', 'Åē'), + ('Åŧ', 'Åŧ'), + ('Åž', 'Æ'), + ('Æ', 'Æ'), + ('Æ
', 'Æ
'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('ÆĄ', 'ÆĄ'), + ('ÆŖ', 'ÆŖ'), + ('ÆĨ', 'ÆĨ'), + ('ƨ', 'ƨ'), + ('ÆĒ', 'ÆĢ'), + ('Æ', 'Æ'), + ('Æ°', 'Æ°'), + ('Æ´', 'Æ´'), + ('Æļ', 'Æļ'), + ('Æš', 'Æē'), + ('ÆŊ', 'Æŋ'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('ĮĄ', 'ĮĄ'), + ('ĮŖ', 'ĮŖ'), + ('ĮĨ', 'ĮĨ'), + ('Į§', 'Į§'), + ('ĮŠ', 'ĮŠ'), + ('ĮĢ', 'ĮĢ'), + ('Į', 'Į'), + ('Į¯', 'Į°'), + ('Įŗ', 'Įŗ'), + ('Įĩ', 'Įĩ'), + ('Įš', 'Įš'), + ('Įģ', 'Įģ'), + ('ĮŊ', 'ĮŊ'), + ('Įŋ', 'Įŋ'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č
', 'Č
'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('ČĄ', 'ČĄ'), + ('ČŖ', 'ČŖ'), + ('ČĨ', 'ČĨ'), + ('ȧ', 'ȧ'), + ('ČŠ', 'ČŠ'), + ('ČĢ', 'ČĢ'), + ('Č', 'Č'), + ('Č¯', 'Č¯'), + ('Čą', 'Čą'), + ('Čŗ', 'Čš'), + ('Čŧ', 'Čŧ'), + ('Čŋ', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'Ę'), + ('Ę', 'ʸ'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('Íą', 'Íą'), + ('Íŗ', 'Íŗ'), + ('͡', '͡'), + ('Íē', 'ÍŊ'), + ('Î', 'Î'), + ('ÎŦ', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('ĪĄ', 'ĪĄ'), + ('ĪŖ', 'ĪŖ'), + ('ĪĨ', 'ĪĨ'), + ('Ī§', 'Ī§'), + ('ĪŠ', 'ĪŠ'), + ('ĪĢ', 'ĪĢ'), + ('Ī', 'Ī'), + ('Ī¯', 'Īŗ'), + ('Īĩ', 'Īĩ'), + ('Ī¸', 'Ī¸'), + ('Īģ', 'Īŧ'), + ('Đ°', 'Ņ'), + ('ŅĄ', 'ŅĄ'), + ('ŅŖ', 'ŅŖ'), + ('ŅĨ', 'ŅĨ'), + ('Ņ§', 'Ņ§'), + ('ŅŠ', 'ŅŠ'), + ('ŅĢ', 'ŅĢ'), + ('Ņ', 'Ņ'), + ('Ņ¯', 'Ņ¯'), + ('Ņą', 'Ņą'), + ('Ņŗ', 'Ņŗ'), + ('Ņĩ', 'Ņĩ'), + ('Ņˇ', 'Ņˇ'), + ('Ņš', 'Ņš'), + ('Ņģ', 'Ņģ'), + ('ŅŊ', 'ŅŊ'), + ('Ņŋ', 'Ņŋ'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('ŌĄ', 'ŌĄ'), + ('ŌŖ', 'ŌŖ'), + ('ŌĨ', 'ŌĨ'), + ('Ō§', 'Ō§'), + ('ŌŠ', 'ŌŠ'), + ('ŌĢ', 'ŌĢ'), + ('Ō', 'Ō'), + ('Ō¯', 'Ō¯'), + ('Ōą', 'Ōą'), + ('Ōŗ', 'Ōŗ'), + ('Ōĩ', 'Ōĩ'), + ('Ōˇ', 'Ōˇ'), + ('Ōš', 'Ōš'), + ('Ōģ', 'Ōģ'), + ('ŌŊ', 'ŌŊ'), + ('Ōŋ', 'Ōŋ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('ĶĄ', 'ĶĄ'), + ('ĶŖ', 'ĶŖ'), + ('ĶĨ', 'ĶĨ'), + ('Ķ§', 'Ķ§'), + ('ĶŠ', 'ĶŠ'), + ('ĶĢ', 'ĶĢ'), + ('Ķ', 'Ķ'), + ('Ķ¯', 'Ķ¯'), + ('Ķą', 'Ķą'), + ('Ķŗ', 'Ķŗ'), + ('Ķĩ', 'Ķĩ'), + ('Ķˇ', 'Ķˇ'), + ('Ķš', 'Ķš'), + ('Ķģ', 'Ķģ'), + 
('ĶŊ', 'ĶŊ'), + ('Ķŋ', 'Ķŋ'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô
', 'Ô
'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('ÔĄ', 'ÔĄ'), + ('ÔŖ', 'ÔŖ'), + ('ÔĨ', 'ÔĨ'), + ('Ô§', 'Ô§'), + ('ÔŠ', 'ÔŠ'), + ('ÔĢ', 'ÔĢ'), + ('Ô', 'Ô'), + ('Ô¯', 'Ô¯'), + ('Õ ', 'Ö'), + ('á', 'áē'), + ('áŊ', 'áŋ'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á´', 'áļŋ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸
', 'á¸
'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('ḥ', 'ḥ'), + ('á¸Ŗ', 'á¸Ŗ'), + ('á¸Ĩ', 'á¸Ĩ'), + ('ḧ', 'ḧ'), + ('Ḋ', 'Ḋ'), + ('á¸Ģ', 'á¸Ģ'), + ('á¸', 'á¸'), + ('ḯ', 'ḯ'), + ('ḹ', 'ḹ'), + ('á¸ŗ', 'á¸ŗ'), + ('á¸ĩ', 'á¸ĩ'), + ('ḡ', 'ḡ'), + ('Ḛ', 'Ḛ'), + ('á¸ģ', 'á¸ģ'), + ('á¸Ŋ', 'á¸Ŋ'), + ('á¸ŋ', 'á¸ŋ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš
', 'áš
'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('ᚥ', 'ᚥ'), + ('ášŖ', 'ášŖ'), + ('ášĨ', 'ášĨ'), + ('ᚧ', 'ᚧ'), + ('ᚊ', 'ᚊ'), + ('ášĢ', 'ášĢ'), + ('áš', 'áš'), + ('ᚯ', 'ᚯ'), + ('ášą', 'ášą'), + ('ášŗ', 'ášŗ'), + ('ášĩ', 'ášĩ'), + ('ᚡ', 'ᚡ'), + ('ášš', 'ášš'), + ('ášģ', 'ášģ'), + ('ášŊ', 'ášŊ'), + ('ášŋ', 'ášŋ'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē
', 'áē
'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áēĄ', 'áēĄ'), + ('áēŖ', 'áēŖ'), + ('áēĨ', 'áēĨ'), + ('áē§', 'áē§'), + ('áēŠ', 'áēŠ'), + ('áēĢ', 'áēĢ'), + ('áē', 'áē'), + ('áē¯', 'áē¯'), + ('áēą', 'áēą'), + ('áēŗ', 'áēŗ'), + ('áēĩ', 'áēĩ'), + ('áēˇ', 'áēˇ'), + ('áēš', 'áēš'), + ('áēģ', 'áēģ'), + ('áēŊ', 'áēŊ'), + ('áēŋ', 'áēŋ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ
', 'áģ
'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģĄ', 'áģĄ'), + ('áģŖ', 'áģŖ'), + ('áģĨ', 'áģĨ'), + ('áģ§', 'áģ§'), + ('áģŠ', 'áģŠ'), + ('áģĢ', 'áģĢ'), + ('áģ', 'áģ'), + ('áģ¯', 'áģ¯'), + ('áģą', 'áģą'), + ('áģŗ', 'áģŗ'), + ('áģĩ', 'áģĩ'), + ('áģˇ', 'áģˇ'), + ('áģš', 'áģš'), + ('áģģ', 'áģģ'), + ('áģŊ', 'áģŊ'), + ('áģŋ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŧ§'), + ('áŧ°', 'áŧˇ'), + ('áŊ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ ', 'áŊ§'), + ('áŊ°', 'áŊŊ'), + ('áž', 'áž'), + ('áž', 'áž'), + ('áž ', 'ឧ'), + ('áž°', 'áž´'), + ('ážļ', 'ឡ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋ§'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋˇ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¯', 'â¯'), + ('â´', 'â´'), + ('âš', 'âš'), + ('âŧ', 'âŊ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
°', 'â
ŋ'), + ('â', 'â'), + ('â', 'âŠ'), + ('â°°', 'âą'), + ('⹥', '⹥'), + ('âąĨ', 'âąĻ'), + ('⹨', '⹨'), + ('âąĒ', 'âąĒ'), + ('âąŦ', 'âąŦ'), + ('âąą', 'âąą'), + ('âąŗ', 'âą´'), + ('âąļ', 'âąŊ'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛
', 'â˛
'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('ⲥ', 'ⲥ'), + ('â˛Ŗ', 'â˛Ŗ'), + ('â˛Ĩ', 'â˛Ĩ'), + ('ⲧ', 'ⲧ'), + ('Ⲋ', 'Ⲋ'), + ('â˛Ģ', 'â˛Ģ'), + ('â˛', 'â˛'), + ('â˛¯', 'â˛¯'), + ('ⲹ', 'ⲹ'), + ('â˛ŗ', 'â˛ŗ'), + ('â˛ĩ', 'â˛ĩ'), + ('ⲡ', 'ⲡ'), + ('Ⲛ', 'Ⲛ'), + ('â˛ģ', 'â˛ģ'), + ('â˛Ŋ', 'â˛Ŋ'), + ('â˛ŋ', 'â˛ŋ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ
', 'âŗ
'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗĄ', 'âŗĄ'), + ('âŗŖ', 'âŗ¤'), + ('âŗŦ', 'âŗŦ'), + ('âŗŽ', 'âŗŽ'), + ('âŗŗ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ęą'), + ('ęŗ', 'ęŗ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ę¯', 'ę¯'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('\u{a7c8}', '\u{a7c8}'), + ('\u{a7ca}', '\u{a7ca}'), + ('\u{a7f6}', '\u{a7f6}'), + ('ę¸', 'ęē'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab68}'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŊ', 'īŊ'), + ('đ¨', 'đ'), + ('đ', 'đģ'), + ('đŗ', 'đŗ˛'), + ('đŖ', 'đŖ'), + ('đš ', 'đšŋ'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đ', 'đ'), + ('đļ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ'), + ('đĒ', 'đ'), + ('đ', 'đˇ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đē', 'đ'), + ('đŽ', 'đ'), + ('đĸ', 'đģ'), + ('đ', 'đ¯'), + ('đ', 'đĨ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŧ', 'đ'), + ('đ', 'đ'), + ('đļ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đ'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¤ĸ', 'đĨ'), +]; + +pub const MATH: &'static [(char, char)] = &[ + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('|', '|'), + ('~', '~'), + ('ÂŦ', 'ÂŦ'), + ('Âą', 'Âą'), + ('Ã', 'Ã'), + ('Ãˇ', 'Ãˇ'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī°', 'Īą'), + ('Ī´', 'Īļ'), + ('Ø', 'Ø'), + ('â', 'â'), + ('â˛', 'â´'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('\u{2061}', '\u{2064}'), + ('âē', 'âž'), + ('â', 'â'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20e6}'), + ('\u{20eb}', '\u{20ef}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('â¨', 'âŠ'), + ('âŦ', 'â'), + ('â¯', 'âą'), + ('âŗ', 'â¸'), + ('âŧ', 'â
'), + ('â
', 'â
'), + ('â', 'â§'), + ('âŠ', 'âŽ'), + ('â°', 'âą'), + ('âļ', 'âˇ'), + ('âŧ', 'â'), + ('â', 'â'), + ('â¤', 'âĨ'), + ('â´', 'âŋ'), + ('â', 'â'), + ('â ', 'âĄ'), + ('âŧ', 'âŧ'), + ('â', 'âĩ'), + ('âˇ', 'âˇ'), + ('â', 'â'), + ('â', 'âĸ'), + ('â ', 'âĄ'), + ('âŽ', 'âˇ'), + ('âŧ', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĸ', 'âĸ'), + ('â¤', 'â¤'), + ('â§', 'âŦ'), + ('â¸', 'âŋ'), + ('â
', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â ', 'âŖ'), + ('â', 'â¯'), + ('â', 'âŋ'), + ('â¤', 'âĢŋ'), + ('âŦ°', 'â'), + ('â', 'â'), + ('īŦŠ', 'īŦŠ'), + ('īšĄ', 'īšĻ'), + ('īš¨', 'īš¨'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧŧ', 'īŧŧ'), + ('īŧž', 'īŧž'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŋĸ', 'īŋĸ'), + ('īŋŠ', 'īŋŦ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đŋ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đģ°', 'đģą'), +]; + +pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[ + ('\u{fdd0}', '\u{fdef}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1fffe}', '\u{1ffff}'), + ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), + ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), + ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), + ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), + ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), + ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), + ('\u{efffe}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[ + ('\u{345}', '\u{345}'), + ('\u{5b0}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{657}'), + ('\u{659}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6e1}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ed}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{73f}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{816}', '\u{817}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82c}'), + ('\u{8d4}', '\u{8df}'), + ('\u{8e3}', '\u{8e9}'), + ('\u{8f0}', 'ā¤'), + ('\u{93a}', 'ā¤ģ'), + ('ā¤ž', 'āĨ'), + ('āĨ', 'āĨ'), + ('\u{955}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'āĻ'), + ('\u{9be}', 
'\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{a01}', 'ā¨'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4c}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('āĒž', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', 'āĢ'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{afc}'), + ('\u{b01}', 'āŦ'), + ('\u{b3e}', '\u{b44}'), + ('ā', 'ā'), + ('ā', 'ā'), + ('\u{b56}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', 'ā°'), + ('\u{c3e}', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4c}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ā˛'), + ('ā˛ž', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccc}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', 'ā´'), + ('\u{d3e}', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'āļ'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇŗ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e4d}', '\u{e4d}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{eb9}'), + ('\u{ebb}', '\u{ebc}'), + ('\u{ecd}', '\u{ecd}'), + ('\u{f71}', '\u{f81}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('áĢ', '\u{1036}'), + ('á¸', 'á¸'), + ('áģ', '\u{103e}'), + ('á', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('áĸ', 'á¤'), + ('á§', 'á'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('\u{1712}', '\u{1713}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('áļ', 'á'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', 'ᤸ'), + ('\u{1a17}', '\u{1a1b}'), + ('áŠ', '\u{1a5e}'), + ('እ', '\u{1a74}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1b00}', 'áŦ'), + 
('\u{1b35}', 'á'), + ('\u{1b80}', 'áŽ'), + ('Ꭵ', '\u{1ba9}'), + ('\u{1bac}', '\u{1bad}'), + ('ᯧ', '\u{1bf1}'), + ('á°¤', '\u{1c36}'), + ('\u{1de7}', '\u{1df4}'), + ('âļ', 'âŠ'), + ('\u{2de0}', '\u{2dff}'), + ('\u{a674}', '\u{a67b}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a802}', '\u{a802}'), + ('\u{a80b}', '\u{a80b}'), + ('ę Ŗ', 'ę §'), + ('ęĸ', 'ęĸ'), + ('ęĸ´', 'ęŖ'), + ('\u{a8c5}', '\u{a8c5}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92a}'), + ('\u{a947}', 'ęĨ'), + ('\u{a980}', 'ęĻ'), + ('ęĻ´', 'ęĻŋ'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ęŠ'), + ('ęŠģ', 'ęŠŊ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabe}'), + ('ęĢĢ', 'ęĢ¯'), + ('ęĢĩ', 'ęĢĩ'), + ('ę¯Ŗ', 'ę¯Ē'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('đ', 'đ'), + ('\u{11038}', '\u{11045}'), + ('đ', 'đ'), + ('đ°', 'đ¸'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11132}'), + ('đ
', 'đ
'), + ('\u{11180}', 'đ'), + ('đŗ', 'đŋ'), + ('\u{111ce}', '\u{111cf}'), + ('đŦ', '\u{11234}'), + ('\u{11237}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112e8}'), + ('\u{11300}', 'đ'), + ('\u{1133e}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đĸ', 'đŖ'), + ('đĩ', 'đ'), + ('\u{11443}', 'đ
'), + ('\u{114b0}', 'đ'), + ('\u{115af}', '\u{115b5}'), + ('đ¸', 'đž'), + ('\u{115dc}', '\u{115dd}'), + ('đ°', 'đž'), + ('\u{11640}', '\u{11640}'), + ('\u{116ab}', '\u{116b5}'), + ('\u{1171d}', '\u{1172a}'), + ('đ Ŧ', 'đ ¸'), + ('\u{11930}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{11940}', '\u{11940}'), + ('\u{11942}', '\u{11942}'), + ('đ§', '\u{119d7}'), + ('\u{119da}', 'đ§'), + ('đ§¤', 'đ§¤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a35}', 'đ¨š'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', 'đĒ'), + ('đ°¯', '\u{11c36}'), + ('\u{11c38}', 'đ°ž'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d41}'), + ('\u{11d43}', '\u{11d43}'), + ('\u{11d47}', '\u{11d47}'), + ('đļ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('\u{11ef3}', 'đģļ'), + ('\u{16f4f}', '\u{16f4f}'), + ('đŊ', 'đž'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{1bc9e}', '\u{1bc9e}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e947}', '\u{1e947}'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), +]; + +pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{34f}', '\u{34f}'), + ('á
', 'á
'), + ('\u{17b4}', '\u{17b5}'), + ('\u{2065}', '\u{2065}'), + ('ã
¤', 'ã
¤'), + ('īž ', 'īž '), + ('\u{fff0}', '\u{fff8}'), + ('\u{e0000}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('\u{9be}', '\u{9be}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{b3e}', '\u{b3e}'), + ('\u{b57}', '\u{b57}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d57}', '\u{d57}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{1b35}', '\u{1b35}'), + ('\u{200c}', '\u{200c}'), + ('\u{302e}', '\u{302f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11357}', '\u{11357}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{115af}', '\u{115af}'), + ('\u{11930}', '\u{11930}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const OTHER_ID_CONTINUE: &'static [(char, char)] = + &[('¡', '¡'), ('Î', 'Î'), ('áŠ', 'áą'), ('á§', 'á§')]; + +pub const OTHER_ID_START: &'static [(char, char)] = + &[('\u{1885}', '\u{1886}'), ('â', 'â'), ('âŽ', 'âŽ'), ('ã', 'ã')]; + +pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ + ('ÂĒ', 'ÂĒ'), + ('Âē', 'Âē'), + ('Ę°', 'ʸ'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('Íē', 'Íē'), + ('á´Ŧ', 'áĩĒ'), + ('áĩ¸', 'áĩ¸'), + ('áļ', 'áļŋ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â
°', 'â
ŋ'), + ('â', 'âŠ'), + ('âąŧ', 'âąŊ'), + ('ę', 'ę'), + ('ę°', 'ę°'), + ('ę¸', 'ęš'), + ('ę', 'ę'), +]; + +pub const OTHER_MATH: &'static [(char, char)] = &[ + ('^', '^'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī°', 'Īą'), + ('Ī´', 'Īĩ'), + ('â', 'â'), + ('â˛', 'â´'), + ('â', 'â'), + ('\u{2061}', '\u{2064}'), + ('âŊ', 'âž'), + ('â', 'â'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20e6}'), + ('\u{20eb}', '\u{20ef}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('â¨', 'âŠ'), + ('âŦ', 'â'), + ('â¯', 'âą'), + ('âŗ', 'â¸'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'â'), + ('âĄ', 'âĸ'), + ('â¤', 'âĨ'), + ('â§', 'â§'), + ('âŠ', 'â'), + ('â°', 'âą'), + ('âļ', 'âˇ'), + ('âŧ', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'âĨ'), + ('â', 'â'), + ('â´', 'âĩ'), + ('âˇ', 'âˇ'), + ('â', 'â'), + ('âĸ', 'âĸ'), + ('â ', 'âĄ'), + ('âŽ', 'âļ'), + ('âŧ', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('âĸ', 'âĸ'), + ('â¤', 'â¤'), + ('â§', 'âŦ'), + ('â
', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â ', 'âŖ'), + ('â', 'âŽ'), + ('â
', 'â'), + ('âĻ', 'â¯'), + ('âĻ', 'âĻ'), + ('â§', 'â§'), + ('â§ŧ', 'â§Ŋ'), + ('īšĄ', 'īšĄ'), + ('īšŖ', 'īšŖ'), + ('īš¨', 'īš¨'), + ('īŧŧ', 'īŧŧ'), + ('īŧž', 'īŧž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), +]; + +pub const OTHER_UPPERCASE: &'static [(char, char)] = + &[('â
', 'â
¯'), ('âļ', 'â'), ('đ°', 'đ
'), ('đ
', 'đ
Š'), ('đ
°', 'đ')]; + +pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ + ('!', '/'), + (':', '@'), + ('[', '^'), + ('`', '`'), + ('{', '~'), + ('ÂĄ', '§'), + ('Š', 'Š'), + ('ÂĢ', 'ÂŦ'), + ('ÂŽ', 'ÂŽ'), + ('°', 'Âą'), + ('Âļ', 'Âļ'), + ('Âģ', 'Âģ'), + ('Âŋ', 'Âŋ'), + ('Ã', 'Ã'), + ('Ãˇ', 'Ãˇ'), + ('â', 'â§'), + ('â°', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('â', '\u{245f}'), + ('â', 'âĩ'), + ('â', 'â¯ŋ'), + ('â¸', '\u{2e7f}'), + ('ã', 'ã'), + ('ã', 'ã '), + ('ã°', 'ã°'), + ('ī´ž', 'ī´ŋ'), + ('īš
', 'īš'), +]; + +pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{2029}'), +]; + +pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), +]; + +pub const QUOTATION_MARK: &'static [(char, char)] = &[ + ('\"', '\"'), + ('\'', '\''), + ('ÂĢ', 'ÂĢ'), + ('Âģ', 'Âģ'), + ('â', 'â'), + ('âš', 'âē'), + ('âš', 'âš'), + ('ã', 'ã'), + ('ã', 'ã'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŊĸ', 'īŊŖ'), +]; + +pub const RADICAL: &'static [(char, char)] = + &[('âē', 'âē'), ('âē', 'âģŗ'), ('âŧ', 'âŋ')]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('đĻ', 'đŋ')]; + +pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[ + ('!', '!'), + ('.', '.'), + ('?', '?'), + ('Ö', 'Ö'), + ('Ø', 'Ø'), + ('Û', 'Û'), + ('Ü', 'Ü'), + ('ßš', 'ßš'), + ('ā ˇ', 'ā ˇ'), + ('ā š', 'ā š'), + ('ā Ŋ', 'ā ž'), + ('āĨ¤', 'āĨĨ'), + ('á', 'á'), + ('áĸ', 'áĸ'), + ('á§', 'á¨'), + ('áŽ', 'áŽ'), + ('áĩ', 'áļ'), + ('á ', 'á '), + ('á ', 'á '), + ('áĨ', 'áĨ
'), + ('áĒ¨', 'áĒĢ'), + ('á', 'á'), + ('á', 'á'), + ('á°ģ', 'á°ŧ'), + ('áąž', 'áąŋ'), + ('âŧ', 'âŊ'), + ('â', 'â'), + ('⸎', '⸎'), + ('â¸ŧ', 'â¸ŧ'), + ('ã', 'ã'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ęŗ', 'ęŗ'), + ('ęˇ', 'ęˇ'), + ('ęĄļ', 'ꥡ'), + ('ęŖ', 'ęŖ'), + ('ę¤¯', 'ę¤¯'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ°', 'ęĢą'), + ('ę¯Ģ', 'ę¯Ģ'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŊĄ', 'īŊĄ'), + ('đŠ', 'đŠ'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đž', 'đ'), + ('đ
', 'đ
'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đš'), + ('đģ', 'đŧ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŧ', 'đž'), + ('\u{11944}', '\u{11944}'), + ('\u{11946}', '\u{11946}'), + ('đŠ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đą', 'đą'), + ('đģˇ', 'đģ¸'), + ('đŠŽ', 'đŠ¯'), + ('đĢĩ', 'đĢĩ'), + ('đŦˇ', 'đŦ¸'), + ('đ', 'đ'), + ('đē', 'đē'), + ('đ˛', 'đ˛'), + ('đĒ', 'đĒ'), +]; + +pub const SOFT_DOTTED: &'static [(char, char)] = &[ + ('i', 'j'), + ('į', 'į'), + ('É', 'É'), + ('ɨ', 'ɨ'), + ('Ę', 'Ę'), + ('ʲ', 'ʲ'), + ('Īŗ', 'Īŗ'), + ('Ņ', 'Ņ'), + ('Ņ', 'Ņ'), + ('áĩĸ', 'áĩĸ'), + ('áļ', 'áļ'), + ('áļ¤', 'áļ¤'), + ('áļ¨', 'áļ¨'), + ('á¸', 'á¸'), + ('áģ', 'áģ'), + ('âą', 'âą'), + ('â
', 'â
'), + ('âąŧ', 'âąŧ'), + ('đĸ', 'đŖ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đž', 'đŋ'), + ('đ˛', 'đŗ'), + ('đĻ', 'đ§'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đļ', 'đˇ'), + ('đĒ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), +]; + +pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '!'), + (',', ','), + ('.', '.'), + (':', ';'), + ('?', '?'), + ('Íž', 'Íž'), + ('Î', 'Î'), + ('Ö', 'Ö'), + ('×', '×'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Û', 'Û'), + ('Ü', 'Ü'), + ('Ü', 'Ü'), + ('߸', 'ßš'), + ('ā °', 'ā ž'), + ('āĄ', 'āĄ'), + ('āĨ¤', 'āĨĨ'), + ('āš', 'āš'), + ('āŧ', 'āŧ'), + ('āŧ', 'āŧ'), + ('á', 'á'), + ('áĄ', 'á¨'), + ('áŽ', 'áŽ'), + ('áĢ', 'á'), + ('áĩ', 'áļ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á
'), + ('á ', 'á '), + ('áĨ', 'áĨ
'), + ('áĒ¨', 'áĒĢ'), + ('á', 'á'), + ('á', 'á'), + ('á°ģ', 'á°ŋ'), + ('áąž', 'áąŋ'), + ('âŧ', 'âŊ'), + ('â', 'â'), + ('⸎', '⸎'), + ('â¸ŧ', 'â¸ŧ'), + ('âš', 'âš'), + ('âš', 'âš'), + ('âš', 'âš'), + ('ã', 'ã'), + ('ęž', 'ęŋ'), + ('ę', 'ę'), + ('ęŗ', 'ęˇ'), + ('ęĄļ', 'ꥡ'), + ('ęŖ', 'ęŖ'), + ('ę¤¯', 'ę¤¯'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ęĢ°', 'ęĢą'), + ('ę¯Ģ', 'ę¯Ģ'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŊĄ', 'īŊĄ'), + ('īŊ¤', 'īŊ¤'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĄ', 'đĄ'), + ('đ¤', 'đ¤'), + ('đŠ', 'đŠ'), + ('đĢ°', 'đĢĩ'), + ('đŦē', 'đŦŋ'), + ('đŽ', 'đŽ'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đž', 'đ'), + ('đ
', 'đ
'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đŧ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('\u{1145a}', 'đ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŧ', 'đž'), + ('\u{11944}', '\u{11944}'), + ('\u{11946}', '\u{11946}'), + ('đŠ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đĒĄ', 'đĒĸ'), + ('đą', 'đą'), + ('đąą', 'đąą'), + ('đģˇ', 'đģ¸'), + ('đ°', 'đ´'), + ('đŠŽ', 'đŠ¯'), + ('đĢĩ', 'đĢĩ'), + ('đŦˇ', 'đŦš'), + ('đ', 'đ'), + ('đē', 'đē'), + ('đ˛', 'đ˛'), + ('đĒ', 'đĒ'), +]; + +pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[ + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ī¨', 'ī¨'), + ('ī¨', 'ī¨'), + ('ī¨', 'ī¨'), + ('ī¨', 'ī¨'), + ('ī¨Ą', 'ī¨Ą'), + ('ī¨Ŗ', 'ī¨¤'), + ('ī¨§', 'ī¨Š'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('\u{30000}', '\u{3134a}'), +]; + +pub const UPPERCASE: &'static [(char, char)] = &[ + ('A', 'Z'), + ('Ã', 'Ã'), + ('Ã', 'Ã'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä ', 'Ä '), + ('Äĸ', 'Äĸ'), + ('Ĥ', 'Ĥ'), + ('ÄĻ', 'ÄĻ'), + ('Ĩ', 'Ĩ'), + ('ÄĒ', 'ÄĒ'), + ('ÄŦ', 'ÄŦ'), + ('ÄŽ', 'ÄŽ'), + ('Ä°', 'Ä°'), + ('IJ', 'IJ'), + ('Ä´', 'Ä´'), + ('Äļ', 'Äļ'), + ('Äš', 'Äš'), + ('Äģ', 'Äģ'), + ('ÄŊ', 'ÄŊ'), + ('Äŋ', 'Äŋ'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å
', 'Å
'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å ', 'Å '), + ('Åĸ', 'Åĸ'), + ('Ť', 'Ť'), + ('ÅĻ', 'ÅĻ'), + ('Ũ', 'Ũ'), + ('ÅĒ', 'ÅĒ'), + ('ÅŦ', 'ÅŦ'), + ('ÅŽ', 'ÅŽ'), + ('Å°', 'Å°'), + ('Å˛', 'Å˛'), + ('Å´', 'Å´'), + ('Åļ', 'Åļ'), + ('Ÿ', 'Åš'), + ('Åģ', 'Åģ'), + ('ÅŊ', 'ÅŊ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ '), + ('Æĸ', 'Æĸ'), + ('Ƥ', 'Ƥ'), + ('ÆĻ', 'Ƨ'), + ('ÆŠ', 'ÆŠ'), + ('ÆŦ', 'ÆŦ'), + ('ÆŽ', 'Ư'), + ('Æą', 'Æŗ'), + ('Æĩ', 'Æĩ'), + ('Æˇ', 'Ƹ'), + ('Æŧ', 'Æŧ'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į ', 'Į '), + ('Įĸ', 'Įĸ'), + ('Į¤', 'Į¤'), + ('ĮĻ', 'ĮĻ'), + ('Į¨', 'Į¨'), + ('ĮĒ', 'ĮĒ'), + ('ĮŦ', 'ĮŦ'), + ('ĮŽ', 'ĮŽ'), + ('Įą', 'Įą'), + ('Į´', 'Į´'), + ('Įļ', 'Į¸'), + ('Įē', 'Įē'), + ('Įŧ', 'Įŧ'), + ('Įž', 'Įž'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č ', 'Č '), + ('Čĸ', 'Čĸ'), + ('Ȥ', 'Ȥ'), + ('ČĻ', 'ČĻ'), + ('Ȩ', 'Ȩ'), + ('ČĒ', 'ČĒ'), + ('ČŦ', 'ČŦ'), + ('ČŽ', 'ČŽ'), + ('Č°', 'Č°'), + ('Ȳ', 'Ȳ'), + ('Čē', 'Čģ'), + ('ČŊ', 'Čž'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('Í°', 'Í°'), + ('Ͳ', 'Ͳ'), + ('Íļ', 'Íļ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ÎĢ'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī ', 'Ī '), + ('Īĸ', 'Īĸ'), + ('Ī¤', 'Ī¤'), + ('ĪĻ', 'ĪĻ'), + ('Ī¨', 'Ī¨'), + ('ĪĒ', 'ĪĒ'), + ('ĪŦ', 'ĪŦ'), + ('ĪŽ', 'ĪŽ'), + ('Ī´', 'Ī´'), + ('Īˇ', 'Īˇ'), + ('Īš', 'Īē'), + ('ĪŊ', 'Đ¯'), 
+ ('Ņ ', 'Ņ '), + ('Ņĸ', 'Ņĸ'), + ('Ņ¤', 'Ņ¤'), + ('ŅĻ', 'ŅĻ'), + ('Ņ¨', 'Ņ¨'), + ('ŅĒ', 'ŅĒ'), + ('ŅŦ', 'ŅŦ'), + ('ŅŽ', 'ŅŽ'), + ('Ņ°', 'Ņ°'), + ('Ņ˛', 'Ņ˛'), + ('Ņ´', 'Ņ´'), + ('Ņļ', 'Ņļ'), + ('Ņ¸', 'Ņ¸'), + ('Ņē', 'Ņē'), + ('Ņŧ', 'Ņŧ'), + ('Ņž', 'Ņž'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō ', 'Ō '), + ('Ōĸ', 'Ōĸ'), + ('Ō¤', 'Ō¤'), + ('ŌĻ', 'ŌĻ'), + ('Ō¨', 'Ō¨'), + ('ŌĒ', 'ŌĒ'), + ('ŌŦ', 'ŌŦ'), + ('ŌŽ', 'ŌŽ'), + ('Ō°', 'Ō°'), + ('Ō˛', 'Ō˛'), + ('Ō´', 'Ō´'), + ('Ōļ', 'Ōļ'), + ('Ō¸', 'Ō¸'), + ('Ōē', 'Ōē'), + ('Ōŧ', 'Ōŧ'), + ('Ōž', 'Ōž'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ
', 'Ķ
'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ ', 'Ķ '), + ('Ķĸ', 'Ķĸ'), + ('Ķ¤', 'Ķ¤'), + ('ĶĻ', 'ĶĻ'), + ('Ķ¨', 'Ķ¨'), + ('ĶĒ', 'ĶĒ'), + ('ĶŦ', 'ĶŦ'), + ('ĶŽ', 'ĶŽ'), + ('Ķ°', 'Ķ°'), + ('Ķ˛', 'Ķ˛'), + ('Ķ´', 'Ķ´'), + ('Ķļ', 'Ķļ'), + ('Ķ¸', 'Ķ¸'), + ('Ķē', 'Ķē'), + ('Ķŧ', 'Ķŧ'), + ('Ķž', 'Ķž'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô ', 'Ô '), + ('Ôĸ', 'Ôĸ'), + ('Ô¤', 'Ô¤'), + ('ÔĻ', 'ÔĻ'), + ('Ô¨', 'Ô¨'), + ('ÔĒ', 'ÔĒ'), + ('ÔŦ', 'ÔŦ'), + ('ÔŽ', 'ÔŽ'), + ('Ôą', 'Õ'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('Ḡ', 'Ḡ'), + ('á¸ĸ', 'á¸ĸ'), + ('Ḥ', 'Ḥ'), + ('á¸Ļ', 'á¸Ļ'), + ('Ḩ', 'Ḩ'), + ('á¸Ē', 'á¸Ē'), + ('á¸Ŧ', 'á¸Ŧ'), + ('Ḏ', 'Ḏ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('á¸ļ', 'á¸ļ'), + ('Ḹ', 'Ḹ'), + ('á¸ē', 'á¸ē'), + ('á¸ŧ', 'á¸ŧ'), + ('Ḟ', 'Ḟ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš ', 'áš '), + ('ášĸ', 'ášĸ'), + ('ᚤ', 'ᚤ'), + ('ášĻ', 'ášĻ'), + ('ᚨ', 'ᚨ'), + ('ášĒ', 'ášĒ'), + ('ášŦ', 'ášŦ'), + ('ᚎ', 'ᚎ'), + ('áš°', 'áš°'), + ('ᚲ', 'ᚲ'), + ('áš´', 'áš´'), + ('ášļ', 'ášļ'), + ('ᚸ', 'ᚸ'), + ('ášē', 'ášē'), + ('ášŧ', 'ášŧ'), + ('ášž', 'ášž'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē ', 'áē '), + ('áēĸ', 'áēĸ'), + ('áē¤', 'áē¤'), + ('áēĻ', 'áēĻ'), + ('áē¨', 'áē¨'), + ('áēĒ', 'áēĒ'), + ('áēŦ', 'áēŦ'), + ('áēŽ', 'áēŽ'), + ('áē°', 'áē°'), + ('áē˛', 'áē˛'), + ('áē´', 'áē´'), + ('áēļ', 'áēļ'), + ('áē¸', 'áē¸'), + ('áēē', 'áēē'), + ('áēŧ', 'áēŧ'), + ('áēž', 'áēž'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ ', 'áģ '), + ('áģĸ', 'áģĸ'), + ('áģ¤', 'áģ¤'), + ('áģĻ', 'áģĻ'), + ('áģ¨', 'áģ¨'), + ('áģĒ', 'áģĒ'), + ('áģŦ', 'áģŦ'), + ('áģŽ', 'áģŽ'), + 
('áģ°', 'áģ°'), + ('áģ˛', 'áģ˛'), + ('áģ´', 'áģ´'), + ('áģļ', 'áģļ'), + ('áģ¸', 'áģ¸'), + ('áģē', 'áģē'), + ('áģŧ', 'áģŧ'), + ('áģž', 'áģž'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ¨', 'áŧ¯'), + ('áŧ¸', 'áŧŋ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ¨', 'áŊ¯'), + ('ី', 'ážģ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ¨', 'áŋŦ'), + ('áŋ¸', 'áŋģ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â°', 'âŗ'), + ('âž', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
¯'), + ('â', 'â'), + ('âļ', 'â'), + ('â°', 'â°Ž'), + ('âą ', 'âą '), + ('âąĸ', '⹤'), + ('⹧', '⹧'), + ('⹊', '⹊'), + ('âąĢ', 'âąĢ'), + ('âą', 'âą°'), + ('⹲', '⹲'), + ('âąĩ', 'âąĩ'), + ('âąž', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('Ⲡ', 'Ⲡ'), + ('â˛ĸ', 'â˛ĸ'), + ('Ⲥ', 'Ⲥ'), + ('â˛Ļ', 'â˛Ļ'), + ('Ⲩ', 'Ⲩ'), + ('â˛Ē', 'â˛Ē'), + ('â˛Ŧ', 'â˛Ŧ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('â˛ļ', 'â˛ļ'), + ('Ⲹ', 'Ⲹ'), + ('â˛ē', 'â˛ē'), + ('â˛ŧ', 'â˛ŧ'), + ('Ⲟ', 'Ⲟ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ ', 'âŗ '), + ('âŗĸ', 'âŗĸ'), + ('âŗĢ', 'âŗĢ'), + ('âŗ', 'âŗ'), + ('âŗ˛', 'âŗ˛'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ę˛', 'ę˛'), + ('ę´', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), 
+ ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęŽ'), + ('ę°', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', '\u{a7c7}'), + ('\u{a7c9}', '\u{a7c9}'), + ('\u{a7f5}', '\u{a7f5}'), + ('īŧĄ', 'īŧē'), + ('đ', 'đ§'), + ('đ°', 'đ'), + ('đ˛', 'đ˛˛'), + ('đĸ ', 'đĸŋ'), + ('đš', 'đš'), + ('đ', 'đ'), + ('đ´', 'đ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đĩ'), + ('đ', 'đŠ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŦ', 'đ
'), + ('đ ', 'đš'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŧ', 'đ'), + ('đ°', 'đ'), + ('đ¨', 'đ'), + ('đĸ', 'đē'), + ('đ', 'đ´'), + ('đ', 'đŽ'), + ('đ', 'đ¨'), + ('đ', 'đ'), + ('đ¤', 'đ¤Ą'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), +]; + +pub const VARIATION_SELECTOR: &'static [(char, char)] = &[ + ('\u{180b}', '\u{180d}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const XID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('¡', '¡'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('\u{300}', 'Í´'), + ('Íļ', '͡'), + ('Íģ', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('\u{483}', '\u{487}'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('Ø ', 'ŲŠ'), + ('ŲŽ', 'Û'), + ('Û', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', '\u{74a}'), + ('Ũ', 'Ūą'), + ('ß', 'ßĩ'), + ('ßē', 'ßē'), + ('\u{7fd}', '\u{7fd}'), + ('ā ', '\u{82d}'), + ('āĄ', '\u{85b}'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('āĨĻ', 'āĨ¯'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('\u{9bc}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§Ļ', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('āĢ', 'āĢ'), + ('āĢ ', '\u{ae3}'), + ('āĢĻ', 'āĢ¯'), + ('āĢš', '\u{aff}'), + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('\u{b3c}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ā', 'ā'), + ('ā', '\u{b63}'), + ('āĻ', 'ā¯'), + ('āą', 'āą'), + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('ā¯Ļ', 'ā¯¯'), + ('\u{c00}', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('āą', 'āą'), + ('āą ', '\u{c63}'), + ('āąĻ', 'āą¯'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('\u{cbc}', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', '\u{ce3}'), + ('āŗĻ', 'āŗ¯'), + ('āŗą', 'āŗ˛'), + ('\u{d00}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d57}'), + ('āĩ', '\u{d63}'), + ('āĩĻ', 'āĩ¯'), + ('āĩē', 'āĩŋ'), + ('\u{d81}', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇĻ', 'āˇ¯'), + ('āˇ˛', 'āˇŗ'), + ('ā¸', '\u{e3a}'), + ('āš', '\u{e4e}'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('\u{f18}', '\u{f19}'), + ('āŧ ', 'āŧŠ'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('\u{135d}', '\u{135f}'), + ('áŠ', 'áą'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', '\u{1714}'), + ('á ', '\u{1734}'), + ('á', '\u{1753}'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('\u{1772}', '\u{1773}'), + ('á', '\u{17d3}'), + ('á', 'á'), + ('á', '\u{17dd}'), + ('á ', 'áŠ'), + ('\u{180b}', '\u{180d}'), + ('á ', 'á '), + ('á ', '᥸'), + ('áĸ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á§', 'á§'), + ('á¨', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ§', 'áĒ§'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1b00}', 'á'), + ('á', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'á¯ŗ'), + ('á°', '\u{1c37}'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'áŗē'), + ('á´', '\u{1df9}'), + ('\u{1dfb}', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âŋ', 'â'), + ('â', 'â'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('\u{2d7f}', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('\u{2de0}', '\u{2dff}'), + ('ã
', 'ã'), + ('ãĄ', '\u{302f}'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('\u{3099}', '\u{309a}'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ęĢ'), + ('ę', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('ęŋ', '\u{a6f1}'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę §'), + ('\u{a82c}', '\u{a82c}'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', '\u{a8c5}'), + ('ęŖ', 'ęŖ'), + ('\u{a8e0}', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', '\u{a92d}'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('\u{a980}', 'ę§'), + ('ę§', 'ę§'), + ('ę§ ', '꧞'), + ('ę¨', '\u{aa36}'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢ¯'), + ('ęĢ˛', '\u{aaf6}'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯Ē'), + ('ę¯Ŧ', '\u{abed}'), + ('ę¯°', 'ę¯š'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'īą'), + ('īą¤', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇš'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('ī¸ŗ', 'ī¸´'), + ('īš', 'īš'), + ('īšą', 'īšą'), + ('īšŗ', 'īšŗ'), + ('īšˇ', 'īšˇ'), + ('īšš', 'īšš'), + ('īšģ', 'īšģ'), + ('īšŊ', 'īšŊ'), + ('īšŋ', 'īģŧ'), + ('īŧ', 'īŧ'), + ('īŧĄ', 'īŧē'), + ('īŧŋ', 'īŧŋ'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('\u{101fd}', '\u{101fd}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('\u{102e0}', '\u{102e0}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{1037a}'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đŠ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', '\u{10ae6}'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', '\u{10d27}'), + ('đ´°', 'đ´š'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', '\u{10f50}'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', '\u{11046}'), + ('đĻ', 'đ¯'), + ('\u{1107f}', '\u{110ba}'), + ('đ', 'đ¨'), + ('đ°', 'đš'), + ('\u{11100}', '\u{11134}'), + ('đļ', 'đŋ'), + ('đ
', '\u{11147}'), + ('đ
', '\u{11173}'), + ('đ
ļ', 'đ
ļ'), + ('\u{11180}', 'đ'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111ce}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', '\u{112ea}'), + ('đ°', 'đš'), + ('\u{11300}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('\u{1133b}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1145e}', '\u{11461}'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', '\u{115b5}'), + ('đ¸', '\u{115c0}'), + ('đ', '\u{115dd}'), + ('đ', '\u{11640}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{1171d}', '\u{1172b}'), + ('đ°', 'đš'), + ('đ ', '\u{1183a}'), + ('đĸ ', 'đŖŠ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{11943}'), + ('\u{11950}', '\u{11959}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', '\u{119d7}'), + ('\u{119da}', 'đ§Ą'), + ('đ§Ŗ', 'đ§¤'), + ('đ¨', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('đŠ', '\u{11a99}'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', '\u{11c36}'), + ('\u{11c38}', 'đą'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), + ('đģ ', 'đģļ'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đŠ ', 'đŠŠ'), + ('đĢ', 'đĢ'), + ('\u{16af0}', '\u{16af4}'), + ('đŦ', '\u{16b36}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('\u{16f4f}', 'đž'), + ('\u{16f8f}', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('đ', 'đŦ'), + ('\u{1e130}', 'đŊ'), + ('đ
', 'đ
'), + ('đ
', 'đ
'), + ('đ', 'đš'), + ('đ ', 'đŖ'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const XID_START: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('Í°', 'Í´'), + ('Íļ', '͡'), + ('Íģ', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('Ø ', 'Ų'), + ('ŲŽ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛĨ', 'ÛĻ'), + ('ÛŽ', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', 'ā '), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤Ŋ', 'ā¤Ŋ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ˛', 'āŠ´'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĒŊ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢš', 'āĢš'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āą', 'āą'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('ā¯', 'ā¯'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛Ŋ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗą', 'āŗ˛'), + ('\u{d04}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩē', 'āĩŋ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('ā¸', 'ā¸°'), + ('ā¸˛', 'ā¸˛'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē°'), + ('āē˛', 'āē˛'), + ('āēŊ', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āž', 'āž'), + ('á', 'áĒ'), + ('áŋ', 'áŋ'), + ('á', 'á'), + ('á', 'á'), + ('áĄ', 'áĄ'), + ('áĨ', 'áĻ'), + ('áŽ', 'á°'), + ('áĩ', 'á'), + ('á', 'á'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á', 'áŗ'), + ('á', 'á'), + ('á', 'á'), + ('á ', '᥸'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á¨', 'á¨'), + ('ᨠ', 'áŠ'), + ('áĒ§', 'áĒ§'), + ('áŦ
', 'áŦŗ'), + ('á
', 'á'), + ('áŽ', 'Ꭰ'), + ('ᎎ', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('á°', 'á°Ŗ'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('á´', 'áļŋ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ã
', 'ã'), + ('ãĄ', 'ãŠ'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ę', 'ęŽ'), + ('ęŋ', 'ę'), + ('ę ', 'ę¯'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ĸ'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęĸŗ'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ę§', 'ę§'), + ('ę§ ', 'ꧤ'), + ('ę§Ļ', 'ę§¯'), + ('ę§ē', '꧞'), + ('ę¨', 'ꨨ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęŠē'), + ('ꊞ', 'ęĒ¯'), + ('ęĒą', 'ęĒą'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒŊ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢĒ'), + ('ęĢ˛', 'ęĢ´'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab69}'), + ('ę°', 'ę¯ĸ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'īą'), + ('īą¤', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇš'), + ('īšą', 'īšą'), + ('īšŗ', 'īšŗ'), + ('īšˇ', 'īšˇ'), + ('īšš', 'īšš'), + ('īšģ', 'īšģ'), + ('īšŊ', 'īšŊ'), + ('īšŋ', 'īģŧ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('īŊĻ', 'īž'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', 'đ´Ŗ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đˇ'), + ('đ', 'đ¯'), + ('đ', 'đ¨'), + ('đ', 'đĻ'), + ('đ
', 'đ
'), + ('\u{11147}', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('đ', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ', 'đ´'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ'), + ('đ', 'đĒ'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('đ ', 'đ Ģ'), + ('đĸ ', 'đŖ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§Ŗ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨ē', 'đ¨ē'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°Ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģ˛'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊ'), + ('đž', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_names.rs b/vendor/regex-syntax/src/unicode_tables/property_names.rs new file mode 100644 index 000000000..6393df2f8 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_names.rs @@ -0,0 +1,264 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-names ucd-13.0.0 +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. 
+ +pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[ + ("age", "Age"), + ("ahex", "ASCII_Hex_Digit"), + ("alpha", "Alphabetic"), + ("alphabetic", "Alphabetic"), + ("asciihexdigit", "ASCII_Hex_Digit"), + ("bc", "Bidi_Class"), + ("bidic", "Bidi_Control"), + ("bidiclass", "Bidi_Class"), + ("bidicontrol", "Bidi_Control"), + ("bidim", "Bidi_Mirrored"), + ("bidimirrored", "Bidi_Mirrored"), + ("bidimirroringglyph", "Bidi_Mirroring_Glyph"), + ("bidipairedbracket", "Bidi_Paired_Bracket"), + ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), + ("blk", "Block"), + ("block", "Block"), + ("bmg", "Bidi_Mirroring_Glyph"), + ("bpb", "Bidi_Paired_Bracket"), + ("bpt", "Bidi_Paired_Bracket_Type"), + ("canonicalcombiningclass", "Canonical_Combining_Class"), + ("cased", "Cased"), + ("casefolding", "Case_Folding"), + ("caseignorable", "Case_Ignorable"), + ("ccc", "Canonical_Combining_Class"), + ("ce", "Composition_Exclusion"), + ("cf", "Case_Folding"), + ("changeswhencasefolded", "Changes_When_Casefolded"), + ("changeswhencasemapped", "Changes_When_Casemapped"), + ("changeswhenlowercased", "Changes_When_Lowercased"), + ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"), + ("changeswhentitlecased", "Changes_When_Titlecased"), + ("changeswhenuppercased", "Changes_When_Uppercased"), + ("ci", "Case_Ignorable"), + ("cjkaccountingnumeric", "kAccountingNumeric"), + ("cjkcompatibilityvariant", "kCompatibilityVariant"), + ("cjkiicore", "kIICore"), + ("cjkirggsource", "kIRG_GSource"), + ("cjkirghsource", "kIRG_HSource"), + ("cjkirgjsource", "kIRG_JSource"), + ("cjkirgkpsource", "kIRG_KPSource"), + ("cjkirgksource", "kIRG_KSource"), + ("cjkirgmsource", "kIRG_MSource"), + ("cjkirgssource", "kIRG_SSource"), + ("cjkirgtsource", "kIRG_TSource"), + ("cjkirguksource", "kIRG_UKSource"), + ("cjkirgusource", "kIRG_USource"), + ("cjkirgvsource", "kIRG_VSource"), + ("cjkothernumeric", "kOtherNumeric"), + ("cjkprimarynumeric", "kPrimaryNumeric"), + ("cjkrsunicode", 
"kRSUnicode"), + ("compex", "Full_Composition_Exclusion"), + ("compositionexclusion", "Composition_Exclusion"), + ("cwcf", "Changes_When_Casefolded"), + ("cwcm", "Changes_When_Casemapped"), + ("cwkcf", "Changes_When_NFKC_Casefolded"), + ("cwl", "Changes_When_Lowercased"), + ("cwt", "Changes_When_Titlecased"), + ("cwu", "Changes_When_Uppercased"), + ("dash", "Dash"), + ("decompositionmapping", "Decomposition_Mapping"), + ("decompositiontype", "Decomposition_Type"), + ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"), + ("dep", "Deprecated"), + ("deprecated", "Deprecated"), + ("di", "Default_Ignorable_Code_Point"), + ("dia", "Diacritic"), + ("diacritic", "Diacritic"), + ("dm", "Decomposition_Mapping"), + ("dt", "Decomposition_Type"), + ("ea", "East_Asian_Width"), + ("eastasianwidth", "East_Asian_Width"), + ("ebase", "Emoji_Modifier_Base"), + ("ecomp", "Emoji_Component"), + ("emod", "Emoji_Modifier"), + ("emoji", "Emoji"), + ("emojicomponent", "Emoji_Component"), + ("emojimodifier", "Emoji_Modifier"), + ("emojimodifierbase", "Emoji_Modifier_Base"), + ("emojipresentation", "Emoji_Presentation"), + ("epres", "Emoji_Presentation"), + ("equideo", "Equivalent_Unified_Ideograph"), + ("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"), + ("expandsonnfc", "Expands_On_NFC"), + ("expandsonnfd", "Expands_On_NFD"), + ("expandsonnfkc", "Expands_On_NFKC"), + ("expandsonnfkd", "Expands_On_NFKD"), + ("ext", "Extender"), + ("extendedpictographic", "Extended_Pictographic"), + ("extender", "Extender"), + ("extpict", "Extended_Pictographic"), + ("fcnfkc", "FC_NFKC_Closure"), + ("fcnfkcclosure", "FC_NFKC_Closure"), + ("fullcompositionexclusion", "Full_Composition_Exclusion"), + ("gc", "General_Category"), + ("gcb", "Grapheme_Cluster_Break"), + ("generalcategory", "General_Category"), + ("graphemebase", "Grapheme_Base"), + ("graphemeclusterbreak", "Grapheme_Cluster_Break"), + ("graphemeextend", "Grapheme_Extend"), + ("graphemelink", "Grapheme_Link"), + ("grbase", 
"Grapheme_Base"), + ("grext", "Grapheme_Extend"), + ("grlink", "Grapheme_Link"), + ("hangulsyllabletype", "Hangul_Syllable_Type"), + ("hex", "Hex_Digit"), + ("hexdigit", "Hex_Digit"), + ("hst", "Hangul_Syllable_Type"), + ("hyphen", "Hyphen"), + ("idc", "ID_Continue"), + ("idcontinue", "ID_Continue"), + ("ideo", "Ideographic"), + ("ideographic", "Ideographic"), + ("ids", "ID_Start"), + ("idsb", "IDS_Binary_Operator"), + ("idsbinaryoperator", "IDS_Binary_Operator"), + ("idst", "IDS_Trinary_Operator"), + ("idstart", "ID_Start"), + ("idstrinaryoperator", "IDS_Trinary_Operator"), + ("indicpositionalcategory", "Indic_Positional_Category"), + ("indicsyllabiccategory", "Indic_Syllabic_Category"), + ("inpc", "Indic_Positional_Category"), + ("insc", "Indic_Syllabic_Category"), + ("isc", "ISO_Comment"), + ("jamoshortname", "Jamo_Short_Name"), + ("jg", "Joining_Group"), + ("joinc", "Join_Control"), + ("joincontrol", "Join_Control"), + ("joininggroup", "Joining_Group"), + ("joiningtype", "Joining_Type"), + ("jsn", "Jamo_Short_Name"), + ("jt", "Joining_Type"), + ("kaccountingnumeric", "kAccountingNumeric"), + ("kcompatibilityvariant", "kCompatibilityVariant"), + ("kiicore", "kIICore"), + ("kirggsource", "kIRG_GSource"), + ("kirghsource", "kIRG_HSource"), + ("kirgjsource", "kIRG_JSource"), + ("kirgkpsource", "kIRG_KPSource"), + ("kirgksource", "kIRG_KSource"), + ("kirgmsource", "kIRG_MSource"), + ("kirgssource", "kIRG_SSource"), + ("kirgtsource", "kIRG_TSource"), + ("kirguksource", "kIRG_UKSource"), + ("kirgusource", "kIRG_USource"), + ("kirgvsource", "kIRG_VSource"), + ("kothernumeric", "kOtherNumeric"), + ("kprimarynumeric", "kPrimaryNumeric"), + ("krsunicode", "kRSUnicode"), + ("lb", "Line_Break"), + ("lc", "Lowercase_Mapping"), + ("linebreak", "Line_Break"), + ("loe", "Logical_Order_Exception"), + ("logicalorderexception", "Logical_Order_Exception"), + ("lower", "Lowercase"), + ("lowercase", "Lowercase"), + ("lowercasemapping", "Lowercase_Mapping"), + ("math", "Math"), + 
("na", "Name"), + ("na1", "Unicode_1_Name"), + ("name", "Name"), + ("namealias", "Name_Alias"), + ("nchar", "Noncharacter_Code_Point"), + ("nfcqc", "NFC_Quick_Check"), + ("nfcquickcheck", "NFC_Quick_Check"), + ("nfdqc", "NFD_Quick_Check"), + ("nfdquickcheck", "NFD_Quick_Check"), + ("nfkccasefold", "NFKC_Casefold"), + ("nfkccf", "NFKC_Casefold"), + ("nfkcqc", "NFKC_Quick_Check"), + ("nfkcquickcheck", "NFKC_Quick_Check"), + ("nfkdqc", "NFKD_Quick_Check"), + ("nfkdquickcheck", "NFKD_Quick_Check"), + ("noncharactercodepoint", "Noncharacter_Code_Point"), + ("nt", "Numeric_Type"), + ("numerictype", "Numeric_Type"), + ("numericvalue", "Numeric_Value"), + ("nv", "Numeric_Value"), + ("oalpha", "Other_Alphabetic"), + ("ocomment", "ISO_Comment"), + ("odi", "Other_Default_Ignorable_Code_Point"), + ("ogrext", "Other_Grapheme_Extend"), + ("oidc", "Other_ID_Continue"), + ("oids", "Other_ID_Start"), + ("olower", "Other_Lowercase"), + ("omath", "Other_Math"), + ("otheralphabetic", "Other_Alphabetic"), + ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"), + ("othergraphemeextend", "Other_Grapheme_Extend"), + ("otheridcontinue", "Other_ID_Continue"), + ("otheridstart", "Other_ID_Start"), + ("otherlowercase", "Other_Lowercase"), + ("othermath", "Other_Math"), + ("otheruppercase", "Other_Uppercase"), + ("oupper", "Other_Uppercase"), + ("patsyn", "Pattern_Syntax"), + ("patternsyntax", "Pattern_Syntax"), + ("patternwhitespace", "Pattern_White_Space"), + ("patws", "Pattern_White_Space"), + ("pcm", "Prepended_Concatenation_Mark"), + ("prependedconcatenationmark", "Prepended_Concatenation_Mark"), + ("qmark", "Quotation_Mark"), + ("quotationmark", "Quotation_Mark"), + ("radical", "Radical"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("sb", "Sentence_Break"), + ("sc", "Script"), + ("scf", "Simple_Case_Folding"), + ("script", "Script"), + ("scriptextensions", "Script_Extensions"), + ("scx", "Script_Extensions"), + ("sd", 
"Soft_Dotted"), + ("sentencebreak", "Sentence_Break"), + ("sentenceterminal", "Sentence_Terminal"), + ("sfc", "Simple_Case_Folding"), + ("simplecasefolding", "Simple_Case_Folding"), + ("simplelowercasemapping", "Simple_Lowercase_Mapping"), + ("simpletitlecasemapping", "Simple_Titlecase_Mapping"), + ("simpleuppercasemapping", "Simple_Uppercase_Mapping"), + ("slc", "Simple_Lowercase_Mapping"), + ("softdotted", "Soft_Dotted"), + ("space", "White_Space"), + ("stc", "Simple_Titlecase_Mapping"), + ("sterm", "Sentence_Terminal"), + ("suc", "Simple_Uppercase_Mapping"), + ("tc", "Titlecase_Mapping"), + ("term", "Terminal_Punctuation"), + ("terminalpunctuation", "Terminal_Punctuation"), + ("titlecasemapping", "Titlecase_Mapping"), + ("uc", "Uppercase_Mapping"), + ("uideo", "Unified_Ideograph"), + ("unicode1name", "Unicode_1_Name"), + ("unicoderadicalstroke", "kRSUnicode"), + ("unifiedideograph", "Unified_Ideograph"), + ("upper", "Uppercase"), + ("uppercase", "Uppercase"), + ("uppercasemapping", "Uppercase_Mapping"), + ("urs", "kRSUnicode"), + ("variationselector", "Variation_Selector"), + ("verticalorientation", "Vertical_Orientation"), + ("vo", "Vertical_Orientation"), + ("vs", "Variation_Selector"), + ("wb", "Word_Break"), + ("whitespace", "White_Space"), + ("wordbreak", "Word_Break"), + ("wspace", "White_Space"), + ("xidc", "XID_Continue"), + ("xidcontinue", "XID_Continue"), + ("xids", "XID_Start"), + ("xidstart", "XID_Start"), + ("xonfc", "Expands_On_NFC"), + ("xonfd", "Expands_On_NFD"), + ("xonfkc", "Expands_On_NFKC"), + ("xonfkd", "Expands_On_NFKD"), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_values.rs b/vendor/regex-syntax/src/unicode_tables/property_values.rs new file mode 100644 index 000000000..c46653a7b --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_values.rs @@ -0,0 +1,896 @@ +// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-values ucd-13.0.0 --include gc,script,scx,age,gcb,wb,sb +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const PROPERTY_VALUES: &'static [( + &'static str, + &'static [(&'static str, &'static str)], +)] = &[ + ( + "Age", + &[ + ("1.1", "V1_1"), + ("10.0", "V10_0"), + ("11.0", "V11_0"), + ("12.0", "V12_0"), + ("12.1", "V12_1"), + ("13.0", "V13_0"), + ("2.0", "V2_0"), + ("2.1", "V2_1"), + ("3.0", "V3_0"), + ("3.1", "V3_1"), + ("3.2", "V3_2"), + ("4.0", "V4_0"), + ("4.1", "V4_1"), + ("5.0", "V5_0"), + ("5.1", "V5_1"), + ("5.2", "V5_2"), + ("6.0", "V6_0"), + ("6.1", "V6_1"), + ("6.2", "V6_2"), + ("6.3", "V6_3"), + ("7.0", "V7_0"), + ("8.0", "V8_0"), + ("9.0", "V9_0"), + ("na", "Unassigned"), + ("unassigned", "Unassigned"), + ("v100", "V10_0"), + ("v11", "V1_1"), + ("v110", "V11_0"), + ("v120", "V12_0"), + ("v121", "V12_1"), + ("v130", "V13_0"), + ("v20", "V2_0"), + ("v21", "V2_1"), + ("v30", "V3_0"), + ("v31", "V3_1"), + ("v32", "V3_2"), + ("v40", "V4_0"), + ("v41", "V4_1"), + ("v50", "V5_0"), + ("v51", "V5_1"), + ("v52", "V5_2"), + ("v60", "V6_0"), + ("v61", "V6_1"), + ("v62", "V6_2"), + ("v63", "V6_3"), + ("v70", "V7_0"), + ("v80", "V8_0"), + ("v90", "V9_0"), + ], + ), + ( + "General_Category", + &[ + ("c", "Other"), + ("casedletter", "Cased_Letter"), + ("cc", "Control"), + ("cf", "Format"), + ("closepunctuation", "Close_Punctuation"), + ("cn", "Unassigned"), + ("cntrl", "Control"), + ("co", "Private_Use"), + ("combiningmark", "Mark"), + ("connectorpunctuation", "Connector_Punctuation"), + ("control", "Control"), + ("cs", "Surrogate"), + ("currencysymbol", "Currency_Symbol"), + ("dashpunctuation", "Dash_Punctuation"), + ("decimalnumber", "Decimal_Number"), + ("digit", "Decimal_Number"), + ("enclosingmark", "Enclosing_Mark"), + ("finalpunctuation", "Final_Punctuation"), + ("format", "Format"), + ("initialpunctuation", "Initial_Punctuation"), + ("l", "Letter"), 
+ ("lc", "Cased_Letter"), + ("letter", "Letter"), + ("letternumber", "Letter_Number"), + ("lineseparator", "Line_Separator"), + ("ll", "Lowercase_Letter"), + ("lm", "Modifier_Letter"), + ("lo", "Other_Letter"), + ("lowercaseletter", "Lowercase_Letter"), + ("lt", "Titlecase_Letter"), + ("lu", "Uppercase_Letter"), + ("m", "Mark"), + ("mark", "Mark"), + ("mathsymbol", "Math_Symbol"), + ("mc", "Spacing_Mark"), + ("me", "Enclosing_Mark"), + ("mn", "Nonspacing_Mark"), + ("modifierletter", "Modifier_Letter"), + ("modifiersymbol", "Modifier_Symbol"), + ("n", "Number"), + ("nd", "Decimal_Number"), + ("nl", "Letter_Number"), + ("no", "Other_Number"), + ("nonspacingmark", "Nonspacing_Mark"), + ("number", "Number"), + ("openpunctuation", "Open_Punctuation"), + ("other", "Other"), + ("otherletter", "Other_Letter"), + ("othernumber", "Other_Number"), + ("otherpunctuation", "Other_Punctuation"), + ("othersymbol", "Other_Symbol"), + ("p", "Punctuation"), + ("paragraphseparator", "Paragraph_Separator"), + ("pc", "Connector_Punctuation"), + ("pd", "Dash_Punctuation"), + ("pe", "Close_Punctuation"), + ("pf", "Final_Punctuation"), + ("pi", "Initial_Punctuation"), + ("po", "Other_Punctuation"), + ("privateuse", "Private_Use"), + ("ps", "Open_Punctuation"), + ("punct", "Punctuation"), + ("punctuation", "Punctuation"), + ("s", "Symbol"), + ("sc", "Currency_Symbol"), + ("separator", "Separator"), + ("sk", "Modifier_Symbol"), + ("sm", "Math_Symbol"), + ("so", "Other_Symbol"), + ("spaceseparator", "Space_Separator"), + ("spacingmark", "Spacing_Mark"), + ("surrogate", "Surrogate"), + ("symbol", "Symbol"), + ("titlecaseletter", "Titlecase_Letter"), + ("unassigned", "Unassigned"), + ("uppercaseletter", "Uppercase_Letter"), + ("z", "Separator"), + ("zl", "Line_Separator"), + ("zp", "Paragraph_Separator"), + ("zs", "Space_Separator"), + ], + ), + ( + "Grapheme_Cluster_Break", + &[ + ("cn", "Control"), + ("control", "Control"), + ("cr", "CR"), + ("eb", "E_Base"), + ("ebase", "E_Base"), + 
("ebasegaz", "E_Base_GAZ"), + ("ebg", "E_Base_GAZ"), + ("em", "E_Modifier"), + ("emodifier", "E_Modifier"), + ("ex", "Extend"), + ("extend", "Extend"), + ("gaz", "Glue_After_Zwj"), + ("glueafterzwj", "Glue_After_Zwj"), + ("l", "L"), + ("lf", "LF"), + ("lv", "LV"), + ("lvt", "LVT"), + ("other", "Other"), + ("pp", "Prepend"), + ("prepend", "Prepend"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("sm", "SpacingMark"), + ("spacingmark", "SpacingMark"), + ("t", "T"), + ("v", "V"), + ("xx", "Other"), + ("zwj", "ZWJ"), + ], + ), + ( + "Script", + &[ + ("adlam", "Adlam"), + ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), + ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), + ("arab", "Arabic"), + ("arabic", "Arabic"), + ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), + ("armn", "Armenian"), + ("avestan", "Avestan"), + ("avst", "Avestan"), + ("bali", "Balinese"), + ("balinese", "Balinese"), + ("bamu", "Bamum"), + ("bamum", "Bamum"), + ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), + ("batak", "Batak"), + ("batk", "Batak"), + ("beng", "Bengali"), + ("bengali", "Bengali"), + ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), + ("bopo", "Bopomofo"), + ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), + ("brahmi", "Brahmi"), + ("brai", "Braille"), + ("braille", "Braille"), + ("bugi", "Buginese"), + ("buginese", "Buginese"), + ("buhd", "Buhid"), + ("buhid", "Buhid"), + ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), + ("cari", "Carian"), + ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), + ("chakma", "Chakma"), + ("cham", "Cham"), + ("cher", "Cherokee"), + ("cherokee", "Cherokee"), + ("chorasmian", "Chorasmian"), + ("chrs", "Chorasmian"), + ("common", "Common"), + ("copt", "Coptic"), + ("coptic", "Coptic"), + ("cprt", "Cypriot"), + ("cuneiform", "Cuneiform"), + ("cypriot", "Cypriot"), + ("cyrillic", "Cyrillic"), + ("cyrl", 
"Cyrillic"), + ("deseret", "Deseret"), + ("deva", "Devanagari"), + ("devanagari", "Devanagari"), + ("diak", "Dives_Akuru"), + ("divesakuru", "Dives_Akuru"), + ("dogr", "Dogra"), + ("dogra", "Dogra"), + ("dsrt", "Deseret"), + ("dupl", "Duployan"), + ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), + ("elba", "Elbasan"), + ("elbasan", "Elbasan"), + ("elym", "Elymaic"), + ("elymaic", "Elymaic"), + ("ethi", "Ethiopic"), + ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), + ("georgian", "Georgian"), + ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), + ("gong", "Gunjala_Gondi"), + ("gonm", "Masaram_Gondi"), + ("goth", "Gothic"), + ("gothic", "Gothic"), + ("gran", "Grantha"), + ("grantha", "Grantha"), + ("greek", "Greek"), + ("grek", "Greek"), + ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), + ("gunjalagondi", "Gunjala_Gondi"), + ("gurmukhi", "Gurmukhi"), + ("guru", "Gurmukhi"), + ("han", "Han"), + ("hang", "Hangul"), + ("hangul", "Hangul"), + ("hani", "Han"), + ("hanifirohingya", "Hanifi_Rohingya"), + ("hano", "Hanunoo"), + ("hanunoo", "Hanunoo"), + ("hatr", "Hatran"), + ("hatran", "Hatran"), + ("hebr", "Hebrew"), + ("hebrew", "Hebrew"), + ("hira", "Hiragana"), + ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), + ("hmng", "Pahawh_Hmong"), + ("hmnp", "Nyiakeng_Puachue_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), + ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), + ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), + ("ital", "Old_Italic"), + ("java", "Javanese"), + ("javanese", "Javanese"), + ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), + ("kana", "Katakana"), + ("kannada", "Kannada"), + ("katakana", "Katakana"), + ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kayahli", "Kayah_Li"), + ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), + ("khitansmallscript", 
"Khitan_Small_Script"), + ("khmer", "Khmer"), + ("khmr", "Khmer"), + ("khoj", "Khojki"), + ("khojki", "Khojki"), + ("khudawadi", "Khudawadi"), + ("kits", "Khitan_Small_Script"), + ("knda", "Kannada"), + ("kthi", "Kaithi"), + ("lana", "Tai_Tham"), + ("lao", "Lao"), + ("laoo", "Lao"), + ("latin", "Latin"), + ("latn", "Latin"), + ("lepc", "Lepcha"), + ("lepcha", "Lepcha"), + ("limb", "Limbu"), + ("limbu", "Limbu"), + ("lina", "Linear_A"), + ("linb", "Linear_B"), + ("lineara", "Linear_A"), + ("linearb", "Linear_B"), + ("lisu", "Lisu"), + ("lyci", "Lycian"), + ("lycian", "Lycian"), + ("lydi", "Lydian"), + ("lydian", "Lydian"), + ("mahajani", "Mahajani"), + ("mahj", "Mahajani"), + ("maka", "Makasar"), + ("makasar", "Makasar"), + ("malayalam", "Malayalam"), + ("mand", "Mandaic"), + ("mandaic", "Mandaic"), + ("mani", "Manichaean"), + ("manichaean", "Manichaean"), + ("marc", "Marchen"), + ("marchen", "Marchen"), + ("masaramgondi", "Masaram_Gondi"), + ("medefaidrin", "Medefaidrin"), + ("medf", "Medefaidrin"), + ("meeteimayek", "Meetei_Mayek"), + ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), + ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), + ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), + ("miao", "Miao"), + ("mlym", "Malayalam"), + ("modi", "Modi"), + ("mong", "Mongolian"), + ("mongolian", "Mongolian"), + ("mro", "Mro"), + ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), + ("mult", "Multani"), + ("multani", "Multani"), + ("myanmar", "Myanmar"), + ("mymr", "Myanmar"), + ("nabataean", "Nabataean"), + ("nand", "Nandinagari"), + ("nandinagari", "Nandinagari"), + ("narb", "Old_North_Arabian"), + ("nbat", "Nabataean"), + ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), + ("nko", "Nko"), + ("nkoo", "Nko"), + ("nshu", "Nushu"), + ("nushu", "Nushu"), + ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), + ("ogam", "Ogham"), + ("ogham", "Ogham"), + ("olchiki", "Ol_Chiki"), + ("olck", "Ol_Chiki"), + 
("oldhungarian", "Old_Hungarian"), + ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), + ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), + ("oldsogdian", "Old_Sogdian"), + ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), + ("oriya", "Oriya"), + ("orkh", "Old_Turkic"), + ("orya", "Oriya"), + ("osage", "Osage"), + ("osge", "Osage"), + ("osma", "Osmanya"), + ("osmanya", "Osmanya"), + ("pahawhhmong", "Pahawh_Hmong"), + ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), + ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), + ("perm", "Old_Permic"), + ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), + ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), + ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), + ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), + ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), + ("qaai", "Inherited"), + ("rejang", "Rejang"), + ("rjng", "Rejang"), + ("rohg", "Hanifi_Rohingya"), + ("runic", "Runic"), + ("runr", "Runic"), + ("samaritan", "Samaritan"), + ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), + ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), + ("sgnw", "SignWriting"), + ("sharada", "Sharada"), + ("shavian", "Shavian"), + ("shaw", "Shavian"), + ("shrd", "Sharada"), + ("sidd", "Siddham"), + ("siddham", "Siddham"), + ("signwriting", "SignWriting"), + ("sind", "Khudawadi"), + ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), + ("sogd", "Sogdian"), + ("sogdian", "Sogdian"), + ("sogo", "Old_Sogdian"), + ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), + ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), + ("sund", "Sundanese"), + ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), + ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), + ("syriac", "Syriac"), + ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), + ("tagbanwa", "Tagbanwa"), + ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), + ("taiviet", "Tai_Viet"), + 
("takr", "Takri"), + ("takri", "Takri"), + ("tale", "Tai_Le"), + ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), + ("taml", "Tamil"), + ("tang", "Tangut"), + ("tangut", "Tangut"), + ("tavt", "Tai_Viet"), + ("telu", "Telugu"), + ("telugu", "Telugu"), + ("tfng", "Tifinagh"), + ("tglg", "Tagalog"), + ("thaa", "Thaana"), + ("thaana", "Thaana"), + ("thai", "Thai"), + ("tibetan", "Tibetan"), + ("tibt", "Tibetan"), + ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), + ("tirhuta", "Tirhuta"), + ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), + ("unknown", "Unknown"), + ("vai", "Vai"), + ("vaii", "Vai"), + ("wancho", "Wancho"), + ("wara", "Warang_Citi"), + ("warangciti", "Warang_Citi"), + ("wcho", "Wancho"), + ("xpeo", "Old_Persian"), + ("xsux", "Cuneiform"), + ("yezi", "Yezidi"), + ("yezidi", "Yezidi"), + ("yi", "Yi"), + ("yiii", "Yi"), + ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), + ("zinh", "Inherited"), + ("zyyy", "Common"), + ("zzzz", "Unknown"), + ], + ), + ( + "Script_Extensions", + &[ + ("adlam", "Adlam"), + ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), + ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), + ("arab", "Arabic"), + ("arabic", "Arabic"), + ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), + ("armn", "Armenian"), + ("avestan", "Avestan"), + ("avst", "Avestan"), + ("bali", "Balinese"), + ("balinese", "Balinese"), + ("bamu", "Bamum"), + ("bamum", "Bamum"), + ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), + ("batak", "Batak"), + ("batk", "Batak"), + ("beng", "Bengali"), + ("bengali", "Bengali"), + ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), + ("bopo", "Bopomofo"), + ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), + ("brahmi", "Brahmi"), + ("brai", "Braille"), + ("braille", "Braille"), + ("bugi", "Buginese"), + ("buginese", "Buginese"), + ("buhd", "Buhid"), + ("buhid", "Buhid"), + ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", 
"Canadian_Aboriginal"), + ("cari", "Carian"), + ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), + ("chakma", "Chakma"), + ("cham", "Cham"), + ("cher", "Cherokee"), + ("cherokee", "Cherokee"), + ("chorasmian", "Chorasmian"), + ("chrs", "Chorasmian"), + ("common", "Common"), + ("copt", "Coptic"), + ("coptic", "Coptic"), + ("cprt", "Cypriot"), + ("cuneiform", "Cuneiform"), + ("cypriot", "Cypriot"), + ("cyrillic", "Cyrillic"), + ("cyrl", "Cyrillic"), + ("deseret", "Deseret"), + ("deva", "Devanagari"), + ("devanagari", "Devanagari"), + ("diak", "Dives_Akuru"), + ("divesakuru", "Dives_Akuru"), + ("dogr", "Dogra"), + ("dogra", "Dogra"), + ("dsrt", "Deseret"), + ("dupl", "Duployan"), + ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), + ("elba", "Elbasan"), + ("elbasan", "Elbasan"), + ("elym", "Elymaic"), + ("elymaic", "Elymaic"), + ("ethi", "Ethiopic"), + ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), + ("georgian", "Georgian"), + ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), + ("gong", "Gunjala_Gondi"), + ("gonm", "Masaram_Gondi"), + ("goth", "Gothic"), + ("gothic", "Gothic"), + ("gran", "Grantha"), + ("grantha", "Grantha"), + ("greek", "Greek"), + ("grek", "Greek"), + ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), + ("gunjalagondi", "Gunjala_Gondi"), + ("gurmukhi", "Gurmukhi"), + ("guru", "Gurmukhi"), + ("han", "Han"), + ("hang", "Hangul"), + ("hangul", "Hangul"), + ("hani", "Han"), + ("hanifirohingya", "Hanifi_Rohingya"), + ("hano", "Hanunoo"), + ("hanunoo", "Hanunoo"), + ("hatr", "Hatran"), + ("hatran", "Hatran"), + ("hebr", "Hebrew"), + ("hebrew", "Hebrew"), + ("hira", "Hiragana"), + ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), + ("hmng", "Pahawh_Hmong"), + ("hmnp", "Nyiakeng_Puachue_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), + ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), + ("inherited", "Inherited"), + 
("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), + ("ital", "Old_Italic"), + ("java", "Javanese"), + ("javanese", "Javanese"), + ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), + ("kana", "Katakana"), + ("kannada", "Kannada"), + ("katakana", "Katakana"), + ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kayahli", "Kayah_Li"), + ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), + ("khitansmallscript", "Khitan_Small_Script"), + ("khmer", "Khmer"), + ("khmr", "Khmer"), + ("khoj", "Khojki"), + ("khojki", "Khojki"), + ("khudawadi", "Khudawadi"), + ("kits", "Khitan_Small_Script"), + ("knda", "Kannada"), + ("kthi", "Kaithi"), + ("lana", "Tai_Tham"), + ("lao", "Lao"), + ("laoo", "Lao"), + ("latin", "Latin"), + ("latn", "Latin"), + ("lepc", "Lepcha"), + ("lepcha", "Lepcha"), + ("limb", "Limbu"), + ("limbu", "Limbu"), + ("lina", "Linear_A"), + ("linb", "Linear_B"), + ("lineara", "Linear_A"), + ("linearb", "Linear_B"), + ("lisu", "Lisu"), + ("lyci", "Lycian"), + ("lycian", "Lycian"), + ("lydi", "Lydian"), + ("lydian", "Lydian"), + ("mahajani", "Mahajani"), + ("mahj", "Mahajani"), + ("maka", "Makasar"), + ("makasar", "Makasar"), + ("malayalam", "Malayalam"), + ("mand", "Mandaic"), + ("mandaic", "Mandaic"), + ("mani", "Manichaean"), + ("manichaean", "Manichaean"), + ("marc", "Marchen"), + ("marchen", "Marchen"), + ("masaramgondi", "Masaram_Gondi"), + ("medefaidrin", "Medefaidrin"), + ("medf", "Medefaidrin"), + ("meeteimayek", "Meetei_Mayek"), + ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), + ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), + ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), + ("miao", "Miao"), + ("mlym", "Malayalam"), + ("modi", "Modi"), + ("mong", "Mongolian"), + ("mongolian", "Mongolian"), + ("mro", "Mro"), + ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), + ("mult", "Multani"), + ("multani", "Multani"), + ("myanmar", 
"Myanmar"), + ("mymr", "Myanmar"), + ("nabataean", "Nabataean"), + ("nand", "Nandinagari"), + ("nandinagari", "Nandinagari"), + ("narb", "Old_North_Arabian"), + ("nbat", "Nabataean"), + ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), + ("nko", "Nko"), + ("nkoo", "Nko"), + ("nshu", "Nushu"), + ("nushu", "Nushu"), + ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), + ("ogam", "Ogham"), + ("ogham", "Ogham"), + ("olchiki", "Ol_Chiki"), + ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), + ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), + ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), + ("oldsogdian", "Old_Sogdian"), + ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), + ("oriya", "Oriya"), + ("orkh", "Old_Turkic"), + ("orya", "Oriya"), + ("osage", "Osage"), + ("osge", "Osage"), + ("osma", "Osmanya"), + ("osmanya", "Osmanya"), + ("pahawhhmong", "Pahawh_Hmong"), + ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), + ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), + ("perm", "Old_Permic"), + ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), + ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), + ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), + ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), + ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), + ("qaai", "Inherited"), + ("rejang", "Rejang"), + ("rjng", "Rejang"), + ("rohg", "Hanifi_Rohingya"), + ("runic", "Runic"), + ("runr", "Runic"), + ("samaritan", "Samaritan"), + ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), + ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), + ("sgnw", "SignWriting"), + ("sharada", "Sharada"), + ("shavian", "Shavian"), + ("shaw", "Shavian"), + ("shrd", "Sharada"), + ("sidd", "Siddham"), + ("siddham", "Siddham"), + ("signwriting", "SignWriting"), + ("sind", "Khudawadi"), + ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), + ("sogd", "Sogdian"), + ("sogdian", 
"Sogdian"), + ("sogo", "Old_Sogdian"), + ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), + ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), + ("sund", "Sundanese"), + ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), + ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), + ("syriac", "Syriac"), + ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), + ("tagbanwa", "Tagbanwa"), + ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), + ("taiviet", "Tai_Viet"), + ("takr", "Takri"), + ("takri", "Takri"), + ("tale", "Tai_Le"), + ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), + ("taml", "Tamil"), + ("tang", "Tangut"), + ("tangut", "Tangut"), + ("tavt", "Tai_Viet"), + ("telu", "Telugu"), + ("telugu", "Telugu"), + ("tfng", "Tifinagh"), + ("tglg", "Tagalog"), + ("thaa", "Thaana"), + ("thaana", "Thaana"), + ("thai", "Thai"), + ("tibetan", "Tibetan"), + ("tibt", "Tibetan"), + ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), + ("tirhuta", "Tirhuta"), + ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), + ("unknown", "Unknown"), + ("vai", "Vai"), + ("vaii", "Vai"), + ("wancho", "Wancho"), + ("wara", "Warang_Citi"), + ("warangciti", "Warang_Citi"), + ("wcho", "Wancho"), + ("xpeo", "Old_Persian"), + ("xsux", "Cuneiform"), + ("yezi", "Yezidi"), + ("yezidi", "Yezidi"), + ("yi", "Yi"), + ("yiii", "Yi"), + ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), + ("zinh", "Inherited"), + ("zyyy", "Common"), + ("zzzz", "Unknown"), + ], + ), + ( + "Sentence_Break", + &[ + ("at", "ATerm"), + ("aterm", "ATerm"), + ("cl", "Close"), + ("close", "Close"), + ("cr", "CR"), + ("ex", "Extend"), + ("extend", "Extend"), + ("fo", "Format"), + ("format", "Format"), + ("le", "OLetter"), + ("lf", "LF"), + ("lo", "Lower"), + ("lower", "Lower"), + ("nu", "Numeric"), + ("numeric", "Numeric"), + ("oletter", "OLetter"), + ("other", "Other"), + ("sc", "SContinue"), + ("scontinue", "SContinue"), + ("se", "Sep"), + ("sep", "Sep"), + ("sp", "Sp"), + ("st", "STerm"), + ("sterm", 
"STerm"), + ("up", "Upper"), + ("upper", "Upper"), + ("xx", "Other"), + ], + ), + ( + "Word_Break", + &[ + ("aletter", "ALetter"), + ("cr", "CR"), + ("doublequote", "Double_Quote"), + ("dq", "Double_Quote"), + ("eb", "E_Base"), + ("ebase", "E_Base"), + ("ebasegaz", "E_Base_GAZ"), + ("ebg", "E_Base_GAZ"), + ("em", "E_Modifier"), + ("emodifier", "E_Modifier"), + ("ex", "ExtendNumLet"), + ("extend", "Extend"), + ("extendnumlet", "ExtendNumLet"), + ("fo", "Format"), + ("format", "Format"), + ("gaz", "Glue_After_Zwj"), + ("glueafterzwj", "Glue_After_Zwj"), + ("hebrewletter", "Hebrew_Letter"), + ("hl", "Hebrew_Letter"), + ("ka", "Katakana"), + ("katakana", "Katakana"), + ("le", "ALetter"), + ("lf", "LF"), + ("mb", "MidNumLet"), + ("midletter", "MidLetter"), + ("midnum", "MidNum"), + ("midnumlet", "MidNumLet"), + ("ml", "MidLetter"), + ("mn", "MidNum"), + ("newline", "Newline"), + ("nl", "Newline"), + ("nu", "Numeric"), + ("numeric", "Numeric"), + ("other", "Other"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("singlequote", "Single_Quote"), + ("sq", "Single_Quote"), + ("wsegspace", "WSegSpace"), + ("xx", "Other"), + ("zwj", "ZWJ"), + ], + ), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/script.rs b/vendor/regex-syntax/src/unicode_tables/script.rs new file mode 100644 index 000000000..cd86cba0d --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/script.rs @@ -0,0 +1,1218 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", 
LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangut", TANGUT), + 
("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = + &[('đ¤', 'đĨ'), ('đĨ', 'đĨ'), ('đĨ', 'đĨ')]; + +pub const AHOM: &'static [(char, char)] = + &[('đ', 'đ'), ('\u{1171d}', '\u{1172b}'), ('đ°', 'đŋ')]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('đ', 'đ')]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), + ('Ø', 'Ø'), + ('Ø', '\u{61a}'), + ('\u{61c}', '\u{61c}'), + ('Ø', 'Ø'), + ('Ø ', 'Øŋ'), + ('Ų', 'Ų'), + ('\u{656}', 'Ų¯'), + ('Ųą', '\u{6dc}'), + ('Û', 'Ûŋ'), + ('Ũ', 'Ũŋ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{8ff}'), + ('ī', 'ī¯'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇŊ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('đš ', 'đšž'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đģ°', 'đģą'), +]; + +pub const ARMENIAN: &'static [(char, char)] = + &[('Ôą', 'Õ'), ('Õ', 'Ö'), ('Ö', 'Ö'), ('īŦ', 'īŦ')]; + +pub const AVESTAN: &'static [(char, char)] = &[('đŦ', 'đŦĩ'), ('đŦš', 'đŦŋ')]; + +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'á'), ('á', 'áŧ')]; + +pub const BAMUM: &'static [(char, char)] = &[('ę ', 'ęˇ'), ('đ ', 'đ¨¸')]; + +pub const BASSA_VAH: &'static [(char, char)] = + &[('đĢ', 'đĢ'), ('\u{16af0}', 'đĢĩ')]; + +pub const BATAK: &'static [(char, char)] = &[('á¯', 'á¯ŗ'), ('á¯ŧ', 'á¯ŋ')]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('āĻ', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('\u{9bc}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§Ļ', '\u{9fe}'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = + &[('đ°', 'đ°'), ('đ°', '\u{11c36}'), ('\u{11c38}', 'đą
'), ('đą', 'đąŦ')]; + +pub const BOPOMOFO: &'static [(char, char)] = + &[('ËĒ', 'ËĢ'), ('ã
', 'ã¯'), ('ã ', '\u{31bf}')]; + +pub const BRAHMI: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đ¯'), ('\u{1107f}', '\u{1107f}')]; + +pub const BRAILLE: &'static [(char, char)] = &[('â ', 'âŖŋ')]; + +pub const BUGINESE: &'static [(char, char)] = &[('á¨', '\u{1a1b}'), ('á¨', 'á¨')]; + +pub const BUHID: &'static [(char, char)] = &[('á', '\u{1753}')]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = + &[('á', 'áŋ'), ('áĸ°', 'áŖĩ')]; + +pub const CARIAN: &'static [(char, char)] = &[('đ ', 'đ')]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = + &[('đ°', 'đŖ'), ('đ¯', 'đ¯')]; + +pub const CHAKMA: &'static [(char, char)] = + &[('\u{11100}', '\u{11134}'), ('đļ', '\u{11147}')]; + +pub const CHAM: &'static [(char, char)] = + &[('ę¨', '\u{aa36}'), ('ęŠ', 'ęŠ'), ('ęŠ', 'ęŠ'), ('ęŠ', 'ęŠ')]; + +pub const CHEROKEE: &'static [(char, char)] = + &[('á ', 'áĩ'), ('á¸', 'áŊ'), ('ę°', 'ęŽŋ')]; + +pub const CHORASMIAN: &'static [(char, char)] = &[('\u{10fb0}', '\u{10fcb}')]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\u{0}', '@'), + ('[', '`'), + ('{', 'Š'), + ('ÂĢ', 'š'), + ('Âģ', 'Âŋ'), + ('Ã', 'Ã'), + ('Ãˇ', 'Ãˇ'), + ('Ęš', 'Ë'), + ('ËĨ', 'ËŠ'), + ('ËŦ', 'Ëŋ'), + ('Í´', 'Í´'), + ('Íž', 'Íž'), + ('Î
', 'Î
'), + ('Î', 'Î'), + ('\u{605}', '\u{605}'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ų', 'Ų'), + ('\u{6dd}', '\u{6dd}'), + ('\u{8e2}', '\u{8e2}'), + ('āĨ¤', 'āĨĨ'), + ('ā¸ŋ', 'ā¸ŋ'), + ('āŋ', 'āŋ'), + ('áģ', 'áģ'), + ('áĢ', 'á'), + ('áĩ', 'áļ'), + ('á ', 'á '), + ('á
', 'á
'), + ('áŗ', 'áŗ'), + ('áŗĄ', 'áŗĄ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗˇ'), + ('áŗē', 'áŗē'), + ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{2064}'), + ('\u{2066}', 'â°'), + ('â´', 'âž'), + ('â', 'â'), + ('â ', 'âŋ'), + ('â', 'âĨ'), + ('â§', 'âŠ'), + ('âŦ', 'âą'), + ('âŗ', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'âĻ'), + ('â', 'â'), + ('â ', 'âŋ'), + ('â¤', 'âŗ'), + ('âļ', 'âŽ'), + ('\u{2b97}', 'â¯ŋ'), + ('â¸', '\u{2e52}'), + ('âŋ°', 'âŋģ'), + ('\u{3000}', 'ã'), + ('ã', 'ã'), + ('ã', 'ã '), + ('ã°', 'ãˇ'), + ('ãŧ', 'ãŋ'), + ('ã', 'ã'), + ('ã ', 'ã '), + ('ãģ', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ãŖ'), + ('ã ', 'ã'), + ('ãŋ', 'ã'), + ('ãŋ', 'ãŋ'), + ('ã', 'ãŋ'), + ('äˇ', 'äˇŋ'), + ('ę', 'ęĄ'), + ('ę', 'ę'), + ('ę °', 'ę š'), + ('ꤎ', 'ꤎ'), + ('ę§', 'ę§'), + ('ę', 'ę'), + ('\u{ab6a}', '\u{ab6b}'), + ('ī´ž', 'ī´ŋ'), + ('ī¸', 'ī¸'), + ('ī¸°', 'īš'), + ('īš', 'īšĻ'), + ('īš¨', 'īšĢ'), + ('\u{feff}', '\u{feff}'), + ('īŧ', 'īŧ '), + ('īŧģ', 'īŊ'), + ('īŊ', 'īŊĨ'), + ('īŊ°', 'īŊ°'), + ('\u{ff9e}', '\u{ff9f}'), + ('īŋ ', 'īŋĻ'), + ('īŋ¨', 'īŋŽ'), + ('\u{fff9}', 'īŋŊ'), + ('đ', 'đ'), + ('đ', 'đŗ'), + ('đˇ', 'đŋ'), + ('đ', '\u{1019c}'), + ('đ', 'đŧ'), + ('đĄ', 'đģ'), + ('đŋĸ', 'đŋŖ'), + ('\u{1bca0}', '\u{1bca3}'), + ('đ', 'đĩ'), + ('đ', 'đĻ'), + ('đŠ', 'đ
Ļ'), + ('đ
Ē', '\u{1d17a}'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('đŽ', 'đ¨'), + ('đ ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đŋ'), + ('đąą', 'đ˛´'), + ('đ´', 'đ´Ŋ'), + ('đ', 'đĢ'), + ('đ°', 'đ'), + ('đ ', 'đŽ'), + ('đą', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', '\u{1f1ad}'), + ('đĻ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ ', 'đĨ'), + ('đ', '\u{1f6d7}'), + ('đ ', 'đŦ'), + ('đ°', '\u{1f6fc}'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đĢ'), + ('đ ', 'đ '), + ('đ ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đĄ ', 'đĸ'), + ('đĸ', 'đĸ'), + ('\u{1f8b0}', '\u{1f8b1}'), + ('đ¤', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đŠ'), + ('đŠ ', 'đŠ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), + ('\u{1fb00}', '\u{1fb92}'), + ('\u{1fb94}', '\u{1fbca}'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = + &[('Īĸ', 'Ī¯'), ('â˛', 'âŗŗ'), ('âŗš', 'âŗŋ')]; + +pub const CUNEIFORM: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đŽ'), ('đ°', 'đ´'), ('đ', 'đ')]; + +pub const CYPRIOT: &'static [(char, char)] = + &[('đ ', 'đ
'), ('đ ', 'đ '), ('đ ', 'đ ĩ'), ('đ ˇ', 'đ ¸'), ('đ ŧ', 'đ ŧ'), ('đ ŋ', 'đ ŋ')]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Đ', '\u{484}'), + ('\u{487}', 'Ô¯'), + ('á˛', 'á˛'), + ('á´Ģ', 'á´Ģ'), + ('áĩ¸', 'áĩ¸'), + ('\u{2de0}', '\u{2dff}'), + ('ę', '\u{a69f}'), + ('\u{fe2e}', '\u{fe2f}'), +]; + +pub const DESERET: &'static [(char, char)] = &[('đ', 'đ')]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('\u{900}', 'āĨ'), + ('\u{955}', '\u{963}'), + ('āĨĻ', 'āĨŋ'), + ('\u{a8e0}', '\u{a8ff}'), +]; + +pub const DIVES_AKURU: &'static [(char, char)] = &[ + ('\u{11900}', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{11946}'), + ('\u{11950}', '\u{11959}'), +]; + +pub const DOGRA: &'static [(char, char)] = &[('đ ', 'đ ģ')]; + +pub const DUPLOYAN: &'static [(char, char)] = + &[('đ°', 'đąĒ'), ('đą°', 'đąŧ'), ('đ˛', 'đ˛'), ('đ˛', 'đ˛'), ('đ˛', 'đ˛')]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = + &[('đ', 'đŽ'), ('\u{13430}', '\u{13438}')]; + +pub const ELBASAN: &'static [(char, char)] = &[('đ', 'đ§')]; + +pub const ELYMAIC: &'static [(char, char)] = &[('đŋ ', 'đŋļ')]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('\u{135d}', 'áŧ'), + ('á', 'á'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'áŋ'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[('đ°', 'đ')]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('\u{11300}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('\u{1133c}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('Í°', 'Íŗ'), + ('Íĩ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ĪĄ'), + ('Ī°', 'Īŋ'), + ('á´Ļ', 'á´Ē'), + ('áĩ', 'áĩĄ'), + ('áĩĻ', 'áĩĒ'), + ('áļŋ', 'áļŋ'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋž'), + ('âĻ', 'âĻ'), + ('ęĨ', 'ęĨ'), + ('đ
', 'đ'), + ('đ ', 'đ '), + ('đ', 'đ
'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('āĢ', 'āĢ'), + ('āĢ ', '\u{ae3}'), + ('āĢĻ', 'āĢą'), + ('āĢš', '\u{aff}'), +]; + +pub const GUNJALA_GONDI: &'static [(char, char)] = &[ + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('\u{a01}', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', 'āŠļ'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('âē', 'âē'), + ('âē', 'âģŗ'), + ('âŧ', 'âŋ'), + ('ã
', 'ã
'), + ('ã', 'ã'), + ('ãĄ', 'ãŠ'), + ('ã¸', 'ãģ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('á', 'áŋ'), + ('\u{302e}', '\u{302f}'), + ('ãą', 'ã'), + ('ã', 'ã'), + ('ã ', 'ãž'), + ('ęĨ ', 'ęĨŧ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), +]; + +pub const HANIFI_ROHINGYA: &'static [(char, char)] = + &[('đ´', '\u{10d27}'), ('đ´°', 'đ´š')]; + +pub const HANUNOO: &'static [(char, char)] = &[('á ', '\u{1734}')]; + +pub const HATRAN: &'static [(char, char)] = + &[('đŖ ', 'đŖ˛'), ('đŖ´', 'đŖĩ'), ('đŖģ', 'đŖŋ')]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('\u{591}', '\u{5c7}'), + ('×', '×Ē'), + ('ׯ', '×´'), + ('īŦ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'ī'), +]; + +pub const HIRAGANA: &'static [(char, char)] = + &[('ã', 'ã'), ('ã', 'ã'), ('đ', 'đ'), ('đ
', 'đ
'), ('đ', 'đ')]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = + &[('đĄ', 'đĄ'), ('đĄ', 'đĄ')]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{485}', '\u{486}'), + ('\u{64b}', '\u{655}'), + ('\u{670}', '\u{670}'), + ('\u{951}', '\u{954}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{302a}', '\u{302d}'), + ('\u{3099}', '\u{309a}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2d}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{1133b}', '\u{1133b}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = + &[('đ ', 'đ˛'), ('đ¸', 'đŋ')]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đ')]; + +pub const JAVANESE: &'static [(char, char)] = + &[('\u{a980}', 'ę§'), ('ę§', 'ę§'), ('ę§', 'ę§')]; + +pub const KAITHI: &'static [(char, char)] = + &[('\u{11080}', 'đ'), ('\u{110cd}', '\u{110cd}')]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('\u{cbc}', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', '\u{ce3}'), + ('āŗĻ', 'āŗ¯'), + ('āŗą', 'āŗ˛'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('ãĄ', 'ãē'), + ('ãŊ', 'ãŋ'), + ('ã°', 'ãŋ'), + ('ã', 'ãž'), + ('ã', 'ã'), + ('īŊĻ', 'īŊ¯'), + ('īŊą', 'īž'), + ('đ', 'đ'), + ('đ
¤', 'đ
§'), +]; + +pub const KAYAH_LI: &'static [(char, char)] = &[('ę¤', '\u{a92d}'), ('ę¤¯', 'ę¤¯')]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', 'đŠ'), + ('đŠ', 'đŠ'), +]; + +pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = + &[('\u{16fe4}', '\u{16fe4}'), ('\u{18b00}', '\u{18cd5}')]; + +pub const KHMER: &'static [(char, char)] = + &[('á', '\u{17dd}'), ('á ', 'áŠ'), ('á°', 'áš'), ('᧠', 'á§ŋ')]; + +pub const KHOJKI: &'static [(char, char)] = &[('đ', 'đ'), ('đ', '\u{1123e}')]; + +pub const KHUDAWADI: &'static [(char, char)] = + &[('đ°', '\u{112ea}'), ('đ°', 'đš')]; + +pub const LAO: &'static [(char, char)] = &[ + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'ʸ'), + ('Ë ', 'ˤ'), + ('á´', 'á´Ĩ'), + ('á´Ŧ', 'áĩ'), + ('áĩĸ', 'áĩĨ'), + ('áĩĢ', 'áĩˇ'), + ('áĩš', 'áļž'), + ('á¸', 'áģŋ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('âĒ', 'âĢ'), + ('â˛', 'â˛'), + ('â
', 'â
'), + ('â
', 'â'), + ('âą ', 'âąŋ'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ęŋ'), + ('ęŦ°', 'ę'), + ('ę', 'ę¤'), + ('ęĻ', '\u{ab69}'), + ('īŦ', 'īŦ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), +]; + +pub const LEPCHA: &'static [(char, char)] = + &[('á°', '\u{1c37}'), ('á°ģ', 'áą'), ('áą', 'áą')]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('áĨ', 'áĨ'), + ('áĨ', 'áĨ'), +]; + +pub const LINEAR_A: &'static [(char, char)] = + &[('đ', 'đļ'), ('đ', 'đ'), ('đ ', 'đ§')]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), +]; + +pub const LISU: &'static [(char, char)] = + &[('ę', 'ęŋ'), ('\u{11fb0}', '\u{11fb0}')]; + +pub const LYCIAN: &'static [(char, char)] = &[('đ', 'đ')]; + +pub const LYDIAN: &'static [(char, char)] = &[('đ¤ ', 'đ¤š'), ('đ¤ŋ', 'đ¤ŋ')]; + +pub const MAHAJANI: &'static [(char, char)] = &[('đ
', 'đ
ļ')]; + +pub const MAKASAR: &'static [(char, char)] = &[('đģ ', 'đģ¸')]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('\u{d00}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d63}'), + ('āĩĻ', 'āĩŋ'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[('āĄ', '\u{85b}'), ('āĄ', 'āĄ')]; + +pub const MANICHAEAN: &'static [(char, char)] = + &[('đĢ', '\u{10ae6}'), ('đĢĢ', 'đĢļ')]; + +pub const MARCHEN: &'static [(char, char)] = + &[('đą°', 'đ˛'), ('\u{11c92}', '\u{11ca7}'), ('đ˛Š', '\u{11cb6}')]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('đĩ', 'đĩ'), +]; + +pub const MEDEFAIDRIN: &'static [(char, char)] = &[('đš', 'đē')]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = + &[('ęĢ ', '\u{aaf6}'), ('ę¯', '\u{abed}'), ('ę¯°', 'ę¯š')]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = + &[('đ ', 'đŖ'), ('đŖ', '\u{1e8d6}')]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = + &[('đĻ ', 'đĻˇ'), ('đĻŧ', 'đ§'), ('đ§', 'đ§ŋ')]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('đĻ', 'đĻ')]; + +pub const MIAO: &'static [(char, char)] = + &[('đŧ', 'đŊ'), ('\u{16f4f}', 'đž'), ('\u{16f8f}', 'đž')]; + +pub const MODI: &'static [(char, char)] = &[('đ', 'đ'), ('đ', 'đ')]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('á ', 'á '), + ('á ', 'á '), + ('á ', '\u{180e}'), + ('á ', 'á '), + ('á ', '᥸'), + ('áĸ', 'áĸĒ'), + ('đ ', 'đŦ'), +]; + +pub const MRO: &'static [(char, char)] = &[('đŠ', 'đŠ'), ('đŠ ', 'đŠŠ'), ('đŠŽ', 'đŠ¯')]; + +pub const MULTANI: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đ'), ('đ', 'đ'), ('đ', 'đ'), ('đ', 'đŠ')]; + +pub const MYANMAR: &'static [(char, char)] = + &[('á', 'á'), ('ę§ ', '꧞'), ('ęŠ ', 'ęŠŋ')]; + +pub const NABATAEAN: &'static [(char, char)] = &[('đĸ', 'đĸ'), ('đĸ§', 'đĸ¯')]; + +pub const 
NANDINAGARI: &'static [(char, char)] = + &[('đĻ ', 'đĻ§'), ('đĻĒ', '\u{119d7}'), ('\u{119da}', 'đ§¤')]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = + &[('áĻ', 'áĻĢ'), ('áĻ°', 'á§'), ('á§', 'á§'), ('á§', 'á§')]; + +pub const NEWA: &'static [(char, char)] = &[('đ', 'đ'), ('đ', '\u{11461}')]; + +pub const NKO: &'static [(char, char)] = &[('ß', 'ßē'), ('\u{7fd}', 'ßŋ')]; + +pub const NUSHU: &'static [(char, char)] = &[('đŋĄ', 'đŋĄ'), ('đ
°', 'đģ')]; + +pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = + &[('đ', 'đŦ'), ('\u{1e130}', 'đŊ'), ('đ
', 'đ
'), ('đ
', 'đ
')]; + +pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', 'á')]; + +pub const OL_CHIKI: &'static [(char, char)] = &[('áą', 'áąŋ')]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = + &[('đ˛', 'đ˛˛'), ('đŗ', 'đŗ˛'), ('đŗē', 'đŗŋ')]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[('đ', 'đŖ'), ('đ', 'đ¯')]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('đĒ', 'đĒ')]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[('đ', '\u{1037a}')]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[('đ ', 'đ'), ('đ', 'đ')]; + +pub const OLD_SOGDIAN: &'static [(char, char)] = &[('đŧ', 'đŧ§')]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('đŠ ', 'đŠŋ')]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[('đ°', 'đą')]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('\u{b3c}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ā', 'ā'), + ('ā', '\u{b63}'), + ('āĻ', 'āˇ'), +]; + +pub const OSAGE: &'static [(char, char)] = &[('đ°', 'đ'), ('đ', 'đģ')]; + +pub const OSMANYA: &'static [(char, char)] = &[('đ', 'đ'), ('đ ', 'đŠ')]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('đŦ', 'đ
'), ('đ', 'đ'), ('đ', 'đĄ'), ('đŖ', 'đˇ'), ('đŊ', 'đŽ')]; + +pub const PALMYRENE: &'static [(char, char)] = &[('đĄ ', 'đĄŋ')]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[('đĢ', 'đĢ¸')]; + +pub const PHAGS_PA: &'static [(char, char)] = &[('ęĄ', 'ꥡ')]; + +pub const PHOENICIAN: &'static [(char, char)] = &[('đ¤', 'đ¤'), ('đ¤', 'đ¤')]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = + &[('đŽ', 'đŽ'), ('đŽ', 'đŽ'), ('đŽŠ', 'đŽ¯')]; + +pub const REJANG: &'static [(char, char)] = &[('ꤰ', 'ęĨ'), ('ęĨ', 'ęĨ')]; + +pub const RUNIC: &'static [(char, char)] = &[('á ', 'áĒ'), ('áŽ', 'á¸')]; + +pub const SAMARITAN: &'static [(char, char)] = &[('ā ', '\u{82d}'), ('ā °', 'ā ž')]; + +pub const SAURASHTRA: &'static [(char, char)] = + &[('ęĸ', '\u{a8c5}'), ('ęŖ', 'ęŖ')]; + +pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', 'đ')]; + +pub const SHAVIAN: &'static [(char, char)] = &[('đ', 'đŋ')]; + +pub const SIDDHAM: &'static [(char, char)] = + &[('đ', '\u{115b5}'), ('đ¸', '\u{115dd}')]; + +pub const SIGNWRITING: &'static [(char, char)] = + &[('đ ', 'đĒ'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('\u{d81}', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇĻ', 'āˇ¯'), + ('āˇ˛', 'āˇ´'), + ('đĄ', 'đ´'), +]; + +pub const SOGDIAN: &'static [(char, char)] = &[('đŧ°', 'đŊ')]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[('đ', 'đ¨'), ('đ°', 'đš')]; + +pub const SOYOMBO: &'static [(char, char)] = &[('đŠ', 'đĒĸ')]; + +pub const SUNDANESE: &'static [(char, char)] = + &[('\u{1b80}', 'áŽŋ'), ('áŗ', 'áŗ')]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ę ', '\u{a82c}')]; + +pub const SYRIAC: &'static [(char, char)] = + &[('Ü', 'Ü'), ('\u{70f}', '\u{74a}'), ('Ũ', 'Ũ'), ('āĄ ', 'āĄĒ')]; + +pub const TAGALOG: &'static [(char, char)] = &[('á', 'á'), ('á', '\u{1714}')]; + +pub const TAGBANWA: &'static [(char, char)] = + &[('á ', 'áŦ'), ('áŽ', 'á°'), ('\u{1772}', '\u{1773}')]; + +pub const TAI_LE: &'static [(char, char)] = &[('áĨ', 'áĨ'), ('áĨ°', 'áĨ´')]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ ', 'áĒ'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[('ęĒ', 'ęĢ'), ('ęĢ', 'ęĢ')]; + +pub const TAKRI: &'static [(char, char)] = &[('đ', 'đ¸'), ('đ', 'đ')]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('ā¯Ļ', 'ā¯ē'), + ('đŋ', 'đŋą'), + ('đŋŋ', 'đŋŋ'), +]; + +pub const TANGUT: &'static [(char, char)] = &[ + ('đŋ ', 'đŋ '), + ('đ', 'đˇ'), + ('đ ', '\u{18aff}'), + ('\u{18d00}', '\u{18d08}'), +]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('\u{c00}', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('āą', 'āą'), + ('āą ', '\u{c63}'), + ('āąĻ', 'āą¯'), + ('āąˇ', 'āąŋ'), +]; + +pub const THAANA: &'static [(char, char)] = &[('Ū', 'Ūą')]; + +pub const THAI: &'static [(char, char)] = &[('ā¸', '\u{e3a}'), ('āš', 'āš')]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('āŧ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('\u{f71}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('āžž', 'āŋ'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), +]; + +pub const TIFINAGH: &'static [(char, char)] = + &[('â´°', 'âĩ§'), ('âĩ¯', 'âĩ°'), ('\u{2d7f}', '\u{2d7f}')]; + +pub const TIRHUTA: &'static [(char, char)] = &[('đ', 'đ'), ('đ', 'đ')]; + +pub const UGARITIC: &'static [(char, char)] = &[('đ', 'đ'), ('đ', 'đ')]; + +pub const VAI: &'static [(char, char)] = &[('ę', 'ęĢ')]; + +pub const WANCHO: &'static [(char, char)] = &[('đ', 'đš'), ('đŋ', 'đŋ')]; + +pub const WARANG_CITI: &'static [(char, char)] = &[('đĸ ', 'đŖ˛'), ('đŖŋ', 'đŖŋ')]; + +pub const YEZIDI: &'static [(char, char)] = &[ + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10ead}'), + ('\u{10eb0}', '\u{10eb1}'), +]; + +pub const YI: &'static [(char, char)] = &[('ę', 'ę'), ('ę', 'ę')]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('đ¨', '\u{11a47}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/script_extension.rs b/vendor/regex-syntax/src/unicode_tables/script_extension.rs new file mode 
100644 index 000000000..7fca2af9d --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/script_extension.rs @@ -0,0 +1,1396 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script-extension ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + 
("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", 
SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = + &[('Ų', 'Ų'), ('đ¤', 'đĨ'), ('đĨ', 'đĨ'), ('đĨ', 'đĨ')]; + +pub const AHOM: &'static [(char, char)] = + &[('đ', 'đ'), ('\u{1171d}', '\u{1172b}'), ('đ°', 'đŋ')]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('đ', 'đ')]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), + ('Ø', '\u{61c}'), + ('Ø', '\u{6dc}'), + ('Û', 'Ûŋ'), + ('Ũ', 'Ũŋ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', '\u{8ff}'), + ('ī', 'ī¯'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇŊ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('\u{102e0}', 'đģ'), + ('đš ', 'đšž'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đģ°', 'đģą'), +]; + +pub const ARMENIAN: &'static [(char, char)] = + &[('Ôą', 'Õ'), ('Õ', 'Ö'), ('Ö', 'Ö'), ('īŦ', 'īŦ')]; + +pub const AVESTAN: &'static [(char, char)] = &[('đŦ', 'đŦĩ'), ('đŦš', 'đŦŋ')]; + +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'á'), ('á', 'áŧ')]; + +pub const BAMUM: &'static [(char, char)] = &[('ę ', 'ęˇ'), ('đ ', 'đ¨¸')]; + +pub const BASSA_VAH: &'static [(char, char)] = + &[('đĢ', 'đĢ'), ('\u{16af0}', 'đĢĩ')]; + +pub const BATAK: &'static [(char, char)] = &[('á¯', 'á¯ŗ'), ('á¯ŧ', 'á¯ŋ')]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('āĻ', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('\u{9bc}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('\u{9d7}', '\u{9d7}'), + ('ā§', 'ā§'), + ('ā§', '\u{9e3}'), + ('ā§Ļ', '\u{9fe}'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '\u{1cd2}'), + ('\u{1cd5}', '\u{1cd6}'), + ('\u{1cd8}', '\u{1cd8}'), + ('áŗĄ', 'áŗĄ'), + ('áŗĒ', 'áŗĒ'), + ('\u{1ced}', '\u{1ced}'), + ('áŗ˛', 'áŗ˛'), + ('áŗĩ', 'áŗˇ'), + ('\u{a8f1}', '\u{a8f1}'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = + &[('đ°', 'đ°'), ('đ°', '\u{11c36}'), ('\u{11c38}', 'đą
'), ('đą', 'đąŦ')]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('ËĒ', 'ËĢ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('\u{302a}', '\u{302d}'), + ('ã°', 'ã°'), + ('ãˇ', 'ãˇ'), + ('ãģ', 'ãģ'), + ('ã
', 'ã¯'), + ('ã ', '\u{31bf}'), + ('īš
', 'īš'), + ('īŊĄ', 'īŊĨ'), +]; + +pub const BRAHMI: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đ¯'), ('\u{1107f}', '\u{1107f}')]; + +pub const BRAILLE: &'static [(char, char)] = &[('â ', 'âŖŋ')]; + +pub const BUGINESE: &'static [(char, char)] = + &[('á¨', '\u{1a1b}'), ('á¨', 'á¨'), ('ę§', 'ę§')]; + +pub const BUHID: &'static [(char, char)] = &[('áĩ', 'áļ'), ('á', '\u{1753}')]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = + &[('á', 'áŋ'), ('áĸ°', 'áŖĩ')]; + +pub const CARIAN: &'static [(char, char)] = &[('đ ', 'đ')]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = + &[('đ°', 'đŖ'), ('đ¯', 'đ¯')]; + +pub const CHAKMA: &'static [(char, char)] = + &[('ā§Ļ', 'ā§¯'), ('á', 'á'), ('\u{11100}', '\u{11134}'), ('đļ', '\u{11147}')]; + +pub const CHAM: &'static [(char, char)] = + &[('ę¨', '\u{aa36}'), ('ęŠ', 'ęŠ'), ('ęŠ', 'ęŠ'), ('ęŠ', 'ęŠ')]; + +pub const CHEROKEE: &'static [(char, char)] = + &[('á ', 'áĩ'), ('á¸', 'áŊ'), ('ę°', 'ęŽŋ')]; + +pub const CHORASMIAN: &'static [(char, char)] = &[('\u{10fb0}', '\u{10fcb}')]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\u{0}', '@'), + ('[', '`'), + ('{', 'Š'), + ('ÂĢ', 'š'), + ('Âģ', 'Âŋ'), + ('Ã', 'Ã'), + ('Ãˇ', 'Ãˇ'), + ('Ęš', 'Ë'), + ('ËĨ', 'ËŠ'), + ('ËŦ', 'Ëŋ'), + ('Í´', 'Í´'), + ('Íž', 'Íž'), + ('Î
', 'Î
'), + ('Î', 'Î'), + ('\u{605}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{8e2}', '\u{8e2}'), + ('ā¸ŋ', 'ā¸ŋ'), + ('āŋ', 'āŋ'), + ('áĢ', 'á'), + ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{202e}'), + ('â°', '\u{2064}'), + ('\u{2066}', 'â°'), + ('â´', 'âž'), + ('â', 'â'), + ('â ', 'âŋ'), + ('â', 'âĨ'), + ('â§', 'âŠ'), + ('âŦ', 'âą'), + ('âŗ', 'â
'), + ('â
', 'â
'), + ('â', 'â'), + ('â', 'âĻ'), + ('â', 'â'), + ('â ', 'âŋ'), + ('â¤', 'âŗ'), + ('âļ', 'âŽ'), + ('\u{2b97}', 'â¯ŋ'), + ('â¸', 'âš'), + ('âš', '\u{2e52}'), + ('âŋ°', 'âŋģ'), + ('\u{3000}', '\u{3000}'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã ', 'ã '), + ('ãļ', 'ãļ'), + ('ã', 'ã'), + ('ãŋ', 'ãŋ'), + ('ãą', 'ãŋ'), + ('ã', 'ã'), + ('ãą', 'ãē'), + ('ã', 'ã'), + ('ãŋ', 'ãŋ'), + ('äˇ', 'äˇŋ'), + ('ę', 'ęĄ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('\u{ab6a}', '\u{ab6b}'), + ('ī´ž', 'ī´ŋ'), + ('ī¸', 'ī¸'), + ('ī¸°', 'īš'), + ('īš', 'īš'), + ('īš', 'īšĻ'), + ('īš¨', 'īšĢ'), + ('\u{feff}', '\u{feff}'), + ('īŧ', 'īŧ '), + ('īŧģ', 'īŊ'), + ('īŊ', 'īŊ '), + ('īŋ ', 'īŋĻ'), + ('īŋ¨', 'īŋŽ'), + ('\u{fff9}', 'īŋŊ'), + ('đ', '\u{1019c}'), + ('đ', 'đŧ'), + ('đŋĸ', 'đŋŖ'), + ('đ', 'đĩ'), + ('đ', 'đĻ'), + ('đŠ', 'đ
Ļ'), + ('đ
Ē', '\u{1d17a}'), + ('đ', 'đ'), + ('đ', 'đŠ'), + ('đŽ', 'đ¨'), + ('đ ', 'đŗ'), + ('đ', 'đ'), + ('đ˛', 'đ¸'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đŋ'), + ('đąą', 'đ˛´'), + ('đ´', 'đ´Ŋ'), + ('đ', 'đĢ'), + ('đ°', 'đ'), + ('đ ', 'đŽ'), + ('đą', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', '\u{1f1ad}'), + ('đĻ', 'đŋ'), + ('đ', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ'), + ('đ ', 'đĨ'), + ('đ', '\u{1f6d7}'), + ('đ ', 'đŦ'), + ('đ°', '\u{1f6fc}'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ ', 'đĢ'), + ('đ ', 'đ '), + ('đ ', 'đĄ'), + ('đĄ', 'đĄ'), + ('đĄ ', 'đĸ'), + ('đĸ', 'đĸ'), + ('\u{1f8b0}', '\u{1f8b1}'), + ('đ¤', '\u{1f978}'), + ('đĨē', '\u{1f9cb}'), + ('đ§', 'đŠ'), + ('đŠ ', 'đŠ'), + ('đŠ°', '\u{1fa74}'), + ('đŠ¸', 'đŠē'), + ('đĒ', '\u{1fa86}'), + ('đĒ', '\u{1faa8}'), + ('\u{1fab0}', '\u{1fab6}'), + ('\u{1fac0}', '\u{1fac2}'), + ('\u{1fad0}', '\u{1fad6}'), + ('\u{1fb00}', '\u{1fb92}'), + ('\u{1fb94}', '\u{1fbca}'), + ('\u{1fbf0}', '\u{1fbf9}'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = + &[('Īĸ', 'Ī¯'), ('â˛', 'âŗŗ'), ('âŗš', 'âŗŋ'), ('\u{102e0}', 'đģ')]; + +pub const CUNEIFORM: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đŽ'), ('đ°', 'đ´'), ('đ', 'đ')]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('đ', 'đ'), + ('đ', 'đŗ'), + ('đˇ', 'đŋ'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đ ŋ'), +]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Đ', 'Ô¯'), + ('á˛', 'á˛'), + ('á´Ģ', 'á´Ģ'), + ('áĩ¸', 'áĩ¸'), + ('\u{1df8}', '\u{1df8}'), + ('\u{2de0}', '\u{2dff}'), + ('âš', 'âš'), + ('ę', '\u{a69f}'), + ('\u{fe2e}', '\u{fe2f}'), +]; + +pub const DESERET: &'static [(char, char)] = &[('đ', 'đ')]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('\u{900}', '\u{952}'), + ('\u{955}', 'āĨŋ'), + ('\u{1cd0}', 'áŗļ'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{20f0}', '\u{20f0}'), + ('ę °', 'ę š'), + ('\u{a8e0}', '\u{a8ff}'), +]; + +pub const DIVES_AKURU: &'static [(char, char)] = &[ + ('\u{11900}', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{11946}'), + ('\u{11950}', '\u{11959}'), +]; + +pub const DOGRA: &'static [(char, char)] = + &[('āĨ¤', 'āĨ¯'), ('ę °', 'ę š'), ('đ ', 'đ ģ')]; + +pub const DUPLOYAN: &'static [(char, char)] = + &[('đ°', 'đąĒ'), ('đą°', 'đąŧ'), ('đ˛', 'đ˛'), ('đ˛', 'đ˛'), ('đ˛', '\u{1bca3}')]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = + &[('đ', 'đŽ'), ('\u{13430}', '\u{13438}')]; + +pub const ELBASAN: &'static [(char, char)] = &[('đ', 'đ§')]; + +pub const ELYMAIC: &'static [(char, char)] = &[('đŋ ', 'đŋļ')]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('\u{135d}', 'áŧ'), + ('á', 'á'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áŋ'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('\u{484}', '\u{484}'), + ('\u{487}', '\u{487}'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âš', 'âš'), + ('\u{a66f}', '\u{a66f}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[('đ°', 'đ')]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('ā¯Ļ', 'ā¯ŗ'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', 'áŗ'), + ('áŗ˛', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{20f0}', '\u{20f0}'), + ('\u{11300}', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('\u{1133b}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đŋ', 'đŋ'), + ('đŋ', 'đŋ'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('\u{342}', '\u{342}'), + ('\u{345}', '\u{345}'), + ('Í°', 'Íŗ'), + ('Íĩ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ĪĄ'), + ('Ī°', 'Īŋ'), + ('á´Ļ', 'á´Ē'), + ('áĩ', 'áĩĄ'), + ('áĩĻ', 'áĩĒ'), + ('áļŋ', '\u{1dc1}'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ¯'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋž'), + ('âĻ', 'âĻ'), + ('ęĨ', 'ęĨ'), + ('đ
', 'đ'), + ('đ ', 'đ '), + ('đ', 'đ
'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('\u{a81}', 'āĒ'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('āĢ', 'āĢ'), + ('āĢ ', '\u{ae3}'), + ('āĢĻ', 'āĢą'), + ('āĢš', '\u{aff}'), + ('ę °', 'ę š'), +]; + +pub const GUNJALA_GONDI: &'static [(char, char)] = &[ + ('āĨ¤', 'āĨĨ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', 'đļ'), + ('đļ ', 'đļŠ'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('\u{a01}', 'ā¨'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠĻ', 'āŠļ'), + ('ę °', 'ę š'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('âē', 'âē'), + ('âē', 'âģŗ'), + ('âŧ', 'âŋ'), + ('ã', 'ã'), + ('ã
', 'ã'), + ('ã', 'ã'), + ('ãĄ', '\u{302d}'), + ('ã°', 'ã°'), + ('ãˇ', 'ãŋ'), + ('ãģ', 'ãģ'), + ('ã', 'ã'), + ('ã', 'ãŖ'), + ('ã ', 'ã'), + ('ã', 'ã°'), + ('ã', 'ã'), + ('ãŋ', 'ãŋ'), + ('ã', 'ã°'), + ('ãģ', 'ãŋ'), + ('ã ', 'ãž'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īš
', 'īš'), + ('īŊĄ', 'īŊĨ'), + ('\u{16ff0}', '\u{16ff1}'), + ('đ ', 'đą'), + ('đ', 'đ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('á', 'áŋ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('\u{302e}', 'ã°'), + ('ãˇ', 'ãˇ'), + ('ãģ', 'ãģ'), + ('ãą', 'ã'), + ('ã', 'ã'), + ('ã ', 'ãž'), + ('ęĨ ', 'ęĨŧ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('īš
', 'īš'), + ('īŊĄ', 'īŊĨ'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), +]; + +pub const HANIFI_ROHINGYA: &'static [(char, char)] = &[ + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ų', 'Ų'), + ('Û', 'Û'), + ('đ´', '\u{10d27}'), + ('đ´°', 'đ´š'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[('á ', 'áļ')]; + +pub const HATRAN: &'static [(char, char)] = + &[('đŖ ', 'đŖ˛'), ('đŖ´', 'đŖĩ'), ('đŖģ', 'đŖŋ')]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('\u{591}', '\u{5c7}'), + ('×', '×Ē'), + ('ׯ', '×´'), + ('īŦ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'ī'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã°', 'ãĩ'), + ('ãˇ', 'ãˇ'), + ('ãŧ', 'ãŊ'), + ('ã', 'ã'), + ('\u{3099}', 'ã '), + ('ãģ', 'ãŧ'), + ('īš
', 'īš'), + ('īŊĄ', 'īŊĨ'), + ('īŊ°', 'īŊ°'), + ('\u{ff9e}', '\u{ff9f}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = + &[('đĄ', 'đĄ'), ('đĄ', 'đĄ')]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('\u{300}', '\u{341}'), + ('\u{343}', '\u{344}'), + ('\u{346}', '\u{362}'), + ('\u{953}', '\u{954}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1dc2}', '\u{1df7}'), + ('\u{1df9}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20ef}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2d}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = + &[('đ ', 'đ˛'), ('đ¸', 'đŋ')]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = + &[('đ', 'đ'), ('đ', 'đ')]; + +pub const JAVANESE: &'static [(char, char)] = + &[('\u{a980}', 'ę§'), ('ę§', 'ę§'), ('ę§', 'ę§')]; + +pub const KAITHI: &'static [(char, char)] = + &[('āĨĻ', 'āĨ¯'), ('ę °', 'ę š'), ('\u{11080}', 'đ'), ('\u{110cd}', '\u{110cd}')]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('\u{cbc}', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('āŗ', 'āŗ'), + ('āŗ ', '\u{ce3}'), + ('āŗĻ', 'āŗ¯'), + ('āŗą', 'āŗ˛'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '\u{1cd2}'), + ('\u{1cda}', '\u{1cda}'), + ('áŗ˛', 'áŗ˛'), + ('\u{1cf4}', '\u{1cf4}'), + ('ę °', 'ę ĩ'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã°', 'ãĩ'), + ('ãˇ', 'ãˇ'), + ('ãŧ', 'ãŊ'), + ('\u{3099}', 'ã'), + ('ã ', 'ãŋ'), + ('ã°', 'ãŋ'), + ('ã', 'ãž'), + ('ã', 'ã'), + ('īš
', 'īš'), + ('īŊĄ', '\u{ff9f}'), + ('đ', 'đ'), + ('đ
¤', 'đ
§'), +]; + +pub const KAYAH_LI: &'static [(char, char)] = &[('ę¤', 'ę¤¯')]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('đ¨', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', 'đŠ'), + ('đŠ', 'đŠ'), +]; + +pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = + &[('\u{16fe4}', '\u{16fe4}'), ('\u{18b00}', '\u{18cd5}')]; + +pub const KHMER: &'static [(char, char)] = + &[('á', '\u{17dd}'), ('á ', 'áŠ'), ('á°', 'áš'), ('᧠', 'á§ŋ')]; + +pub const KHOJKI: &'static [(char, char)] = + &[('āĢĻ', 'āĢ¯'), ('ę °', 'ę š'), ('đ', 'đ'), ('đ', '\u{1123e}')]; + +pub const KHUDAWADI: &'static [(char, char)] = + &[('āĨ¤', 'āĨĨ'), ('ę °', 'ę š'), ('đ°', '\u{112ea}'), ('đ°', 'đš')]; + +pub const LAO: &'static [(char, char)] = &[ + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('\u{ec8}', '\u{ecd}'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'ʸ'), + ('Ë ', 'ˤ'), + ('\u{363}', '\u{36f}'), + ('\u{485}', '\u{486}'), + ('\u{951}', '\u{952}'), + ('áģ', 'áģ'), + ('á´', 'á´Ĩ'), + ('á´Ŧ', 'áĩ'), + ('áĩĸ', 'áĩĨ'), + ('áĩĢ', 'áĩˇ'), + ('áĩš', 'áļž'), + ('á¸', 'áģŋ'), + ('\u{202f}', '\u{202f}'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('\u{20f0}', '\u{20f0}'), + ('âĒ', 'âĢ'), + ('â˛', 'â˛'), + ('â
', 'â
'), + ('â
', 'â'), + ('âą ', 'âąŋ'), + ('ę', 'ę'), + ('ęĸ', 'ę'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ęŋ'), + ('ꤎ', 'ꤎ'), + ('ęŦ°', 'ę'), + ('ę', 'ę¤'), + ('ęĻ', '\u{ab69}'), + ('īŦ', 'īŦ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), +]; + +pub const LEPCHA: &'static [(char, char)] = + &[('á°', '\u{1c37}'), ('á°ģ', 'áą'), ('áą', 'áą')]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('āĨĨ', 'āĨĨ'), + ('á¤', 'á¤'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('áĨ', 'áĨ'), + ('áĨ', 'áĨ'), +]; + +pub const LINEAR_A: &'static [(char, char)] = + &[('đ', 'đŗ'), ('đ', 'đļ'), ('đ', 'đ'), ('đ ', 'đ§')]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ', 'đ'), + ('đ', 'đŗ'), + ('đˇ', 'đŋ'), +]; + +pub const LISU: &'static [(char, char)] = + &[('ę', 'ęŋ'), ('\u{11fb0}', '\u{11fb0}')]; + +pub const LYCIAN: &'static [(char, char)] = &[('đ', 'đ')]; + +pub const LYDIAN: &'static [(char, char)] = &[('đ¤ ', 'đ¤š'), ('đ¤ŋ', 'đ¤ŋ')]; + +pub const MAHAJANI: &'static [(char, char)] = + &[('āĨ¤', 'āĨ¯'), ('ę °', 'ę š'), ('đ
', 'đ
ļ')]; + +pub const MAKASAR: &'static [(char, char)] = &[('đģ ', 'đģ¸')]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('\u{d00}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d63}'), + ('āĩĻ', 'āĩŋ'), + ('\u{1cda}', '\u{1cda}'), + ('ę °', 'ę ˛'), +]; + +pub const MANDAIC: &'static [(char, char)] = + &[('Ų', 'Ų'), ('āĄ', '\u{85b}'), ('āĄ', 'āĄ')]; + +pub const MANICHAEAN: &'static [(char, char)] = + &[('Ų', 'Ų'), ('đĢ', '\u{10ae6}'), ('đĢĢ', 'đĢļ')]; + +pub const MARCHEN: &'static [(char, char)] = + &[('đą°', 'đ˛'), ('\u{11c92}', '\u{11ca7}'), ('đ˛Š', '\u{11cb6}')]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('āĨ¤', 'āĨĨ'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('đĩ', 'đĩ'), +]; + +pub const MEDEFAIDRIN: &'static [(char, char)] = &[('đš', 'đē')]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = + &[('ęĢ ', '\u{aaf6}'), ('ę¯', '\u{abed}'), ('ę¯°', 'ę¯š')]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = + &[('đ ', 'đŖ'), ('đŖ', '\u{1e8d6}')]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = + &[('đĻ ', 'đĻˇ'), ('đĻŧ', 'đ§'), ('đ§', 'đ§ŋ')]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('đĻ', 'đĻ')]; + +pub const MIAO: &'static [(char, char)] = + &[('đŧ', 'đŊ'), ('\u{16f4f}', 'đž'), ('\u{16f8f}', 'đž')]; + +pub const MODI: &'static [(char, char)] = + &[('ę °', 'ę š'), ('đ', 'đ'), ('đ', 'đ')]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('á ', '\u{180e}'), + ('á ', 'á '), + ('á ', '᥸'), + ('áĸ', 'áĸĒ'), + ('\u{202f}', '\u{202f}'), + ('đ ', 'đŦ'), +]; + +pub const MRO: &'static [(char, char)] = &[('đŠ', 'đŠ'), ('đŠ ', 'đŠŠ'), ('đŠŽ', 'đŠ¯')]; + +pub const MULTANI: &'static [(char, char)] = + &[('āŠĻ', 'āŠ¯'), ('đ', 'đ'), ('đ', 'đ'), ('đ', 'đ'), ('đ', 'đ'), ('đ', 'đŠ')]; + +pub const MYANMAR: &'static 
[(char, char)] = + &[('á', 'á'), ('ꤎ', 'ꤎ'), ('ę§ ', '꧞'), ('ęŠ ', 'ęŠŋ')]; + +pub const NABATAEAN: &'static [(char, char)] = &[('đĸ', 'đĸ'), ('đĸ§', 'đĸ¯')]; + +pub const NANDINAGARI: &'static [(char, char)] = &[ + ('āĨ¤', 'āĨĨ'), + ('āŗĻ', 'āŗ¯'), + ('áŗŠ', 'áŗŠ'), + ('áŗ˛', 'áŗ˛'), + ('áŗē', 'áŗē'), + ('ę °', 'ę ĩ'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', '\u{119d7}'), + ('\u{119da}', 'đ§¤'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = + &[('áĻ', 'áĻĢ'), ('áĻ°', 'á§'), ('á§', 'á§'), ('á§', 'á§')]; + +pub const NEWA: &'static [(char, char)] = &[('đ', 'đ'), ('đ', '\u{11461}')]; + +pub const NKO: &'static [(char, char)] = &[('ß', 'ßē'), ('\u{7fd}', 'ßŋ')]; + +pub const NUSHU: &'static [(char, char)] = &[('đŋĄ', 'đŋĄ'), ('đ
°', 'đģ')]; + +pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = + &[('đ', 'đŦ'), ('\u{1e130}', 'đŊ'), ('đ
', 'đ
'), ('đ
', 'đ
')]; + +pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', 'á')]; + +pub const OL_CHIKI: &'static [(char, char)] = &[('áą', 'áąŋ')]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = + &[('đ˛', 'đ˛˛'), ('đŗ', 'đŗ˛'), ('đŗē', 'đŗŋ')]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[('đ', 'đŖ'), ('đ', 'đ¯')]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('đĒ', 'đĒ')]; + +pub const OLD_PERMIC: &'static [(char, char)] = + &[('\u{483}', '\u{483}'), ('đ', '\u{1037a}')]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[('đ ', 'đ'), ('đ', 'đ')]; + +pub const OLD_SOGDIAN: &'static [(char, char)] = &[('đŧ', 'đŧ§')]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('đŠ ', 'đŠŋ')]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[('đ°', 'đą')]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('\u{b01}', 'āŦ'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('\u{b3c}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ā', 'ā'), + ('ā', '\u{b63}'), + ('āĻ', 'āˇ'), + ('\u{1cda}', '\u{1cda}'), + ('áŗ˛', 'áŗ˛'), +]; + +pub const OSAGE: &'static [(char, char)] = &[('đ°', 'đ'), ('đ', 'đģ')]; + +pub const OSMANYA: &'static [(char, char)] = &[('đ', 'đ'), ('đ ', 'đŠ')]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('đŦ', 'đ
'), ('đ', 'đ'), ('đ', 'đĄ'), ('đŖ', 'đˇ'), ('đŊ', 'đŽ')]; + +pub const PALMYRENE: &'static [(char, char)] = &[('đĄ ', 'đĄŋ')]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[('đĢ', 'đĢ¸')]; + +pub const PHAGS_PA: &'static [(char, char)] = + &[('á ', 'á '), ('á
', 'á
'), ('ęĄ', 'ꥡ')]; + +pub const PHOENICIAN: &'static [(char, char)] = &[('đ¤', 'đ¤'), ('đ¤', 'đ¤')]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = + &[('Ų', 'Ų'), ('đŽ', 'đŽ'), ('đŽ', 'đŽ'), ('đŽŠ', 'đŽ¯')]; + +pub const REJANG: &'static [(char, char)] = &[('ꤰ', 'ęĨ'), ('ęĨ', 'ęĨ')]; + +pub const RUNIC: &'static [(char, char)] = &[('á ', 'áĒ'), ('áŽ', 'á¸')]; + +pub const SAMARITAN: &'static [(char, char)] = &[('ā ', '\u{82d}'), ('ā °', 'ā ž')]; + +pub const SAURASHTRA: &'static [(char, char)] = + &[('ęĸ', '\u{a8c5}'), ('ęŖ', 'ęŖ')]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('\u{951}', '\u{951}'), + ('\u{1cd7}', '\u{1cd7}'), + ('\u{1cd9}', '\u{1cd9}'), + ('\u{1cdc}', '\u{1cdd}'), + ('\u{1ce0}', '\u{1ce0}'), + ('\u{11180}', 'đ'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[('đ', 'đŋ')]; + +pub const SIDDHAM: &'static [(char, char)] = + &[('đ', '\u{115b5}'), ('đ¸', '\u{115dd}')]; + +pub const SIGNWRITING: &'static [(char, char)] = + &[('đ ', 'đĒ'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('āĨ¤', 'āĨĨ'), + ('\u{d81}', 'āļ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇĻ', 'āˇ¯'), + ('āˇ˛', 'āˇ´'), + ('đĄ', 'đ´'), +]; + +pub const SOGDIAN: &'static [(char, char)] = &[('Ų', 'Ų'), ('đŧ°', 'đŊ')]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[('đ', 'đ¨'), ('đ°', 'đš')]; + +pub const SOYOMBO: &'static [(char, char)] = &[('đŠ', 'đĒĸ')]; + +pub const SUNDANESE: &'static [(char, char)] = + &[('\u{1b80}', 'áŽŋ'), ('áŗ', 'áŗ')]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = + &[('āĨ¤', 'āĨĨ'), ('ā§Ļ', 'ā§¯'), ('ę ', '\u{a82c}')]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('Ø', 'Ø'), + ('Ø', '\u{61c}'), + ('Ø', 'Ø'), + ('Ų', 'Ų'), + ('\u{64b}', '\u{655}'), + ('\u{670}', '\u{670}'), + ('Ü', 'Ü'), + ('\u{70f}', '\u{74a}'), + ('Ũ', 'Ũ'), + ('āĄ ', 'āĄĒ'), + ('\u{1df8}', '\u{1df8}'), +]; + +pub const TAGALOG: &'static [(char, char)] = + &[('á', 'á'), ('á', '\u{1714}'), ('áĩ', 'áļ')]; + +pub const TAGBANWA: &'static [(char, char)] = + &[('áĩ', 'áļ'), ('á ', 'áŦ'), ('áŽ', 'á°'), ('\u{1772}', '\u{1773}')]; + +pub const TAI_LE: &'static [(char, char)] = + &[('á', 'á'), ('áĨ', 'áĨ'), ('áĨ°', 'áĨ´')]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', 'áĒ'), + ('áĒ', 'áĒ'), + ('áĒ ', 'áĒ'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[('ęĒ', 'ęĢ'), ('ęĢ', 'ęĢ')]; + +pub const TAKRI: &'static [(char, char)] = + &[('āĨ¤', 'āĨĨ'), ('ę °', 'ę š'), ('đ', 'đ¸'), ('đ', 'đ')]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('\u{b82}', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('ā¯', 'ā¯'), + ('\u{bd7}', '\u{bd7}'), + ('ā¯Ļ', 'ā¯ē'), + ('\u{1cda}', '\u{1cda}'), + ('ęŖŗ', 'ęŖŗ'), + ('\u{11301}', '\u{11301}'), + ('đ', 'đ'), + ('\u{1133b}', '\u{1133c}'), + ('đŋ', 'đŋą'), + ('đŋŋ', 'đŋŋ'), +]; + +pub const TANGUT: &'static [(char, char)] = &[ + ('đŋ ', 'đŋ '), + ('đ', 'đˇ'), + ('đ ', '\u{18aff}'), + ('\u{18d00}', '\u{18d08}'), +]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('\u{c00}', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('āą', 'āą'), + ('āą ', '\u{c63}'), + ('āąĻ', 'āą¯'), + ('āąˇ', 'āąŋ'), + ('\u{1cda}', '\u{1cda}'), + ('áŗ˛', 'áŗ˛'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('Ø', 'Ø'), + ('Ø', '\u{61c}'), + ('Ø', 'Ø'), + ('Ų ', 'ŲŠ'), + ('Ū', 'Ūą'), + ('īˇ˛', 'īˇ˛'), + ('īˇŊ', 'īˇŊ'), +]; + +pub const THAI: &'static [(char, char)] = &[('ā¸', '\u{e3a}'), ('āš', 'āš')]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('āŧ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('\u{f71}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('āžž', 'āŋ'), + ('āŋ', 'āŋ'), + ('āŋ', 'āŋ'), +]; + +pub const TIFINAGH: &'static [(char, char)] = + &[('â´°', 'âĩ§'), ('âĩ¯', 'âĩ°'), ('\u{2d7f}', '\u{2d7f}')]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('āĨ¤', 'āĨĨ'), + ('áŗ˛', 'áŗ˛'), + ('ę °', 'ę š'), + ('đ', 'đ'), + ('đ', 'đ'), +]; + +pub const UGARITIC: &'static [(char, char)] = &[('đ', 'đ'), ('đ', 'đ')]; + +pub const VAI: &'static [(char, char)] = &[('ę', 'ęĢ')]; + +pub const WANCHO: &'static [(char, char)] = &[('đ', 'đš'), ('đŋ', 'đŋ')]; + +pub const WARANG_CITI: &'static [(char, char)] = &[('đĸ ', 'đŖ˛'), ('đŖŋ', 'đŖŋ')]; + +pub const YEZIDI: &'static [(char, 
char)] = &[ + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ø', 'Ø'), + ('Ų ', 'ŲŠ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eab}', '\u{10ead}'), + ('\u{10eb0}', '\u{10eb1}'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãģ', 'ãģ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('īŊĄ', 'īŊĨ'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('đ¨', '\u{11a47}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/sentence_break.rs b/vendor/regex-syntax/src/unicode_tables/sentence_break.rs new file mode 100644 index 000000000..67d830f74 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/sentence_break.rs @@ -0,0 +1,2396 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate sentence-break ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ATerm", ATERM), + ("CR", CR), + ("Close", CLOSE), + ("Extend", EXTEND), + ("Format", FORMAT), + ("LF", LF), + ("Lower", LOWER), + ("Numeric", NUMERIC), + ("OLetter", OLETTER), + ("SContinue", SCONTINUE), + ("STerm", STERM), + ("Sep", SEP), + ("Sp", SP), + ("Upper", UPPER), +]; + +pub const ATERM: &'static [(char, char)] = + &[('.', '.'), ('â¤', 'â¤'), ('īš', 'īš'), ('īŧ', 'īŧ')]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const CLOSE: &'static [(char, char)] = &[ + ('\"', '\"'), + ('\'', ')'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('ÂĢ', 'ÂĢ'), + ('Âģ', 'Âģ'), + ('āŧē', 'āŧŊ'), + ('á', 'á'), + ('â', 'â'), + ('âš', 'âē'), + ('â
', 'â'), + ('âŊ', 'âž'), + ('â', 'â'), + ('â', 'â'), + ('âŠ', 'âĒ'), + ('â', 'â '), + ('â¨', 'âĩ'), + ('â
', 'â'), + ('âĻ', 'â¯'), + ('âĻ', 'âĻ'), + ('â§', 'â§'), + ('â§ŧ', 'â§Ŋ'), + ('â¸', 'â¸'), + ('â¸', 'â¸'), + ('⸠', '⸊'), + ('âš', 'âš'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ī´ž', 'ī´ŋ'), + ('ī¸', 'ī¸'), + ('ī¸ĩ', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧģ', 'īŧģ'), + ('īŧŊ', 'īŧŊ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ'), + ('īŊ', 'īŊ '), + ('īŊĸ', 'īŊŖ'), + ('đļ', 'đ¸'), +]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', 'ā¤'), + ('\u{93a}', '\u{93c}'), + ('ā¤ž', 'āĨ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'āĻ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ā¨'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('\u{abc}', '\u{abc}'), + ('āĒž', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'āŦ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', 
'\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3e}', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ā˛'), + ('\u{cbc}', '\u{cbc}'), + ('ā˛ž', 'āŗ'), + ('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', 'ā´'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'āļ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇŗ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŧŋ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('áĢ', '\u{103e}'), + ('á', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('áĸ', 'á¤'), + ('á§', 'á'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('áŠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', 'áŦ'), + ('\u{1b34}', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'áŽ'), + ('Ꭵ', '\u{1bad}'), + ('\u{1be6}', 'á¯ŗ'), + ('á°¤', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('áŗˇ', 
'\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ę Ŗ', 'ę §'), + ('\u{a82c}', '\u{a82c}'), + ('ęĸ', 'ęĸ'), + ('ęĸ´', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', 'ęĨ'), + ('\u{a980}', 'ęĻ'), + ('\u{a9b3}', 'ę§'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ęŠ'), + ('ęŠģ', 'ęŠŊ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ęĢĢ', 'ęĢ¯'), + ('ęĢĩ', '\u{aaf6}'), + ('ę¯Ŗ', 'ę¯Ē'), + ('ę¯Ŧ', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', '\u{10f50}'), + ('đ', 'đ'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', 'đ'), + ('đ°', '\u{110ba}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('đ
', 'đ
'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', 'đ'), + ('đŗ', 'đ'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111ce}', '\u{111cf}'), + ('đŦ', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', 'đ'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đĸ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đĩ', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('đ¸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('đ°', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('đ Ŧ', '\u{1183a}'), + ('\u{11930}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{1193e}'), + ('\u{11940}', '\u{11940}'), + ('\u{11942}', '\u{11943}'), + ('đ§', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('đ§¤', 'đ§¤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', 'đ¨š'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('đ°¯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('đļ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', '\u{11d97}'), + ('\u{11ef3}', 'đģļ'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('đŊ', 'đž'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200b}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{13438}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const LOWER: &'static [(char, char)] = &[ + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ãļ'), + ('ø', 'Ãŋ'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä
', 'Ä
'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('ÄĄ', 'ÄĄ'), + ('ÄŖ', 'ÄŖ'), + ('ÄĨ', 'ÄĨ'), + ('ħ', 'ħ'), + ('ÄŠ', 'ÄŠ'), + ('ÄĢ', 'ÄĢ'), + ('Ä', 'Ä'), + ('į', 'į'), + ('Äą', 'Äą'), + ('Äŗ', 'Äŗ'), + ('Äĩ', 'Äĩ'), + ('ġ', 'ĸ'), + ('Äē', 'Äē'), + ('Äŧ', 'Äŧ'), + ('Äž', 'Äž'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('ÅĄ', 'ÅĄ'), + ('ÅŖ', 'ÅŖ'), + ('ÅĨ', 'ÅĨ'), + ('ŧ', 'ŧ'), + ('ÅŠ', 'ÅŠ'), + ('ÅĢ', 'ÅĢ'), + ('Å', 'Å'), + ('ů', 'ů'), + ('Åą', 'Åą'), + ('Åŗ', 'Åŗ'), + ('Åĩ', 'Åĩ'), + ('Åˇ', 'Åˇ'), + ('Åē', 'Åē'), + ('Åŧ', 'Åŧ'), + ('Åž', 'Æ'), + ('Æ', 'Æ'), + ('Æ
', 'Æ
'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('ÆĄ', 'ÆĄ'), + ('ÆŖ', 'ÆŖ'), + ('ÆĨ', 'ÆĨ'), + ('ƨ', 'ƨ'), + ('ÆĒ', 'ÆĢ'), + ('Æ', 'Æ'), + ('Æ°', 'Æ°'), + ('Æ´', 'Æ´'), + ('Æļ', 'Æļ'), + ('Æš', 'Æē'), + ('ÆŊ', 'Æŋ'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('ĮĄ', 'ĮĄ'), + ('ĮŖ', 'ĮŖ'), + ('ĮĨ', 'ĮĨ'), + ('Į§', 'Į§'), + ('ĮŠ', 'ĮŠ'), + ('ĮĢ', 'ĮĢ'), + ('Į', 'Į'), + ('Į¯', 'Į°'), + ('Įŗ', 'Įŗ'), + ('Įĩ', 'Įĩ'), + ('Įš', 'Įš'), + ('Įģ', 'Įģ'), + ('ĮŊ', 'ĮŊ'), + ('Įŋ', 'Įŋ'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č
', 'Č
'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('ČĄ', 'ČĄ'), + ('ČŖ', 'ČŖ'), + ('ČĨ', 'ČĨ'), + ('ȧ', 'ȧ'), + ('ČŠ', 'ČŠ'), + ('ČĢ', 'ČĢ'), + ('Č', 'Č'), + ('Č¯', 'Č¯'), + ('Čą', 'Čą'), + ('Čŗ', 'Čš'), + ('Čŧ', 'Čŧ'), + ('Čŋ', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'Ę'), + ('Ę', 'ʸ'), + ('Ë', 'Ë'), + ('Ë ', 'ˤ'), + ('Íą', 'Íą'), + ('Íŗ', 'Íŗ'), + ('͡', '͡'), + ('Íē', 'ÍŊ'), + ('Î', 'Î'), + ('ÎŦ', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('ĪĄ', 'ĪĄ'), + ('ĪŖ', 'ĪŖ'), + ('ĪĨ', 'ĪĨ'), + ('Ī§', 'Ī§'), + ('ĪŠ', 'ĪŠ'), + ('ĪĢ', 'ĪĢ'), + ('Ī', 'Ī'), + ('Ī¯', 'Īŗ'), + ('Īĩ', 'Īĩ'), + ('Ī¸', 'Ī¸'), + ('Īģ', 'Īŧ'), + ('Đ°', 'Ņ'), + ('ŅĄ', 'ŅĄ'), + ('ŅŖ', 'ŅŖ'), + ('ŅĨ', 'ŅĨ'), + ('Ņ§', 'Ņ§'), + ('ŅŠ', 'ŅŠ'), + ('ŅĢ', 'ŅĢ'), + ('Ņ', 'Ņ'), + ('Ņ¯', 'Ņ¯'), + ('Ņą', 'Ņą'), + ('Ņŗ', 'Ņŗ'), + ('Ņĩ', 'Ņĩ'), + ('Ņˇ', 'Ņˇ'), + ('Ņš', 'Ņš'), + ('Ņģ', 'Ņģ'), + ('ŅŊ', 'ŅŊ'), + ('Ņŋ', 'Ņŋ'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('ŌĄ', 'ŌĄ'), + ('ŌŖ', 'ŌŖ'), + ('ŌĨ', 'ŌĨ'), + ('Ō§', 'Ō§'), + ('ŌŠ', 'ŌŠ'), + ('ŌĢ', 'ŌĢ'), + ('Ō', 'Ō'), + ('Ō¯', 'Ō¯'), + ('Ōą', 'Ōą'), + ('Ōŗ', 'Ōŗ'), + ('Ōĩ', 'Ōĩ'), + ('Ōˇ', 'Ōˇ'), + ('Ōš', 'Ōš'), + ('Ōģ', 'Ōģ'), + ('ŌŊ', 'ŌŊ'), + ('Ōŋ', 'Ōŋ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('ĶĄ', 'ĶĄ'), + ('ĶŖ', 'ĶŖ'), + ('ĶĨ', 'ĶĨ'), + ('Ķ§', 'Ķ§'), + ('ĶŠ', 'ĶŠ'), + ('ĶĢ', 'ĶĢ'), + ('Ķ', 'Ķ'), + ('Ķ¯', 'Ķ¯'), + ('Ķą', 'Ķą'), + ('Ķŗ', 'Ķŗ'), + ('Ķĩ', 'Ķĩ'), + ('Ķˇ', 'Ķˇ'), + ('Ķš', 'Ķš'), + ('Ķģ', 'Ķģ'), + ('ĶŊ', 'ĶŊ'), + ('Ķŋ', 
'Ķŋ'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô
', 'Ô
'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('ÔĄ', 'ÔĄ'), + ('ÔŖ', 'ÔŖ'), + ('ÔĨ', 'ÔĨ'), + ('Ô§', 'Ô§'), + ('ÔŠ', 'ÔŠ'), + ('ÔĢ', 'ÔĢ'), + ('Ô', 'Ô'), + ('Ô¯', 'Ô¯'), + ('Õ ', 'Ö'), + ('á¸', 'áŊ'), + ('á˛', 'á˛'), + ('á´', 'áļŋ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸
', 'á¸
'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('ḥ', 'ḥ'), + ('á¸Ŗ', 'á¸Ŗ'), + ('á¸Ĩ', 'á¸Ĩ'), + ('ḧ', 'ḧ'), + ('Ḋ', 'Ḋ'), + ('á¸Ģ', 'á¸Ģ'), + ('á¸', 'á¸'), + ('ḯ', 'ḯ'), + ('ḹ', 'ḹ'), + ('á¸ŗ', 'á¸ŗ'), + ('á¸ĩ', 'á¸ĩ'), + ('ḡ', 'ḡ'), + ('Ḛ', 'Ḛ'), + ('á¸ģ', 'á¸ģ'), + ('á¸Ŋ', 'á¸Ŋ'), + ('á¸ŋ', 'á¸ŋ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš
', 'áš
'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('ᚥ', 'ᚥ'), + ('ášŖ', 'ášŖ'), + ('ášĨ', 'ášĨ'), + ('ᚧ', 'ᚧ'), + ('ᚊ', 'ᚊ'), + ('ášĢ', 'ášĢ'), + ('áš', 'áš'), + ('ᚯ', 'ᚯ'), + ('ášą', 'ášą'), + ('ášŗ', 'ášŗ'), + ('ášĩ', 'ášĩ'), + ('ᚡ', 'ᚡ'), + ('ášš', 'ášš'), + ('ášģ', 'ášģ'), + ('ášŊ', 'ášŊ'), + ('ášŋ', 'ášŋ'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē
', 'áē
'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áēĄ', 'áēĄ'), + ('áēŖ', 'áēŖ'), + ('áēĨ', 'áēĨ'), + ('áē§', 'áē§'), + ('áēŠ', 'áēŠ'), + ('áēĢ', 'áēĢ'), + ('áē', 'áē'), + ('áē¯', 'áē¯'), + ('áēą', 'áēą'), + ('áēŗ', 'áēŗ'), + ('áēĩ', 'áēĩ'), + ('áēˇ', 'áēˇ'), + ('áēš', 'áēš'), + ('áēģ', 'áēģ'), + ('áēŊ', 'áēŊ'), + ('áēŋ', 'áēŋ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ
', 'áģ
'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģĄ', 'áģĄ'), + ('áģŖ', 'áģŖ'), + ('áģĨ', 'áģĨ'), + ('áģ§', 'áģ§'), + ('áģŠ', 'áģŠ'), + ('áģĢ', 'áģĢ'), + ('áģ', 'áģ'), + ('áģ¯', 'áģ¯'), + ('áģą', 'áģą'), + ('áģŗ', 'áģŗ'), + ('áģĩ', 'áģĩ'), + ('áģˇ', 'áģˇ'), + ('áģš', 'áģš'), + ('áģģ', 'áģģ'), + ('áģŊ', 'áģŊ'), + ('áģŋ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŧ§'), + ('áŧ°', 'áŧˇ'), + ('áŊ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ ', 'áŊ§'), + ('áŊ°', 'áŊŊ'), + ('áž', 'áž'), + ('áž', 'áž'), + ('áž ', 'ឧ'), + ('áž°', 'áž´'), + ('ážļ', 'ឡ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋ§'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋˇ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¯', 'â¯'), + ('â´', 'â´'), + ('âš', 'âš'), + ('âŧ', 'âŊ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
°', 'â
ŋ'), + ('â', 'â'), + ('â', 'âŠ'), + ('â°°', 'âą'), + ('⹥', '⹥'), + ('âąĨ', 'âąĻ'), + ('⹨', '⹨'), + ('âąĒ', 'âąĒ'), + ('âąŦ', 'âąŦ'), + ('âąą', 'âąą'), + ('âąŗ', 'âą´'), + ('âąļ', 'âąŊ'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛
', 'â˛
'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('ⲥ', 'ⲥ'), + ('â˛Ŗ', 'â˛Ŗ'), + ('â˛Ĩ', 'â˛Ĩ'), + ('ⲧ', 'ⲧ'), + ('Ⲋ', 'Ⲋ'), + ('â˛Ģ', 'â˛Ģ'), + ('â˛', 'â˛'), + ('â˛¯', 'â˛¯'), + ('ⲹ', 'ⲹ'), + ('â˛ŗ', 'â˛ŗ'), + ('â˛ĩ', 'â˛ĩ'), + ('ⲡ', 'ⲡ'), + ('Ⲛ', 'Ⲛ'), + ('â˛ģ', 'â˛ģ'), + ('â˛Ŋ', 'â˛Ŋ'), + ('â˛ŋ', 'â˛ŋ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ
', 'âŗ
'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗĄ', 'âŗĄ'), + ('âŗŖ', 'âŗ¤'), + ('âŗŦ', 'âŗŦ'), + ('âŗŽ', 'âŗŽ'), + ('âŗŗ', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ęą'), + ('ęŗ', 'ęŗ'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ', 'ęĢ'), + ('ę', 'ę'), + ('ę¯', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę
', 'ę
'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĄ', 'ęĄ'), + ('ęŖ', 'ęŖ'), + ('ęĨ', 'ęĨ'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ę¯', 'ę¯'), + ('ęĩ', 'ęĩ'), + ('ęˇ', 'ęˇ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęŊ'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('\u{a7c8}', '\u{a7c8}'), + ('\u{a7ca}', '\u{a7ca}'), + ('\u{a7f6}', '\u{a7f6}'), + ('ę¸', 'ęē'), + ('ęŦ°', 'ę'), + ('ę', '\u{ab68}'), + ('ę°', 'ęŽŋ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('īŊ', 'īŊ'), + ('đ¨', 'đ'), + ('đ', 'đģ'), + ('đŗ', 'đŗ˛'), + ('đŖ', 'đŖ'), + ('đš ', 'đšŋ'), + ('đ', 'đŗ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đ', 'đ'), + ('đļ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ'), + ('đĒ', 'đ'), + ('đ', 'đˇ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đē', 'đ'), + ('đŽ', 'đ'), + ('đĸ', 'đģ'), + ('đ', 'đ¯'), + ('đ', 'đĨ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŧ', 'đ'), + ('đ', 'đ'), + ('đļ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đ'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¤ĸ', 'đĨ'), +]; + +pub const NUMERIC: &'static [(char, char)] = &[ + ('0', '9'), + ('Ų ', 'ŲŠ'), + ('ŲĢ', 'ŲŦ'), + ('Û°', 'Ûš'), + ('ß', 'ß'), + ('āĨĻ', 'āĨ¯'), + ('ā§Ļ', 'ā§¯'), + ('āŠĻ', 'āŠ¯'), + ('āĢĻ', 'āĢ¯'), + ('āĻ', 'ā¯'), + ('ā¯Ļ', 'ā¯¯'), + ('āąĻ', 'āą¯'), + ('āŗĻ', 'āŗ¯'), + ('āĩĻ', 'āĩ¯'), + ('āˇĻ', 'āˇ¯'), + ('āš', 'āš'), + ('āģ', 'āģ'), + ('āŧ ', 'āŧŠ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áŠ'), + ('á ', 'á '), + ('áĨ', 'áĨ'), + ('á§', 'á§'), + ('áĒ', 'áĒ'), + ('áĒ', 'áĒ'), + ('á', 'á'), + ('Ꮀ', '᎚'), + ('áą', 'áą'), + ('áą', 'áą'), + ('ę ', 'ęŠ'), + ('ęŖ', 'ęŖ'), + ('ę¤', 'ę¤'), + ('ę§', 'ę§'), + ('꧰', '꧚'), + ('ęŠ', 'ęŠ'), + ('ę¯°', 'ę¯š'), + ('īŧ', 'īŧ'), + ('đ ', 'đŠ'), + ('đ´°', 'đ´š'), + ('đĻ', 'đ¯'), + ('đ°', 'đš'), + ('đļ', 'đŋ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đŖ ', 'đŖŠ'), + ('\u{11950}', '\u{11959}'), + ('đą', 'đą'), + ('đĩ', 'đĩ'), + ('đļ ', 'đļŠ'), + ('đŠ ', 'đŠŠ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('đ
', 'đ
'), + ('đ°', 'đš'), + ('đĨ', 'đĨ'), + ('\u{1fbf0}', '\u{1fbf9}'), +]; + +pub const OLETTER: &'static [(char, char)] = &[ + ('Æģ', 'Æģ'), + ('Į', 'Į'), + ('Ę', 'Ę'), + ('Ęš', 'Ęŋ'), + ('Ë', 'Ë'), + ('ËŦ', 'ËŦ'), + ('ËŽ', 'ËŽ'), + ('Í´', 'Í´'), + ('Õ', 'Õ'), + ('×', '×Ē'), + ('ׯ', '×ŗ'), + ('Ø ', 'Ų'), + ('ŲŽ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛĨ', 'ÛĻ'), + ('ÛŽ', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', 'ā '), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤Ŋ', 'ā¤Ŋ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ˛', 'āŠ´'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĒŊ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢš', 'āĢš'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āą', 'āą'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('ā¯', 'ā¯'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛Ŋ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗą', 'āŗ˛'), + ('\u{d04}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩē', 'āĩŋ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('ā¸', 'ā¸°'), + ('ā¸˛', 'ā¸ŗ'), + ('āš', 'āš'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āē'), + ('āē', 'āēŖ'), + ('āēĨ', 'āēĨ'), + ('āē§', 'āē°'), + ('āē˛', 'āēŗ'), + ('āēŊ', 'āēŊ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āģ', 'āģ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āž', 'āž'), + ('á', 'áĒ'), + ('áŋ', 'áŋ'), + ('á', 'á'), + ('á', 'á'), + ('áĄ', 'áĄ'), + ('áĨ', 'áĻ'), + ('áŽ', 'á°'), + ('áĩ', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á', 'áŗ'), + ('á', 'á'), + ('á', 'á'), + ('á ', '᥸'), + ('áĸ', 'áĸ'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('áĨ', 'áĨ'), + ('áĨ°', 'áĨ´'), + ('áĻ', 'áĻĢ'), + ('áĻ°', 'á§'), + ('á¨', 'á¨'), + ('ᨠ', 'áŠ'), + ('áĒ§', 'áĒ§'), + ('áŦ
', 'áŦŗ'), + ('á
', 'á'), + ('áŽ', 'Ꭰ'), + ('ᎎ', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('á°', 'á°Ŗ'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('âĩ', 'â¸'), + ('â', 'â'), + ('â
', 'â'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã'), + ('ãĄ', 'ãŠ'), + ('ãą', 'ãĩ'), + ('ã¸', 'ãŧ'), + ('ã', 'ã'), + ('ã', 'ã'), + ('ãĄ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ã°', 'ãŋ'), + ('ã', '\u{4dbf}'), + ('ä¸', '\u{9ffc}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ęŽ', 'ęŽ'), + ('ęŋ', 'ęŋ'), + ('ę ', 'ę¯'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęˇ', 'ęˇ'), + ('ęģ', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ĸ'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęĸŗ'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ę§', 'ę§'), + ('ę§ ', 'ꧤ'), + ('ę§Ļ', 'ę§¯'), + ('ę§ē', '꧞'), + ('ę¨', 'ꨨ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęŠ ', 'ęŠļ'), + ('ęŠē', 'ęŠē'), + ('ꊞ', 'ęĒ¯'), + ('ęĒą', 'ęĒą'), + ('ęĒĩ', 'ęĒļ'), + ('ęĒš', 'ęĒŊ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ', 'ęĢ'), + ('ęĢ ', 'ęĢĒ'), + ('ęĢ˛', 'ęĢ´'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('\u{ab69}', '\u{ab69}'), + ('ę¯', 'ę¯ĸ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('ī¤', 'īŠ'), + ('īŠ°', 'īĢ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŊĻ', 'īž'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ´', 'đ´Ŗ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đˇ'), + ('đ', 'đ¯'), + ('đ', 'đ¨'), + ('đ', 'đĻ'), + ('đ
', 'đ
'), + ('\u{11147}', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('đ', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ', 'đ´'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ'), + ('đ', 'đĒ'), + ('đ¸', 'đ¸'), + ('đ', 'đ'), + ('đ ', 'đ Ģ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§Ŗ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨ē', 'đ¨ē'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°Ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģ˛'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊ'), + ('đž', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đ', 'đˇ'), + ('đ ', '\u{18cd5}'), + ('\u{18d00}', '\u{18d08}'), + ('đ', 'đ'), + ('đ
', 'đ
'), + ('đ
¤', 'đ
§'), + ('đ
°', 'đģ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ ', '\u{2a6dd}'), + ('đĒ', 'đĢ´'), + ('đĢ', 'đĢ '), + ('đĢ ', 'đŦēĄ'), + ('đŦē°', 'đŽ¯ '), + ('đ¯ ', 'đ¯¨'), + ('\u{30000}', '\u{3134a}'), +]; + +pub const SCONTINUE: &'static [(char, char)] = &[ + (',', '-'), + (':', ':'), + ('Õ', 'Õ'), + ('Ø', 'Ø'), + ('߸', '߸'), + ('á ', 'á '), + ('á ', 'á '), + ('â', 'â'), + ('ã', 'ã'), + ('ī¸', 'ī¸'), + ('ī¸', 'ī¸'), + ('ī¸ą', 'ī¸˛'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īšŖ', 'īšŖ'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŊ¤', 'īŊ¤'), +]; + +pub const STERM: &'static [(char, char)] = &[ + ('!', '!'), + ('?', '?'), + ('Ö', 'Ö'), + ('Ø', 'Ø'), + ('Û', 'Û'), + ('Ü', 'Ü'), + ('ßš', 'ßš'), + ('ā ˇ', 'ā ˇ'), + ('ā š', 'ā š'), + ('ā Ŋ', 'ā ž'), + ('āĨ¤', 'āĨĨ'), + ('á', 'á'), + ('áĸ', 'áĸ'), + ('á§', 'á¨'), + ('áŽ', 'áŽ'), + ('áĩ', 'áļ'), + ('á ', 'á '), + ('á ', 'á '), + ('áĨ', 'áĨ
'), + ('áĒ¨', 'áĒĢ'), + ('á', 'á'), + ('á', 'á'), + ('á°ģ', 'á°ŧ'), + ('áąž', 'áąŋ'), + ('âŧ', 'âŊ'), + ('â', 'â'), + ('⸎', '⸎'), + ('â¸ŧ', 'â¸ŧ'), + ('ã', 'ã'), + ('ęŋ', 'ęŋ'), + ('ę', 'ę'), + ('ęŗ', 'ęŗ'), + ('ęˇ', 'ęˇ'), + ('ęĄļ', 'ꥡ'), + ('ęŖ', 'ęŖ'), + ('ę¤¯', 'ę¤¯'), + ('ę§', 'ę§'), + ('ęŠ', 'ęŠ'), + ('ęĢ°', 'ęĢą'), + ('ę¯Ģ', 'ę¯Ģ'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), + ('īŊĄ', 'īŊĄ'), + ('đŠ', 'đŠ'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đž', 'đ'), + ('đ
', 'đ
'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đš'), + ('đģ', 'đŧ'), + ('đŠ', 'đŠ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŧ', 'đž'), + ('\u{11944}', '\u{11944}'), + ('\u{11946}', '\u{11946}'), + ('đŠ', 'đŠ'), + ('đĒ', 'đĒ'), + ('đą', 'đą'), + ('đģˇ', 'đģ¸'), + ('đŠŽ', 'đŠ¯'), + ('đĢĩ', 'đĢĩ'), + ('đŦˇ', 'đŦ¸'), + ('đ', 'đ'), + ('đē', 'đē'), + ('đ˛', 'đ˛'), + ('đĒ', 'đĒ'), +]; + +pub const SEP: &'static [(char, char)] = + &[('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; + +pub const SP: &'static [(char, char)] = &[ + ('\t', '\t'), + ('\u{b}', '\u{c}'), + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const UPPER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('Ã', 'Ã'), + ('Ã', 'Ã'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä', 'Ä'), + ('Ä ', 'Ä '), + ('Äĸ', 'Äĸ'), + ('Ĥ', 'Ĥ'), + ('ÄĻ', 'ÄĻ'), + ('Ĩ', 'Ĩ'), + ('ÄĒ', 'ÄĒ'), + ('ÄŦ', 'ÄŦ'), + ('ÄŽ', 'ÄŽ'), + ('Ä°', 'Ä°'), + ('IJ', 'IJ'), + ('Ä´', 'Ä´'), + ('Äļ', 'Äļ'), + ('Äš', 'Äš'), + ('Äģ', 'Äģ'), + ('ÄŊ', 'ÄŊ'), + ('Äŋ', 'Äŋ'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å
', 'Å
'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å', 'Å'), + ('Å ', 'Å '), + ('Åĸ', 'Åĸ'), + ('Ť', 'Ť'), + ('ÅĻ', 'ÅĻ'), + ('Ũ', 'Ũ'), + ('ÅĒ', 'ÅĒ'), + ('ÅŦ', 'ÅŦ'), + ('ÅŽ', 'ÅŽ'), + ('Å°', 'Å°'), + ('Å˛', 'Å˛'), + ('Å´', 'Å´'), + ('Åļ', 'Åļ'), + ('Ÿ', 'Åš'), + ('Åģ', 'Åģ'), + ('ÅŊ', 'ÅŊ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ'), + ('Æ', 'Æ '), + ('Æĸ', 'Æĸ'), + ('Ƥ', 'Ƥ'), + ('ÆĻ', 'Ƨ'), + ('ÆŠ', 'ÆŠ'), + ('ÆŦ', 'ÆŦ'), + ('ÆŽ', 'Ư'), + ('Æą', 'Æŗ'), + ('Æĩ', 'Æĩ'), + ('Æˇ', 'Ƹ'), + ('Æŧ', 'Æŧ'), + ('Į', 'Į
'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į', 'Į'), + ('Į ', 'Į '), + ('Įĸ', 'Įĸ'), + ('Į¤', 'Į¤'), + ('ĮĻ', 'ĮĻ'), + ('Į¨', 'Į¨'), + ('ĮĒ', 'ĮĒ'), + ('ĮŦ', 'ĮŦ'), + ('ĮŽ', 'ĮŽ'), + ('Įą', 'Į˛'), + ('Į´', 'Į´'), + ('Įļ', 'Į¸'), + ('Įē', 'Įē'), + ('Įŧ', 'Įŧ'), + ('Įž', 'Įž'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č', 'Č'), + ('Č ', 'Č '), + ('Čĸ', 'Čĸ'), + ('Ȥ', 'Ȥ'), + ('ČĻ', 'ČĻ'), + ('Ȩ', 'Ȩ'), + ('ČĒ', 'ČĒ'), + ('ČŦ', 'ČŦ'), + ('ČŽ', 'ČŽ'), + ('Č°', 'Č°'), + ('Ȳ', 'Ȳ'), + ('Čē', 'Čģ'), + ('ČŊ', 'Čž'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('É', 'É'), + ('Í°', 'Í°'), + ('Ͳ', 'Ͳ'), + ('Íļ', 'Íļ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'ÎĢ'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī', 'Ī'), + ('Ī ', 'Ī '), + ('Īĸ', 'Īĸ'), + ('Ī¤', 'Ī¤'), + ('ĪĻ', 'ĪĻ'), + ('Ī¨', 'Ī¨'), + ('ĪĒ', 'ĪĒ'), + ('ĪŦ', 'ĪŦ'), + ('ĪŽ', 'ĪŽ'), + ('Ī´', 'Ī´'), + ('Īˇ', 'Īˇ'), + ('Īš', 'Īē'), + ('ĪŊ', 'Đ¯'), + ('Ņ ', 'Ņ '), + ('Ņĸ', 'Ņĸ'), + ('Ņ¤', 'Ņ¤'), + ('ŅĻ', 'ŅĻ'), + ('Ņ¨', 'Ņ¨'), + ('ŅĒ', 'ŅĒ'), + ('ŅŦ', 'ŅŦ'), + ('ŅŽ', 'ŅŽ'), + ('Ņ°', 'Ņ°'), + ('Ņ˛', 'Ņ˛'), + ('Ņ´', 'Ņ´'), + ('Ņļ', 'Ņļ'), + ('Ņ¸', 'Ņ¸'), + ('Ņē', 'Ņē'), + ('Ņŧ', 'Ņŧ'), + ('Ņž', 'Ņž'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō', 'Ō'), + ('Ō ', 'Ō '), + ('Ōĸ', 'Ōĸ'), + ('Ō¤', 'Ō¤'), + ('ŌĻ', 'ŌĻ'), + ('Ō¨', 'Ō¨'), + ('ŌĒ', 'ŌĒ'), + ('ŌŦ', 'ŌŦ'), + ('ŌŽ', 'ŌŽ'), + ('Ō°', 'Ō°'), + ('Ō˛', 'Ō˛'), + ('Ō´', 'Ō´'), + ('Ōļ', 'Ōļ'), + ('Ō¸', 'Ō¸'), + ('Ōē', 'Ōē'), + ('Ōŧ', 'Ōŧ'), + ('Ōž', 'Ōž'), + ('Ķ', 'Ķ'), + 
('Ķ', 'Ķ'), + ('Ķ
', 'Ķ
'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ', 'Ķ'), + ('Ķ ', 'Ķ '), + ('Ķĸ', 'Ķĸ'), + ('Ķ¤', 'Ķ¤'), + ('ĶĻ', 'ĶĻ'), + ('Ķ¨', 'Ķ¨'), + ('ĶĒ', 'ĶĒ'), + ('ĶŦ', 'ĶŦ'), + ('ĶŽ', 'ĶŽ'), + ('Ķ°', 'Ķ°'), + ('Ķ˛', 'Ķ˛'), + ('Ķ´', 'Ķ´'), + ('Ķļ', 'Ķļ'), + ('Ķ¸', 'Ķ¸'), + ('Ķē', 'Ķē'), + ('Ķŧ', 'Ķŧ'), + ('Ķž', 'Ķž'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô', 'Ô'), + ('Ô ', 'Ô '), + ('Ôĸ', 'Ôĸ'), + ('Ô¤', 'Ô¤'), + ('ÔĻ', 'ÔĻ'), + ('Ô¨', 'Ô¨'), + ('ÔĒ', 'ÔĒ'), + ('ÔŦ', 'ÔŦ'), + ('ÔŽ', 'ÔŽ'), + ('Ôą', 'Õ'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('á¸', 'á¸'), + ('Ḡ', 'Ḡ'), + ('á¸ĸ', 'á¸ĸ'), + ('Ḥ', 'Ḥ'), + ('á¸Ļ', 'á¸Ļ'), + ('Ḩ', 'Ḩ'), + ('á¸Ē', 'á¸Ē'), + ('á¸Ŧ', 'á¸Ŧ'), + ('Ḏ', 'Ḏ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('á¸ļ', 'á¸ļ'), + ('Ḹ', 'Ḹ'), + ('á¸ē', 'á¸ē'), + ('á¸ŧ', 'á¸ŧ'), + ('Ḟ', 'Ḟ'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš', 'áš'), + ('áš ', 'áš '), + ('ášĸ', 'ášĸ'), + ('ᚤ', 'ᚤ'), + ('ášĻ', 'ášĻ'), + ('ᚨ', 'ᚨ'), + ('ášĒ', 'ášĒ'), + ('ášŦ', 'ášŦ'), + ('ᚎ', 'ᚎ'), + ('áš°', 'áš°'), + ('ᚲ', 'ᚲ'), + ('áš´', 'áš´'), + ('ášļ', 'ášļ'), + ('ᚸ', 'ᚸ'), + ('ášē', 'ášē'), + ('ášŧ', 'ášŧ'), + ('ášž', 'ášž'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē', 'áē'), + ('áē ', 'áē '), + ('áēĸ', 'áēĸ'), + ('áē¤', 'áē¤'), + ('áēĻ', 'áēĻ'), + ('áē¨', 'áē¨'), + ('áēĒ', 'áēĒ'), + ('áēŦ', 'áēŦ'), + ('áēŽ', 'áēŽ'), + ('áē°', 'áē°'), + ('áē˛', 'áē˛'), + ('áē´', 'áē´'), + ('áēļ', 'áēļ'), + ('áē¸', 'áē¸'), + ('áēē', 'áēē'), + ('áēŧ', 'áēŧ'), + ('áēž', 'áēž'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ', 'áģ'), + ('áģ ', 'áģ '), + ('áģĸ', 'áģĸ'), + ('áģ¤', 'áģ¤'), + ('áģĻ', 'áģĻ'), + ('áģ¨', 'áģ¨'), + ('áģĒ', 'áģĒ'), + ('áģŦ', 'áģŦ'), + ('áģŽ', 'áģŽ'), + ('áģ°', 'áģ°'), + ('áģ˛', 'áģ˛'), + 
('áģ´', 'áģ´'), + ('áģļ', 'áģļ'), + ('áģ¸', 'áģ¸'), + ('áģē', 'áģē'), + ('áģŧ', 'áģŧ'), + ('áģž', 'áģž'), + ('áŧ', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ¨', 'áŧ¯'), + ('áŧ¸', 'áŧŋ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ¨', 'áŊ¯'), + ('áž', 'áž'), + ('áž', 'áž'), + ('ឨ', 'ឯ'), + ('ី', 'ážŧ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ¨', 'áŋŦ'), + ('áŋ¸', 'áŋŧ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â°', 'âŗ'), + ('âž', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
¯'), + ('â', 'â'), + ('âļ', 'â'), + ('â°', 'â°Ž'), + ('âą ', 'âą '), + ('âąĸ', '⹤'), + ('⹧', '⹧'), + ('⹊', '⹊'), + ('âąĢ', 'âąĢ'), + ('âą', 'âą°'), + ('⹲', '⹲'), + ('âąĩ', 'âąĩ'), + ('âąž', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('â˛', 'â˛'), + ('Ⲡ', 'Ⲡ'), + ('â˛ĸ', 'â˛ĸ'), + ('Ⲥ', 'Ⲥ'), + ('â˛Ļ', 'â˛Ļ'), + ('Ⲩ', 'Ⲩ'), + ('â˛Ē', 'â˛Ē'), + ('â˛Ŧ', 'â˛Ŧ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('â˛ļ', 'â˛ļ'), + ('Ⲹ', 'Ⲹ'), + ('â˛ē', 'â˛ē'), + ('â˛ŧ', 'â˛ŧ'), + ('Ⲟ', 'Ⲟ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ', 'âŗ'), + ('âŗ ', 'âŗ '), + ('âŗĸ', 'âŗĸ'), + ('âŗĢ', 'âŗĢ'), + ('âŗ', 'âŗ'), + ('âŗ˛', 'âŗ˛'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ę˛', 'ę˛'), + ('ę´', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), 
+ ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęĒ'), + ('ęŦ', 'ęŦ'), + ('ęŽ', 'ęŽ'), + ('ęš', 'ęš'), + ('ęģ', 'ęģ'), + ('ęŊ', 'ęž'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ę ', 'ę '), + ('ęĸ', 'ęĸ'), + ('ę¤', 'ę¤'), + ('ęĻ', 'ęĻ'), + ('ę¨', 'ę¨'), + ('ęĒ', 'ęŽ'), + ('ę°', 'ę´'), + ('ęļ', 'ęļ'), + ('ę¸', 'ę¸'), + ('ęē', 'ęē'), + ('ęŧ', 'ęŧ'), + ('ęž', 'ęž'), + ('ę', 'ę'), + ('ę', '\u{a7c7}'), + ('\u{a7c9}', '\u{a7c9}'), + ('\u{a7f5}', '\u{a7f5}'), + ('īŧĄ', 'īŧē'), + ('đ', 'đ§'), + ('đ°', 'đ'), + ('đ˛', 'đ˛˛'), + ('đĸ ', 'đĸŋ'), + ('đš', 'đš'), + ('đ', 'đ'), + ('đ´', 'đ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đĩ'), + ('đ', 'đŠ'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ¸', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đŦ', 'đ
'), + ('đ ', 'đš'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đŧ', 'đ'), + ('đ°', 'đ'), + ('đ¨', 'đ'), + ('đĸ', 'đē'), + ('đ', 'đ´'), + ('đ', 'đŽ'), + ('đ', 'đ¨'), + ('đ', 'đ'), + ('đ¤', 'đ¤Ą'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/word_break.rs b/vendor/regex-syntax/src/unicode_tables/word_break.rs new file mode 100644 index 000000000..bd23e00a8 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/word_break.rs @@ -0,0 +1,1060 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate word-break ucd-13.0.0 --chars +// +// Unicode version: 13.0.0. +// +// ucd-generate 0.2.8 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ALetter", ALETTER), + ("CR", CR), + ("Double_Quote", DOUBLE_QUOTE), + ("Extend", EXTEND), + ("ExtendNumLet", EXTENDNUMLET), + ("Format", FORMAT), + ("Hebrew_Letter", HEBREW_LETTER), + ("Katakana", KATAKANA), + ("LF", LF), + ("MidLetter", MIDLETTER), + ("MidNum", MIDNUM), + ("MidNumLet", MIDNUMLET), + ("Newline", NEWLINE), + ("Numeric", NUMERIC), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Single_Quote", SINGLE_QUOTE), + ("WSegSpace", WSEGSPACE), + ("ZWJ", ZWJ), +]; + +pub const ALETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ÂĒ', 'ÂĒ'), + ('Âĩ', 'Âĩ'), + ('Âē', 'Âē'), + ('Ã', 'Ã'), + ('Ã', 'Ãļ'), + ('ø', 'Ë'), + ('Ë', 'Ëŋ'), + ('Í°', 'Í´'), + ('Íļ', '͡'), + ('Íē', 'ÍŊ'), + ('Íŋ', 'Íŋ'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'Î'), + ('Î', 'ÎĄ'), + ('ÎŖ', 'Īĩ'), + ('Īˇ', 'Ō'), + ('Ō', 'Ô¯'), + ('Ôą', 'Õ'), + ('Õ', 'Õ'), + ('Õ', 'Õ'), + ('Õ ', 'Ö'), + ('Ö', 'Ö'), + ('×ŗ', '×ŗ'), + ('Ø ', 'Ų'), + ('ŲŽ', 'Ų¯'), + ('Ųą', 'Û'), + ('Û', 'Û'), + ('ÛĨ', 'ÛĻ'), + ('ÛŽ', 'Û¯'), + ('Ûē', 'Ûŧ'), + ('Ûŋ', 'Ûŋ'), + ('Ü', 'Ü'), + ('Ü', 'ܯ'), + ('Ũ', 'ŪĨ'), + ('Ūą', 'Ūą'), + ('ß', 'ßĒ'), + ('ß´', 'ßĩ'), + ('ßē', 'ßē'), + ('ā ', 'ā '), + ('ā ', 'ā '), + ('ā ¤', 'ā ¤'), + ('ā ¨', 'ā ¨'), + ('āĄ', 'āĄ'), + ('āĄ ', 'āĄĒ'), + ('āĸ ', 'āĸ´'), + ('āĸļ', '\u{8c7}'), + ('ā¤', 'ā¤š'), + ('ā¤Ŋ', 'ā¤Ŋ'), + ('āĨ', 'āĨ'), + ('āĨ', 'āĨĄ'), + ('āĨą', 'āĻ'), + ('āĻ
', 'āĻ'), + ('āĻ', 'āĻ'), + ('āĻ', 'āĻ¨'), + ('āĻĒ', 'āĻ°'), + ('āĻ˛', 'āĻ˛'), + ('āĻļ', 'āĻš'), + ('āĻŊ', 'āĻŊ'), + ('ā§', 'ā§'), + ('ā§', 'ā§'), + ('ā§', 'ā§Ą'), + ('ā§°', 'ā§ą'), + ('ā§ŧ', 'ā§ŧ'), + ('ā¨
', 'ā¨'), + ('ā¨', 'ā¨'), + ('ā¨', 'ā¨¨'), + ('ā¨Ē', 'ā¨°'), + ('ā¨˛', 'ā¨ŗ'), + ('ā¨ĩ', 'ā¨ļ'), + ('ā¨¸', 'ā¨š'), + ('āŠ', 'āŠ'), + ('āŠ', 'āŠ'), + ('āŠ˛', 'āŠ´'), + ('āĒ
', 'āĒ'), + ('āĒ', 'āĒ'), + ('āĒ', 'āĒ¨'), + ('āĒĒ', 'āĒ°'), + ('āĒ˛', 'āĒŗ'), + ('āĒĩ', 'āĒš'), + ('āĒŊ', 'āĒŊ'), + ('āĢ', 'āĢ'), + ('āĢ ', 'āĢĄ'), + ('āĢš', 'āĢš'), + ('āŦ
', 'āŦ'), + ('āŦ', 'āŦ'), + ('āŦ', 'āŦ¨'), + ('āŦĒ', 'āŦ°'), + ('āŦ˛', 'āŦŗ'), + ('āŦĩ', 'āŦš'), + ('āŦŊ', 'āŦŊ'), + ('ā', 'ā'), + ('ā', 'āĄ'), + ('āą', 'āą'), + ('āŽ', 'āŽ'), + ('āŽ
', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽ', 'āŽ'), + ('āŽŖ', 'āŽ¤'), + ('āŽ¨', 'āŽĒ'), + ('āŽŽ', 'āŽš'), + ('ā¯', 'ā¯'), + ('ā°
', 'ā°'), + ('ā°', 'ā°'), + ('ā°', 'ā°¨'), + ('ā°Ē', 'ā°š'), + ('ā°Ŋ', 'ā°Ŋ'), + ('āą', 'āą'), + ('āą ', 'āąĄ'), + ('ā˛', 'ā˛'), + ('ā˛
', 'ā˛'), + ('ā˛', 'ā˛'), + ('ā˛', 'ā˛¨'), + ('ā˛Ē', 'ā˛ŗ'), + ('ā˛ĩ', 'ā˛š'), + ('ā˛Ŋ', 'ā˛Ŋ'), + ('āŗ', 'āŗ'), + ('āŗ ', 'āŗĄ'), + ('āŗą', 'āŗ˛'), + ('\u{d04}', 'ā´'), + ('ā´', 'ā´'), + ('ā´', 'ā´ē'), + ('ā´Ŋ', 'ā´Ŋ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩ'), + ('āĩ', 'āĩĄ'), + ('āĩē', 'āĩŋ'), + ('āļ
', 'āļ'), + ('āļ', 'āļą'), + ('āļŗ', 'āļģ'), + ('āļŊ', 'āļŊ'), + ('āˇ', 'āˇ'), + ('āŧ', 'āŧ'), + ('āŊ', 'āŊ'), + ('āŊ', 'āŊŦ'), + ('āž', 'āž'), + ('á ', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'áē'), + ('áŧ', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'á'), + ('á', 'á'), + ('á', 'á°'), + ('á˛', 'áĩ'), + ('á¸', 'áž'), + ('á', 'á'), + ('á', 'á
'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áĩ'), + ('á¸', 'áŊ'), + ('á', 'áŦ'), + ('á¯', 'áŋ'), + ('á', 'á'), + ('á ', 'áĒ'), + ('áŽ', 'á¸'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áą'), + ('á', 'á'), + ('á ', 'áŦ'), + ('áŽ', 'á°'), + ('á ', '᥸'), + ('áĸ', 'áĸ'), + ('áĸ', 'áĸ¨'), + ('áĸĒ', 'áĸĒ'), + ('áĸ°', 'áŖĩ'), + ('á¤', 'á¤'), + ('á¨', 'á¨'), + ('áŦ
', 'áŦŗ'), + ('á
', 'á'), + ('áŽ', 'Ꭰ'), + ('ᎎ', 'Ꭿ'), + ('áŽē', 'á¯Ĩ'), + ('á°', 'á°Ŗ'), + ('áą', 'áą'), + ('áą', 'áąŊ'), + ('á˛', 'á˛'), + ('á˛', 'á˛ē'), + ('á˛Ŋ', 'á˛ŋ'), + ('áŗŠ', 'áŗŦ'), + ('áŗŽ', 'áŗŗ'), + ('áŗĩ', 'áŗļ'), + ('áŗē', 'áŗē'), + ('á´', 'áļŋ'), + ('á¸', 'áŧ'), + ('áŧ', 'áŧ'), + ('áŧ ', 'áŊ
'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊ'), + ('áŊ', 'áŊŊ'), + ('áž', 'áž´'), + ('ážļ', 'ážŧ'), + ('ážž', 'ážž'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ', 'áŋ'), + ('áŋ ', 'áŋŦ'), + ('áŋ˛', 'áŋ´'), + ('áŋļ', 'áŋŧ'), + ('âą', 'âą'), + ('âŋ', 'âŋ'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â', 'â'), + ('â¤', 'â¤'), + ('âĻ', 'âĻ'), + ('â¨', 'â¨'), + ('âĒ', 'â'), + ('â¯', 'âš'), + ('âŧ', 'âŋ'), + ('â
', 'â
'), + ('â
', 'â
'), + ('â
', 'â'), + ('âļ', 'âŠ'), + ('â°', 'â°Ž'), + ('â°°', 'âą'), + ('âą ', 'âŗ¤'), + ('âŗĢ', 'âŗŽ'), + ('âŗ˛', 'âŗŗ'), + ('â´', 'â´Ĩ'), + ('â´§', 'â´§'), + ('â´', 'â´'), + ('â´°', 'âĩ§'), + ('âĩ¯', 'âĩ¯'), + ('âļ', 'âļ'), + ('âļ ', 'âļĻ'), + ('âļ¨', 'âļŽ'), + ('âļ°', 'âļļ'), + ('âļ¸', 'âļž'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('âˇ', 'âˇ'), + ('ⸯ', 'ⸯ'), + ('ã
', 'ã
'), + ('ãģ', 'ãŧ'), + ('ã
', 'ã¯'), + ('ãą', 'ã'), + ('ã ', '\u{31bf}'), + ('ę', 'ę'), + ('ę', 'ęŊ'), + ('ę', 'ę'), + ('ę', 'ę'), + ('ęĒ', 'ęĢ'), + ('ę', 'ęŽ'), + ('ęŋ', 'ę'), + ('ę ', 'ę¯'), + ('ę', 'ęŋ'), + ('ę', '\u{a7ca}'), + ('\u{a7f5}', 'ę '), + ('ę ', 'ę
'), + ('ę ', 'ę '), + ('ę ', 'ę ĸ'), + ('ęĄ', 'ęĄŗ'), + ('ęĸ', 'ęĸŗ'), + ('ęŖ˛', 'ęŖˇ'), + ('ęŖģ', 'ęŖģ'), + ('ęŖŊ', 'ęŖž'), + ('ę¤', 'ę¤Ĩ'), + ('ꤰ', 'ęĨ'), + ('ęĨ ', 'ęĨŧ'), + ('ęĻ', 'ęĻ˛'), + ('ę§', 'ę§'), + ('ę¨', 'ꨨ'), + ('ęŠ', 'ęŠ'), + ('ęŠ', 'ęŠ'), + ('ęĢ ', 'ęĢĒ'), + ('ęĢ˛', 'ęĢ´'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ', 'ęŦ'), + ('ęŦ ', 'ęŦĻ'), + ('ęŦ¨', 'ęŦŽ'), + ('ęŦ°', '\u{ab69}'), + ('ę°', 'ę¯ĸ'), + ('ę°', 'íŖ'), + ('í°', 'í'), + ('í', 'íģ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ'), + ('ī', 'īŽą'), + ('ī¯', 'ī´Ŋ'), + ('īĩ', 'īļ'), + ('īļ', 'īˇ'), + ('īˇ°', 'īˇģ'), + ('īš°', 'īš´'), + ('īšļ', 'īģŧ'), + ('īŧĄ', 'īŧē'), + ('īŊ', 'īŊ'), + ('īž ', 'īžž'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('īŋ', 'īŋ'), + ('đ', 'đ'), + ('đ', 'đĻ'), + ('đ¨', 'đē'), + ('đŧ', 'đŊ'), + ('đŋ', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đ
', 'đ
´'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĩ'), + ('đ', 'đ'), + ('đ ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đ'), + ('đ', 'đģ'), + ('đ', 'đ§'), + ('đ°', 'đŖ'), + ('đ', 'đļ'), + ('đ', 'đ'), + ('đ ', 'đ§'), + ('đ ', 'đ
'), + ('đ ', 'đ '), + ('đ ', 'đ ĩ'), + ('đ ˇ', 'đ ¸'), + ('đ ŧ', 'đ ŧ'), + ('đ ŋ', 'đĄ'), + ('đĄ ', 'đĄļ'), + ('đĸ', 'đĸ'), + ('đŖ ', 'đŖ˛'), + ('đŖ´', 'đŖĩ'), + ('đ¤', 'đ¤'), + ('đ¤ ', 'đ¤š'), + ('đĻ', 'đĻˇ'), + ('đĻž', 'đĻŋ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨ĩ'), + ('đŠ ', 'đŠŧ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ'), + ('đĢ', 'đĢ¤'), + ('đŦ', 'đŦĩ'), + ('đ', 'đ'), + ('đ ', 'đ˛'), + ('đŽ', 'đŽ'), + ('đ°', 'đą'), + ('đ˛', 'đ˛˛'), + ('đŗ', 'đŗ˛'), + ('đ´', 'đ´Ŗ'), + ('\u{10e80}', '\u{10ea9}'), + ('\u{10eb0}', '\u{10eb1}'), + ('đŧ', 'đŧ'), + ('đŧ§', 'đŧ§'), + ('đŧ°', 'đŊ
'), + ('\u{10fb0}', '\u{10fc4}'), + ('đŋ ', 'đŋļ'), + ('đ', 'đˇ'), + ('đ', 'đ¯'), + ('đ', 'đ¨'), + ('đ', 'đĻ'), + ('đ
', 'đ
'), + ('\u{11147}', '\u{11147}'), + ('đ
', 'đ
˛'), + ('đ
ļ', 'đ
ļ'), + ('đ', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĢ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đ°', 'đ'), + ('đ
', 'đ'), + ('đ', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ°'), + ('đ˛', 'đŗ'), + ('đĩ', 'đš'), + ('đŊ', 'đŊ'), + ('đ', 'đ'), + ('đ', 'đĄ'), + ('đ', 'đ´'), + ('đ', 'đ'), + ('đ', '\u{11461}'), + ('đ', 'đ¯'), + ('đ', 'đ
'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đ¯'), + ('đ', 'đ'), + ('đ', 'đĒ'), + ('đ¸', 'đ¸'), + ('đ ', 'đ Ģ'), + ('đĸ ', 'đŖ'), + ('đŖŋ', '\u{11906}'), + ('\u{11909}', '\u{11909}'), + ('\u{1190c}', '\u{11913}'), + ('\u{11915}', '\u{11916}'), + ('\u{11918}', '\u{1192f}'), + ('\u{1193f}', '\u{1193f}'), + ('\u{11941}', '\u{11941}'), + ('đĻ ', 'đĻ§'), + ('đĻĒ', 'đ§'), + ('đ§Ą', 'đ§Ą'), + ('đ§Ŗ', 'đ§Ŗ'), + ('đ¨', 'đ¨'), + ('đ¨', 'đ¨˛'), + ('đ¨ē', 'đ¨ē'), + ('đŠ', 'đŠ'), + ('đŠ', 'đĒ'), + ('đĒ', 'đĒ'), + ('đĢ', 'đĢ¸'), + ('đ°', 'đ°'), + ('đ°', 'đ°Ž'), + ('đą', 'đą'), + ('đą˛', 'đ˛'), + ('đ´', 'đ´'), + ('đ´', 'đ´'), + ('đ´', 'đ´°'), + ('đĩ', 'đĩ'), + ('đĩ ', 'đĩĨ'), + ('đĩ§', 'đĩ¨'), + ('đĩĒ', 'đļ'), + ('đļ', 'đļ'), + ('đģ ', 'đģ˛'), + ('\u{11fb0}', '\u{11fb0}'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ', 'đŽ'), + ('đ', 'đ'), + ('đ ', 'đ¨¸'), + ('đŠ', 'đŠ'), + ('đĢ', 'đĢ'), + ('đŦ', 'đŦ¯'), + ('đ', 'đ'), + ('đŖ', 'đˇ'), + ('đŊ', 'đŽ'), + ('đš', 'đšŋ'), + ('đŧ', 'đŊ'), + ('đŊ', 'đŊ'), + ('đž', 'đž'), + ('đŋ ', 'đŋĄ'), + ('đŋŖ', 'đŋŖ'), + ('đ°', 'đąĒ'), + ('đą°', 'đąŧ'), + ('đ˛', 'đ˛'), + ('đ˛', 'đ˛'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đĸ', 'đĸ'), + ('đĨ', 'đĻ'), + ('đŠ', 'đŦ'), + ('đŽ', 'đš'), + ('đģ', 'đģ'), + ('đŊ', 'đ'), + ('đ
', 'đ
'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đš'), + ('đģ', 'đž'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đĨ'), + ('đ¨', 'đ'), + ('đ', 'đ'), + ('đ', 'đē'), + ('đŧ', 'đ'), + ('đ', 'đ´'), + ('đļ', 'đ'), + ('đ', 'đŽ'), + ('đ°', 'đ'), + ('đ', 'đ¨'), + ('đĒ', 'đ'), + ('đ', 'đ'), + ('đ', 'đŦ'), + ('đˇ', 'đŊ'), + ('đ
', 'đ
'), + ('đ', 'đĢ'), + ('đ ', 'đŖ'), + ('đ¤', 'đĨ'), + ('đĨ', 'đĨ'), + ('đ¸', 'đ¸'), + ('đ¸
', 'đ¸'), + ('đ¸Ą', 'đ¸ĸ'), + ('đ¸¤', 'đ¸¤'), + ('đ¸§', 'đ¸§'), + ('đ¸Š', 'đ¸˛'), + ('đ¸´', 'đ¸ˇ'), + ('đ¸š', 'đ¸š'), + ('đ¸ģ', 'đ¸ģ'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đš', 'đš'), + ('đšĄ', 'đšĸ'), + ('đš¤', 'đš¤'), + ('đš§', 'đšĒ'), + ('đšŦ', 'đš˛'), + ('đš´', 'đšˇ'), + ('đšš', 'đšŧ'), + ('đšž', 'đšž'), + ('đē', 'đē'), + ('đē', 'đē'), + ('đēĄ', 'đēŖ'), + ('đēĨ', 'đēŠ'), + ('đēĢ', 'đēģ'), + ('đ°', 'đ
'), + ('đ
', 'đ
Š'), + ('đ
°', 'đ'), +]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const DOUBLE_QUOTE: &'static [(char, char)] = &[('\"', '\"')]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{8d3}', '\u{8e1}'), + ('\u{8e3}', 'ā¤'), + ('\u{93a}', '\u{93c}'), + ('ā¤ž', 'āĨ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'āĻ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ā§', 'ā§'), + ('ā§', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ā¨'), + ('\u{a3c}', '\u{a3c}'), + ('ā¨ž', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'āĒ'), + ('\u{abc}', '\u{abc}'), + ('āĒž', '\u{ac5}'), + ('\u{ac7}', 'āĢ'), + ('āĢ', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'āŦ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('ā', 'ā'), + ('ā', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ā¯'), + ('ā¯', 'ā¯'), + ('ā¯', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3e}', 'āą'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ā˛'), + ('\u{cbc}', '\u{cbc}'), + ('ā˛ž', 'āŗ'), + 
('\u{cc6}', 'āŗ'), + ('āŗ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', 'ā´'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('āĩ', 'āĩ'), + ('āĩ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'āļ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('āˇ', '\u{ddf}'), + ('āˇ˛', 'āˇŗ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ecd}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('āŧž', 'āŧŋ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('áĢ', '\u{103e}'), + ('á', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('áĸ', 'á¤'), + ('á§', 'á'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('á', 'á'), + ('á', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1734}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'á¤Ģ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('áŠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ac0}'), + ('\u{1b00}', 'áŦ'), + ('\u{1b34}', 'á'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'áŽ'), + ('Ꭵ', '\u{1bad}'), + ('\u{1be6}', 'á¯ŗ'), + ('á°¤', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('áŗˇ', '\u{1cf9}'), + ('\u{1dc0}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + 
('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ę Ŗ', 'ę §'), + ('\u{a82c}', '\u{a82c}'), + ('ęĸ', 'ęĸ'), + ('ęĸ´', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', 'ęĨ'), + ('\u{a980}', 'ęĻ'), + ('\u{a9b3}', 'ę§'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ęŠ'), + ('ęŠģ', 'ęŠŊ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ęĢĢ', 'ęĢ¯'), + ('ęĢĩ', '\u{aaf6}'), + ('ę¯Ŗ', 'ę¯Ē'), + ('ę¯Ŧ', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10f46}', '\u{10f50}'), + ('đ', 'đ'), + ('\u{11038}', '\u{11046}'), + ('\u{1107f}', 'đ'), + ('đ°', '\u{110ba}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('đ
', 'đ
'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', 'đ'), + ('đŗ', 'đ'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111ce}', '\u{111cf}'), + ('đŦ', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', 'đ'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('\u{11357}', '\u{11357}'), + ('đĸ', 'đŖ'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('đĩ', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('đ¸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('đ°', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('đ Ŧ', '\u{1183a}'), + ('\u{11930}', '\u{11935}'), + ('\u{11937}', '\u{11938}'), + ('\u{1193b}', '\u{1193e}'), + ('\u{11940}', '\u{11940}'), + ('\u{11942}', '\u{11943}'), + ('đ§', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('đ§¤', 'đ§¤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', 'đ¨š'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('đ°¯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('đ˛Š', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('đļ', 'đļ'), + ('\u{11d90}', '\u{11d91}'), + ('đļ', '\u{11d97}'), + ('\u{11ef3}', 'đģļ'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('đŊ', 'đž'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{16ff0}', '\u{16ff1}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1d165}', '\u{1d169}'), + ('đ
', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('đģ', 'đŋ'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const EXTENDNUMLET: &'static [(char, char)] = &[ + ('_', '_'), + ('\u{202f}', '\u{202f}'), + ('âŋ', 'â'), + ('â', 'â'), + ('ī¸ŗ', 'ī¸´'), + ('īš', 'īš'), + ('īŧŋ', 'īŧŋ'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{13438}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const HEBREW_LETTER: &'static [(char, char)] = &[ + ('×', '×Ē'), + ('ׯ', 'ײ'), + ('īŦ', 'īŦ'), + ('īŦ', 'īŦ¨'), + ('īŦĒ', 'īŦļ'), + ('īŦ¸', 'īŦŧ'), + ('īŦž', 'īŦž'), + ('ī', 'ī'), + ('ī', 'ī'), + ('ī', 'ī'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('ãą', 'ãĩ'), + ('ã', 'ã'), + ('ã ', 'ãē'), + ('ãŧ', 'ãŋ'), + ('ã°', 'ãŋ'), + ('ã', 'ãž'), + ('ã', 'ã'), + ('īŊĻ', 'īž'), + ('đ', 'đ'), + ('đ
¤', 'đ
§'), +]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const MIDLETTER: &'static [(char, char)] = &[ + (':', ':'), + ('¡', '¡'), + ('Î', 'Î'), + ('Õ', 'Õ'), + ('×´', '×´'), + ('â§', 'â§'), + ('ī¸', 'ī¸'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), +]; + +pub const MIDNUM: &'static [(char, char)] = &[ + (',', ','), + (';', ';'), + ('Íž', 'Íž'), + ('Ö', 'Ö'), + ('Ø', 'Ø'), + ('ŲŦ', 'ŲŦ'), + ('߸', '߸'), + ('â', 'â'), + ('ī¸', 'ī¸'), + ('ī¸', 'ī¸'), + ('īš', 'īš'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), +]; + +pub const MIDNUMLET: &'static [(char, char)] = &[ + ('.', '.'), + ('â', 'â'), + ('â¤', 'â¤'), + ('īš', 'īš'), + ('īŧ', 'īŧ'), + ('īŧ', 'īŧ'), +]; + +pub const NEWLINE: &'static [(char, char)] = + &[('\u{b}', '\u{c}'), ('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; + +pub const NUMERIC: &'static [(char, char)] = &[ + ('0', '9'), + ('Ų ', 'ŲŠ'), + ('ŲĢ', 'ŲĢ'), + ('Û°', 'Ûš'), + ('ß', 'ß'), + ('āĨĻ', 'āĨ¯'), + ('ā§Ļ', 'ā§¯'), + ('āŠĻ', 'āŠ¯'), + ('āĢĻ', 'āĢ¯'), + ('āĻ', 'ā¯'), + ('ā¯Ļ', 'ā¯¯'), + ('āąĻ', 'āą¯'), + ('āŗĻ', 'āŗ¯'), + ('āĩĻ', 'āĩ¯'), + ('āˇĻ', 'āˇ¯'), + ('āš', 'āš'), + ('āģ', 'āģ'), + ('āŧ ', 'āŧŠ'), + ('á', 'á'), + ('á', 'á'), + ('á ', 'áŠ'), + ('á ', 'á '), + ('áĨ', 'áĨ'), + ('á§', 'á§'), + ('áĒ', 'áĒ'), + ('áĒ', 'áĒ'), + ('á', 'á'), + ('Ꮀ', '᎚'), + ('áą', 'áą'), + ('áą', 'áą'), + ('ę ', 'ęŠ'), + ('ęŖ', 'ęŖ'), + ('ę¤', 'ę¤'), + ('ę§', 'ę§'), + ('꧰', '꧚'), + ('ęŠ', 'ęŠ'), + ('ę¯°', 'ę¯š'), + ('īŧ', 'īŧ'), + ('đ ', 'đŠ'), + ('đ´°', 'đ´š'), + ('đĻ', 'đ¯'), + ('đ°', 'đš'), + ('đļ', 'đŋ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ', 'đ'), + ('đ°', 'đš'), + ('đŖ ', 'đŖŠ'), + ('\u{11950}', '\u{11959}'), + ('đą', 'đą'), + ('đĩ', 'đĩ'), + ('đļ ', 'đļŠ'), + ('đŠ ', 'đŠŠ'), + ('đ', 'đ'), + ('đ', 'đŋ'), + ('đ
', 'đ
'), + ('đ°', 'đš'), + ('đĨ', 'đĨ'), + ('\u{1fbf0}', '\u{1fbf9}'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('đĻ', 'đŋ')]; + +pub const SINGLE_QUOTE: &'static [(char, char)] = &[('\'', '\'')]; + +pub const WSEGSPACE: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{2006}'), + ('\u{2008}', '\u{200a}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; diff --git a/vendor/regex-syntax/src/utf8.rs b/vendor/regex-syntax/src/utf8.rs new file mode 100644 index 000000000..dc055033e --- /dev/null +++ b/vendor/regex-syntax/src/utf8.rs @@ -0,0 +1,587 @@ +/*! +Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. + +This is sub-module is useful for constructing byte based automatons that need +to embed UTF-8 decoding. The most common use of this module is in conjunction +with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type. + +See the documentation on the `Utf8Sequences` iterator for more details and +an example. + +# Wait, what is this? + +This is simplest to explain with an example. Let's say you wanted to test +whether a particular byte sequence was a Cyrillic character. One possible +scalar value range is `[0400-04FF]`. The set of allowed bytes for this +range can be expressed as a sequence of byte ranges: + +```text +[D0-D3][80-BF] +``` + +This is simple enough: simply encode the boundaries, `0400` encodes to +`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each +corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. + +However, what if you wanted to add the Cyrillic Supplementary characters to +your range? Your range might then become `[0400-052F]`. The same procedure +as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges +you'd get from the previous transformation would be `[D0-D4][80-AF]`. 
However, +this isn't quite correct because this range doesn't capture many characters, +for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). + +Instead, you need multiple sequences of byte ranges: + +```text +[D0-D3][80-BF] # matches codepoints 0400-04FF +[D4][80-AF] # matches codepoints 0500-052F +``` + +This gets even more complicated if you want bigger ranges, particularly if +they naively contain surrogate codepoints. For example, the sequence of byte +ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: + +```text +[0-7F] +[C2-DF][80-BF] +[E0][A0-BF][80-BF] +[E1-EC][80-BF][80-BF] +[ED][80-9F][80-BF] +[EE-EF][80-BF][80-BF] +``` + +Note that the byte ranges above will *not* match any erroneous encoding of +UTF-8, including encodings of surrogate codepoints. + +And, of course, for all of Unicode (`[000000-10FFFF]`): + +```text +[0-7F] +[C2-DF][80-BF] +[E0][A0-BF][80-BF] +[E1-EC][80-BF][80-BF] +[ED][80-9F][80-BF] +[EE-EF][80-BF][80-BF] +[F0][90-BF][80-BF][80-BF] +[F1-F3][80-BF][80-BF][80-BF] +[F4][80-8F][80-BF][80-BF] +``` + +This module automates the process of creating these byte ranges from ranges of +Unicode scalar values. + +# Lineage + +I got the idea and general implementation strategy from Russ Cox in his +[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. +Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). +I also got the idea from +[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), +which uses it for executing automata on their term index. +*/ + +#![deny(missing_docs)] + +use std::char; +use std::fmt; +use std::iter::FusedIterator; +use std::slice; + +const MAX_UTF8_BYTES: usize = 4; + +/// Utf8Sequence represents a sequence of byte ranges. 
///
/// To match a Utf8Sequence, a candidate byte sequence must match each
/// successive range.
///
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Utf8Sequence {
    /// One byte range.
    One(Utf8Range),
    /// Two successive byte ranges.
    Two([Utf8Range; 2]),
    /// Three successive byte ranges.
    Three([Utf8Range; 3]),
    /// Four successive byte ranges.
    Four([Utf8Range; 4]),
}

impl Utf8Sequence {
    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
    /// range.
    ///
    /// The `i`-th byte range of the result spans from `start[i]` to `end[i]`.
    ///
    /// # Panics
    ///
    /// Panics if `start` and `end` have different lengths, or if that length
    /// is not 2, 3 or 4. Single byte (ASCII) ranges are never routed through
    /// this constructor; the iterator builds `Utf8Sequence::One` directly.
    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
        assert_eq!(start.len(), end.len());
        // Pair up the bytes found at the same position in both encodings.
        let range = |i: usize| Utf8Range::new(start[i], end[i]);
        match start.len() {
            2 => Utf8Sequence::Two([range(0), range(1)]),
            3 => Utf8Sequence::Three([range(0), range(1), range(2)]),
            4 => Utf8Sequence::Four([range(0), range(1), range(2), range(3)]),
            n => unreachable!("invalid encoded length: {}", n),
        }
    }

    /// Returns the underlying sequence of byte ranges as a slice.
    pub fn as_slice(&self) -> &[Utf8Range] {
        use self::Utf8Sequence::*;
        match *self {
            One(ref r) => slice::from_ref(r),
            Two(ref r) => &r[..],
            Three(ref r) => &r[..],
            Four(ref r) => &r[..],
        }
    }

    /// Returns the number of byte ranges in this sequence.
    ///
    /// The length is guaranteed to be in the closed interval `[1, 4]`.
    pub fn len(&self) -> usize {
        self.as_slice().len()
    }

    /// Reverses the ranges in this sequence.
    ///
    /// For example, if this corresponds to the following sequence:
    ///
    /// ```text
    /// [D0-D3][80-BF]
    /// ```
    ///
    /// Then after reversal, it will be
    ///
    /// ```text
    /// [80-BF][D0-D3]
    /// ```
    ///
    /// This is useful when one is constructing a UTF-8 automaton to match
    /// character classes in reverse.
    pub fn reverse(&mut self) {
        match *self {
            // A single range is its own reversal.
            Utf8Sequence::One(_) => {}
            Utf8Sequence::Two(ref mut x) => x.reverse(),
            Utf8Sequence::Three(ref mut x) => x.reverse(),
            Utf8Sequence::Four(ref mut x) => x.reverse(),
        }
    }

    /// Returns true if and only if a prefix of `bytes` matches this sequence
    /// of byte ranges.
    ///
    /// `bytes` may be longer than the sequence; any bytes beyond the length
    /// of the sequence are ignored. If `bytes` is shorter than the sequence,
    /// this returns false.
    pub fn matches(&self, bytes: &[u8]) -> bool {
        let ranges = self.as_slice();
        // `zip` stops after the last range, so only the prefix is inspected.
        bytes.len() >= ranges.len()
            && ranges.iter().zip(bytes).all(|(r, &b)| r.matches(b))
    }
}

impl<'a> IntoIterator for &'a Utf8Sequence {
    type IntoIter = slice::Iter<'a, Utf8Range>;
    type Item = &'a Utf8Range;

    /// Iterates over the byte ranges of this sequence in order.
    fn into_iter(self) -> Self::IntoIter {
        self.as_slice().iter()
    }
}

impl fmt::Debug for Utf8Sequence {
    /// Writes each byte range back to back, e.g., `[D0-D3][80-BF]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for r in self {
            write!(f, "{:?}", r)?;
        }
        Ok(())
    }
}

/// A single inclusive range of UTF-8 bytes.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct Utf8Range {
    /// Start of byte range (inclusive).
    pub start: u8,
    /// End of byte range (inclusive).
    pub end: u8,
}

impl Utf8Range {
    /// Creates a new inclusive byte range. No ordering between `start` and
    /// `end` is checked here.
    fn new(start: u8, end: u8) -> Self {
        Utf8Range { start, end }
    }

    /// Returns true if and only if the given byte is in this range.
    pub fn matches(&self, b: u8) -> bool {
        (self.start..=self.end).contains(&b)
    }
}

impl fmt::Debug for Utf8Range {
    /// Writes the range in upper case hexadecimal: `[XX]` when it contains a
    /// single byte and `[XX-YY]` otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.start == self.end {
            write!(f, "[{:X}]", self.start)
        } else {
            write!(f, "[{:X}-{:X}]", self.start, self.end)
        }
    }
}

/// An iterator over ranges of matching UTF-8 byte sequences.
///
/// The iteration represents an alternation of comprehensive byte sequences
/// that match precisely the set of UTF-8 encoded scalar values.
///
/// A byte sequence corresponds to one of the scalar values in the range given
/// if and only if it completely matches exactly one of the sequences of byte
/// ranges produced by this iterator.
///
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
/// sequences will match the same bytes.
///
/// # Example
///
/// This shows how to match an arbitrary byte sequence against a range of
/// scalar values.
///
/// ```rust
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
///
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
///     for range in seqs {
///         if range.matches(bytes) {
///             return true;
///         }
///     }
///     false
/// }
///
/// // Test the basic multilingual plane.
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
///
/// // UTF-8 encoding of 'a'.
/// assert!(matches(&seqs, &[0x61]));
/// // UTF-8 encoding of '☃' (`\u{2603}`).
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
/// // which is invalid UTF-8, and therefore fails, despite the fact that
/// // the corresponding codepoint (0xD800) falls in the range given.
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
/// // And fails against plain old invalid UTF-8.
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
/// ```
///
/// If this example seems circuitous, that's because it is! It's meant to be
/// illustrative. In practice, you could just try to decode your byte sequence
/// and compare it with the scalar value range directly. However, this is not
/// always possible (for example, in a byte based automaton).
#[derive(Debug)]
pub struct Utf8Sequences {
    /// Scalar value ranges that still have to be converted. The range on top
    /// of the stack is the next one to be processed.
    range_stack: Vec<ScalarRange>,
}

impl Utf8Sequences {
    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
    /// given.
    pub fn new(start: char, end: char) -> Self {
        let mut it = Utf8Sequences { range_stack: vec![] };
        it.push(start as u32, end as u32);
        it
    }

    /// reset resets the scalar value range.
    /// Any existing state is cleared, but resources may be reused.
    ///
    /// N.B. Benchmarks say that this method is dubious.
    #[doc(hidden)]
    pub fn reset(&mut self, start: char, end: char) {
        self.range_stack.clear();
        self.push(start as u32, end as u32);
    }

    /// Schedules the inclusive scalar value range `[start, end]` for
    /// conversion.
    fn push(&mut self, start: u32, end: u32) {
        self.range_stack.push(ScalarRange { start, end });
    }
}

/// An inclusive range of scalar values, stored as plain integers so that
/// intermediate bounds can be computed with ordinary arithmetic.
struct ScalarRange {
    start: u32,
    end: u32,
}

impl fmt::Debug for ScalarRange {
    /// Writes both bounds in upper case hexadecimal.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
    }
}

impl Iterator for Utf8Sequences {
    type Item = Utf8Sequence;

    /// Produces the next sequence of byte ranges.
    ///
    /// A range is popped off the stack and narrowed until its two end points
    /// can be encoded and paired up byte by byte. Whatever is cut off while
    /// narrowing is pushed back on the stack and handled by a later call.
    fn next(&mut self) -> Option<Self::Item> {
        'top: while let Some(mut r) = self.range_stack.pop() {
            'inner: loop {
                // Cut the surrogate gap out of the range: keep the part below
                // it and postpone the part above it.
                if let Some((r1, r2)) = r.split() {
                    self.push(r2.start, r2.end);
                    r.start = r1.start;
                    r.end = r1.end;
                    continue 'inner;
                }
                // Nothing to emit for an empty range.
                if !r.is_valid() {
                    continue 'top;
                }
                // Both end points must encode to the same number of bytes, so
                // split wherever the encoded length grows.
                for i in 1..MAX_UTF8_BYTES {
                    let max = max_scalar_value(i);
                    if r.start <= max && max < r.end {
                        self.push(max + 1, r.end);
                        r.end = max;
                        continue 'inner;
                    }
                }
                if let Some(ascii_range) = r.as_ascii() {
                    return Some(Utf8Sequence::One(ascii_range));
                }
                // `m` masks the low `6 * i` bits. When the end points differ
                // above the mask, the low bits of `start` must be all zeros
                // and the low bits of `end` must be all ones. Otherwise the
                // offending part is split off.
                for i in 1..MAX_UTF8_BYTES {
                    let m = (1 << (6 * i)) - 1;
                    if (r.start & !m) != (r.end & !m) {
                        if (r.start & m) != 0 {
                            self.push((r.start | m) + 1, r.end);
                            r.end = r.start | m;
                            continue 'inner;
                        }
                        if (r.end & m) != m {
                            self.push(r.end & !m, r.end);
                            r.end = (r.end & !m) - 1;
                            continue 'inner;
                        }
                    }
                }
                // The range is now in a shape where pairing up the encoded
                // bytes of its end points yields the byte ranges.
                let mut start = [0; MAX_UTF8_BYTES];
                let mut end = [0; MAX_UTF8_BYTES];
                let n = r.encode(&mut start, &mut end);
                return Some(Utf8Sequence::from_encoded_range(
                    &start[0..n],
                    &end[0..n],
                ));
            }
        }
        None
    }
}

impl FusedIterator for Utf8Sequences {}

impl ScalarRange {
    /// split splits this range if it overlaps with a surrogate codepoint.
    ///
    /// The first range returned ends at `0xD7FF` and the second one starts at
    /// `0xE000`. Either or both ranges may be invalid.
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
        if self.start < 0xE000 && self.end > 0xD7FF {
            Some((
                ScalarRange { start: self.start, end: 0xD7FF },
                ScalarRange { start: 0xE000, end: self.end },
            ))
        } else {
            None
        }
    }

    /// is_valid returns true if and only if start <= end.
    fn is_valid(&self) -> bool {
        self.start <= self.end
    }

    /// as_ascii returns this range as a Utf8Range if and only if all scalar
    /// values in this range can be encoded as a single byte.
    fn as_ascii(&self) -> Option<Utf8Range> {
        if self.is_ascii() {
            // Both bounds are at most 0x7F here, so the casts are lossless.
            Some(Utf8Range::new(self.start as u8, self.end as u8))
        } else {
            None
        }
    }

    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
    /// byte to encode any scalar value).
    fn is_ascii(&self) -> bool {
        self.is_valid() && self.end <= 0x7f
    }

    /// encode writes the UTF-8 encoding of the start and end of this range
    /// to the corresponding destination slices, and returns the number of
    /// bytes written.
    ///
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
    ///
    /// # Panics
    ///
    /// Panics if either bound is not a Unicode scalar value, or if the two
    /// bounds encode to a different number of bytes.
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
        let cs = char::from_u32(self.start)
            .expect("start of range must be a Unicode scalar value");
        let ce = char::from_u32(self.end)
            .expect("end of range must be a Unicode scalar value");
        let ss = cs.encode_utf8(start);
        let se = ce.encode_utf8(end);
        assert_eq!(ss.len(), se.len());
        ss.len()
    }
}

/// Returns the largest scalar value that can be encoded in `nbytes` bytes of
/// UTF-8.
///
/// # Panics
///
/// Panics if `nbytes` is not in the closed interval `[1, 4]`.
fn max_scalar_value(nbytes: usize) -> u32 {
    match nbytes {
        1 => 0x007F,
        2 => 0x07FF,
        3 => 0xFFFF,
        4 => 0x10FFFF,
        _ => unreachable!("invalid UTF-8 byte sequence size"),
    }
}

#[cfg(test)]
mod tests {
    use std::char;

    use super::{Utf8Range, Utf8Sequences};

    fn rutf8(s: u8, e: u8) -> Utf8Range {
        Utf8Range::new(s, e)
    }

    fn never_accepts_surrogate_codepoints(start: char, end: char) {
        for cp in 0xD800..0xE000 {
            let buf = encode_surrogate(cp);
            for r in Utf8Sequences::new(start, end) {
                if r.matches(&buf) {
                    panic!(
                        "Sequence ({:X}, {:X}) contains range {:?}, \
                         which matches surrogate code point {:X} \
                         with encoded bytes {:?}",
                        start as u32, end as u32, r, cp, buf,
                    );
                }
            }
        }
    }

    #[test]
    fn codepoints_no_surrogates() {
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
    }

    #[test]
    fn single_codepoint_one_sequence() {
        // Tests that every range of scalar values that contains a single
        // scalar value is recognized by one sequence of byte ranges.
        for i in 0..=0x10FFFF {
            let c = match char::from_u32(i) {
                None => continue,
                Some(c) => c,
            };
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
            assert_eq!(seqs.len(), 1);
        }
    }

    #[test]
    fn bmp() {
        use super::Utf8Sequence::*;

        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
        assert_eq!(
            seqs,
            vec![
                One(rutf8(0x0, 0x7F)),
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
                Three([
                    rutf8(0xE0, 0xE0),
                    rutf8(0xA0, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xE1, 0xEC),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xED, 0xED),
                    rutf8(0x80, 0x9F),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xEE, 0xEF),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
            ]
        );
    }

    #[test]
    fn reverse() {
        use super::Utf8Sequence::*;

        let mut s = One(rutf8(0xA, 0xB));
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);

        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);

        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
        );

        let mut s = Four([
            rutf8(0xA, 0xB),
            rutf8(0xB, 0xC),
            rutf8(0xC, 0xD),
            rutf8(0xD, 0xE),
        ]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[
                rutf8(0xD, 0xE),
                rutf8(0xC, 0xD),
                rutf8(0xB, 0xC),
                rutf8(0xA, 0xB)
            ]
        );
    }

    /// Encodes a surrogate codepoint with the three byte UTF-8 bit layout.
    /// The result is deliberately invalid UTF-8.
    fn encode_surrogate(cp: u32) -> [u8; 3] {
        const TAG_CONT: u8 = 0b1000_0000;
        const TAG_THREE_B: u8 = 0b1110_0000;

        assert!(0xD800 <= cp && cp < 0xE000);
        let mut dst = [0; 3];
        dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
        dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
        dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
        dst
    }
}