From 2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 14:47:55 +0200 Subject: Adding upstream version 0.70.1+ds1. Signed-off-by: Daniel Baumann --- vendor/regex-syntax/.cargo-checksum.json | 1 + vendor/regex-syntax/Cargo.toml | 61 + vendor/regex-syntax/LICENSE-APACHE | 201 + vendor/regex-syntax/LICENSE-MIT | 25 + vendor/regex-syntax/README.md | 96 + vendor/regex-syntax/src/ast/mod.rs | 1809 +++ vendor/regex-syntax/src/ast/parse.rs | 6326 +++++++++++ vendor/regex-syntax/src/ast/print.rs | 577 + vendor/regex-syntax/src/ast/visitor.rs | 522 + vendor/regex-syntax/src/debug.rs | 107 + vendor/regex-syntax/src/either.rs | 8 + vendor/regex-syntax/src/error.rs | 311 + vendor/regex-syntax/src/hir/interval.rs | 581 + vendor/regex-syntax/src/hir/literal.rs | 3214 ++++++ vendor/regex-syntax/src/hir/mod.rs | 3861 +++++++ vendor/regex-syntax/src/hir/print.rs | 608 + vendor/regex-syntax/src/hir/translate.rs | 3724 ++++++ vendor/regex-syntax/src/hir/visitor.rs | 215 + vendor/regex-syntax/src/lib.rs | 431 + vendor/regex-syntax/src/parser.rs | 254 + vendor/regex-syntax/src/rank.rs | 258 + vendor/regex-syntax/src/unicode.rs | 1039 ++ .../src/unicode_tables/LICENSE-UNICODE | 57 + vendor/regex-syntax/src/unicode_tables/age.rs | 1791 +++ .../src/unicode_tables/case_folding_simple.rs | 2888 +++++ .../src/unicode_tables/general_category.rs | 6552 +++++++++++ .../src/unicode_tables/grapheme_cluster_break.rs | 1416 +++ vendor/regex-syntax/src/unicode_tables/mod.rs | 57 + .../src/unicode_tables/perl_decimal.rs | 77 + .../regex-syntax/src/unicode_tables/perl_space.rs | 23 + .../regex-syntax/src/unicode_tables/perl_word.rs | 781 ++ .../src/unicode_tables/property_bool.rs | 11367 +++++++++++++++++++ .../src/unicode_tables/property_names.rs | 264 + .../src/unicode_tables/property_values.rs | 924 ++ vendor/regex-syntax/src/unicode_tables/script.rs | 1263 +++ .../src/unicode_tables/script_extension.rs | 1457 +++ .../src/unicode_tables/sentence_break.rs | 2477 ++++ .../regex-syntax/src/unicode_tables/word_break.rs | 1120 ++ vendor/regex-syntax/src/utf8.rs | 592 + vendor/regex-syntax/test | 30 + 40 files changed, 57365 insertions(+) create mode 100644 vendor/regex-syntax/.cargo-checksum.json create mode 100644 vendor/regex-syntax/Cargo.toml create mode 100644 vendor/regex-syntax/LICENSE-APACHE create mode 100644 vendor/regex-syntax/LICENSE-MIT create mode 100644 vendor/regex-syntax/README.md create mode 100644 vendor/regex-syntax/src/ast/mod.rs create mode 100644 vendor/regex-syntax/src/ast/parse.rs create mode 100644 vendor/regex-syntax/src/ast/print.rs create mode 100644 vendor/regex-syntax/src/ast/visitor.rs create mode 100644 vendor/regex-syntax/src/debug.rs create mode 100644 vendor/regex-syntax/src/either.rs create mode 100644 vendor/regex-syntax/src/error.rs create mode 100644 vendor/regex-syntax/src/hir/interval.rs create mode 100644 vendor/regex-syntax/src/hir/literal.rs create mode 100644 vendor/regex-syntax/src/hir/mod.rs create mode 100644 vendor/regex-syntax/src/hir/print.rs create mode 100644 vendor/regex-syntax/src/hir/translate.rs create mode 100644 vendor/regex-syntax/src/hir/visitor.rs create mode 100644 vendor/regex-syntax/src/lib.rs create mode 100644 vendor/regex-syntax/src/parser.rs create mode 100644 vendor/regex-syntax/src/rank.rs create mode 100644 vendor/regex-syntax/src/unicode.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE create mode 100644 vendor/regex-syntax/src/unicode_tables/age.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/general_category.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/mod.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/perl_decimal.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/perl_space.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/perl_word.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/property_bool.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/property_names.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/property_values.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/script.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/script_extension.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/sentence_break.rs create mode 100644 vendor/regex-syntax/src/unicode_tables/word_break.rs create mode 100644 vendor/regex-syntax/src/utf8.rs create mode 100755 vendor/regex-syntax/test (limited to 'vendor/regex-syntax') diff --git a/vendor/regex-syntax/.cargo-checksum.json b/vendor/regex-syntax/.cargo-checksum.json new file mode 100644 index 0000000..cac2bb8 --- /dev/null +++ b/vendor/regex-syntax/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{},"package":"c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"} \ No newline at end of file diff --git a/vendor/regex-syntax/Cargo.toml b/vendor/regex-syntax/Cargo.toml new file mode 100644 index 0000000..3602ab3 --- /dev/null +++ b/vendor/regex-syntax/Cargo.toml @@ -0,0 +1,61 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +rust-version = "1.65" +name = "regex-syntax" +version = "0.8.2" +authors = [ + "The Rust Project Developers", + "Andrew Gallant ", +] +description = "A regular expression parser." +documentation = "https://docs.rs/regex-syntax" +readme = "README.md" +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = [ + "--cfg", + "docsrs", +] + +[dependencies.arbitrary] +version = "1.3.0" +features = ["derive"] +optional = true + +[features] +arbitrary = ["dep:arbitrary"] +default = [ + "std", + "unicode", +] +std = [] +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", +] +unicode-age = [] +unicode-bool = [] +unicode-case = [] +unicode-gencat = [] +unicode-perl = [] +unicode-script = [] +unicode-segment = [] diff --git a/vendor/regex-syntax/LICENSE-APACHE b/vendor/regex-syntax/LICENSE-APACHE new file mode 100644 index 0000000..16fe87b --- /dev/null +++ b/vendor/regex-syntax/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/regex-syntax/LICENSE-MIT b/vendor/regex-syntax/LICENSE-MIT new file mode 100644 index 0000000..39d4bdb --- /dev/null +++ b/vendor/regex-syntax/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/vendor/regex-syntax/README.md b/vendor/regex-syntax/README.md new file mode 100644 index 0000000..529513b --- /dev/null +++ b/vendor/regex-syntax/README.md @@ -0,0 +1,96 @@ +regex-syntax +============ +This crate provides a robust regular expression parser. + +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) +[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) + + +### Documentation + +https://docs.rs/regex-syntax + + +### Overview + +There are two primary types exported by this crate: `Ast` and `Hir`. The former +is a faithful abstract syntax of a regular expression, and can convert regular +expressions back to their concrete syntax while mostly preserving its original +form. The latter type is a high level intermediate representation of a regular +expression that is amenable to analysis and compilation into byte codes or +automata. An `Hir` achieves this by drastically simplifying the syntactic +structure of the regular expression. While an `Hir` can be converted back to +its equivalent concrete syntax, the result is unlikely to resemble the original +concrete syntax that produced the `Hir`. + + +### Example + +This example shows how to parse a pattern string into its HIR: + +```rust +use regex_syntax::{hir::Hir, parse}; + +let hir = parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), +])); +``` + + +### Safety + +This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's +possible this crate could use `unsafe` code in the future, the standard +for doing so is extremely high. In general, most code in this crate is not +performance critical, since it tends to be dwarfed by the time it takes to +compile a regular expression into an automaton. Therefore, there is little need +for extreme optimization, and therefore, use of `unsafe`. + +The standard for using `unsafe` in this crate is extremely high because this +crate is intended to be reasonably safe to use with user supplied regular +expressions. Therefore, while there may be bugs in the regex parser itself, +they should _never_ result in memory unsafety unless there is either a bug +in the compiler or the standard library. (Since `regex-syntax` has zero +dependencies.) + + +### Crate features + +By default, this crate bundles a fairly large amount of Unicode data tables +(a source size of ~750KB). Because of their large size, one can disable some +or all of these data tables. If a regular expression attempts to use Unicode +data that is not available, then an error will occur when translating the `Ast` +to the `Hir`. + +The full set of features one can disable are +[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features). + + +### Testing + +Simply running `cargo test` will give you very good coverage. However, because +of the large number of features exposed by this crate, a `test` script is +included in this directory which will test several feature combinations. This +is the same script that is run in CI. + + +### Motivation + +The primary purpose of this crate is to provide the parser used by `regex`. +Specifically, this crate is treated as an implementation detail of the `regex`, +and is primarily developed for the needs of `regex`. + +Since this crate is an implementation detail of `regex`, it may experience +breaking change releases at a different cadence from `regex`. This is only +possible because this crate is _not_ a public dependency of `regex`. + +Another consequence of this de-coupling is that there is no direct way to +compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must +first convert the `Hir` to a string (via its `std::fmt::Display`) and then +compile that via `Regex::new`. While this does repeat some work, compilation +typically takes much longer than parsing. + +Stated differently, the coupling between `regex` and `regex-syntax` exists only +at the level of the concrete syntax. diff --git a/vendor/regex-syntax/src/ast/mod.rs b/vendor/regex-syntax/src/ast/mod.rs new file mode 100644 index 0000000..6a77ee1 --- /dev/null +++ b/vendor/regex-syntax/src/ast/mod.rs @@ -0,0 +1,1809 @@ +/*! +Defines an abstract syntax for regular expressions. +*/ + +use core::cmp::Ordering; + +use alloc::{boxed::Box, string::String, vec, vec::Vec}; + +pub use crate::ast::visitor::{visit, Visitor}; + +pub mod parse; +pub mod print; +mod visitor; + +/// An error that occurred while parsing a regular expression into an abstract +/// syntax tree. +/// +/// Note that not all ASTs represents a valid regular expression. For example, +/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a +/// valid Unicode property name. That particular error is reported when +/// translating an AST to the high-level intermediate representation (`HIR`). +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the parser generated the error from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } + + /// Return an auxiliary span. This span exists only for some errors that + /// benefit from being able to point to two locations in the original + /// regular expression. For example, "duplicate" errors will have the + /// main error position set to the duplicate occurrence while its + /// auxiliary span will be set to the initial occurrence. + pub fn auxiliary_span(&self) -> Option<&Span> { + use self::ErrorKind::*; + match self.kind { + FlagDuplicate { ref original } => Some(original), + FlagRepeatedNegation { ref original, .. } => Some(original), + GroupNameDuplicate { ref original, .. } => Some(original), + _ => None, + } + } +} + +/// The type of an error that occurred while building an AST. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ErrorKind { + /// The capturing group limit was exceeded. + /// + /// Note that this represents a limit on the total number of capturing + /// groups in a regex and not necessarily the number of nested capturing + /// groups. That is, the nest limit can be low and it is still possible for + /// this error to occur. + CaptureLimitExceeded, + /// An invalid escape sequence was found in a character class set. + ClassEscapeInvalid, + /// An invalid character class range was found. An invalid range is any + /// range where the start is greater than the end. + ClassRangeInvalid, + /// An invalid range boundary was found in a character class. Range + /// boundaries must be a single literal codepoint, but this error indicates + /// that something else was found, such as a nested class. + ClassRangeLiteral, + /// An opening `[` was found with no corresponding closing `]`. + ClassUnclosed, + /// Note that this error variant is no longer used. Namely, a decimal + /// number can only appear as a repetition quantifier. When the number + /// in a repetition quantifier is empty, then it gets its own specialized + /// error, `RepetitionCountDecimalEmpty`. + DecimalEmpty, + /// An invalid decimal number was given where one was expected. + DecimalInvalid, + /// A bracketed hex literal was empty. + EscapeHexEmpty, + /// A bracketed hex literal did not correspond to a Unicode scalar value. + EscapeHexInvalid, + /// An invalid hexadecimal digit was found. + EscapeHexInvalidDigit, + /// EOF was found before an escape sequence was completed. + EscapeUnexpectedEof, + /// An unrecognized escape sequence. + EscapeUnrecognized, + /// A dangling negation was used when setting flags, e.g., `i-`. + FlagDanglingNegation, + /// A flag was used twice, e.g., `i-i`. + FlagDuplicate { + /// The position of the original flag. The error position + /// points to the duplicate flag. + original: Span, + }, + /// The negation operator was used twice, e.g., `-i-s`. + FlagRepeatedNegation { + /// The position of the original negation operator. The error position + /// points to the duplicate negation operator. + original: Span, + }, + /// Expected a flag but got EOF, e.g., `(?`. + FlagUnexpectedEof, + /// Unrecognized flag, e.g., `a`. + FlagUnrecognized, + /// A duplicate capture name was found. + GroupNameDuplicate { + /// The position of the initial occurrence of the capture name. The + /// error position itself points to the duplicate occurrence. + original: Span, + }, + /// A capture group name is empty, e.g., `(?P<>abc)`. + GroupNameEmpty, + /// An invalid character was seen for a capture group name. This includes + /// errors where the first character is a digit (even though subsequent + /// characters are allowed to be digits). + GroupNameInvalid, + /// A closing `>` could not be found for a capture group name. + GroupNameUnexpectedEof, + /// An unclosed group, e.g., `(ab`. + /// + /// The span of this error corresponds to the unclosed parenthesis. + GroupUnclosed, + /// An unopened group, e.g., `ab)`. + GroupUnopened, + /// The nest limit was exceeded. The limit stored here is the limit + /// configured in the parser. + NestLimitExceeded(u32), + /// The range provided in a counted repetition operator is invalid. The + /// range is invalid if the start is greater than the end. + RepetitionCountInvalid, + /// An opening `{` was not followed by a valid decimal value. + /// For example, `x{}` or `x{]}` would fail. + RepetitionCountDecimalEmpty, + /// An opening `{` was found with no corresponding closing `}`. + RepetitionCountUnclosed, + /// A repetition operator was applied to a missing sub-expression. This + /// occurs, for example, in the regex consisting of just a `*` or even + /// `(?i)*`. It is, however, possible to create a repetition operating on + /// an empty sub-expression. For example, `()*` is still considered valid. + RepetitionMissing, + /// The special word boundary syntax, `\b{something}`, was used, but + /// either EOF without `}` was seen, or an invalid character in the + /// braces was seen. + SpecialWordBoundaryUnclosed, + /// The special word boundary syntax, `\b{something}`, was used, but + /// `something` was not recognized as a valid word boundary kind. + SpecialWordBoundaryUnrecognized, + /// The syntax `\b{` was observed, but afterwards the end of the pattern + /// was observed without being able to tell whether it was meant to be a + /// bounded repetition on the `\b` or the beginning of a special word + /// boundary assertion. + SpecialWordOrRepetitionUnexpectedEof, + /// The Unicode class is not valid. This typically occurs when a `\p` is + /// followed by something other than a `{`. + UnicodeClassInvalid, + /// When octal support is disabled, this error is produced when an octal + /// escape is used. The octal escape is assumed to be an invocation of + /// a backreference, which is the common case. + UnsupportedBackreference, + /// When syntax similar to PCRE's look-around is used, this error is + /// returned. Some example syntaxes that are rejected include, but are + /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and + /// `(?) -> core::fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::ErrorKind::*; + match *self { + CaptureLimitExceeded => write!( + f, + "exceeded the maximum number of \ + capturing groups ({})", + u32::MAX + ), + ClassEscapeInvalid => { + write!(f, "invalid escape sequence found in character class") + } + ClassRangeInvalid => write!( + f, + "invalid character class range, \ + the start must be <= the end" + ), + ClassRangeLiteral => { + write!(f, "invalid range boundary, must be a literal") + } + ClassUnclosed => write!(f, "unclosed character class"), + DecimalEmpty => write!(f, "decimal literal empty"), + DecimalInvalid => write!(f, "decimal literal invalid"), + EscapeHexEmpty => write!(f, "hexadecimal literal empty"), + EscapeHexInvalid => { + write!(f, "hexadecimal literal is not a Unicode scalar value") + } + EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"), + EscapeUnexpectedEof => write!( + f, + "incomplete escape sequence, \ + reached end of pattern prematurely" + ), + EscapeUnrecognized => write!(f, "unrecognized escape sequence"), + FlagDanglingNegation => { + write!(f, "dangling flag negation operator") + } + FlagDuplicate { .. } => write!(f, "duplicate flag"), + FlagRepeatedNegation { .. } => { + write!(f, "flag negation operator repeated") + } + FlagUnexpectedEof => { + write!(f, "expected flag but got end of regex") + } + FlagUnrecognized => write!(f, "unrecognized flag"), + GroupNameDuplicate { .. } => { + write!(f, "duplicate capture group name") + } + GroupNameEmpty => write!(f, "empty capture group name"), + GroupNameInvalid => write!(f, "invalid capture group character"), + GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), + GroupUnclosed => write!(f, "unclosed group"), + GroupUnopened => write!(f, "unopened group"), + NestLimitExceeded(limit) => write!( + f, + "exceed the maximum number of \ + nested parentheses/brackets ({})", + limit + ), + RepetitionCountInvalid => write!( + f, + "invalid repetition count range, \ + the start must be <= the end" + ), + RepetitionCountDecimalEmpty => { + write!(f, "repetition quantifier expects a valid decimal") + } + RepetitionCountUnclosed => { + write!(f, "unclosed counted repetition") + } + RepetitionMissing => { + write!(f, "repetition operator missing expression") + } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } + UnicodeClassInvalid => { + write!(f, "invalid Unicode character class") + } + UnsupportedBackreference => { + write!(f, "backreferences are not supported") + } + UnsupportedLookAround => write!( + f, + "look-around, including look-ahead and look-behind, \ + is not supported" + ), + } + } +} + +/// Span represents the position information of a single AST item. +/// +/// All span positions are absolute byte offsets that can be used on the +/// original regular expression that was parsed. +#[derive(Clone, Copy, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Span { + /// The start byte offset. + pub start: Position, + /// The end byte offset. + pub end: Position, +} + +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "Span({:?}, {:?})", self.start, self.end) + } +} + +impl Ord for Span { + fn cmp(&self, other: &Span) -> Ordering { + (&self.start, &self.end).cmp(&(&other.start, &other.end)) + } +} + +impl PartialOrd for Span { + fn partial_cmp(&self, other: &Span) -> Option { + Some(self.cmp(other)) + } +} + +/// A single position in a regular expression. +/// +/// A position encodes one half of a span, and include the byte offset, line +/// number and column number. +#[derive(Clone, Copy, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Position { + /// The absolute offset of this position, starting at `0` from the + /// beginning of the regular expression pattern string. + pub offset: usize, + /// The line number, starting at `1`. + pub line: usize, + /// The approximate column number, starting at `1`. + pub column: usize, +} + +impl core::fmt::Debug for Position { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "Position(o: {:?}, l: {:?}, c: {:?})", + self.offset, self.line, self.column + ) + } +} + +impl Ord for Position { + fn cmp(&self, other: &Position) -> Ordering { + self.offset.cmp(&other.offset) + } +} + +impl PartialOrd for Position { + fn partial_cmp(&self, other: &Position) -> Option { + Some(self.cmp(other)) + } +} + +impl Span { + /// Create a new span with the given positions. + pub fn new(start: Position, end: Position) -> Span { + Span { start, end } + } + + /// Create a new span using the given position as the start and end. + pub fn splat(pos: Position) -> Span { + Span::new(pos, pos) + } + + /// Create a new span by replacing the starting the position with the one + /// given. + pub fn with_start(self, pos: Position) -> Span { + Span { start: pos, ..self } + } + + /// Create a new span by replacing the ending the position with the one + /// given. + pub fn with_end(self, pos: Position) -> Span { + Span { end: pos, ..self } + } + + /// Returns true if and only if this span occurs on a single line. + pub fn is_one_line(&self) -> bool { + self.start.line == self.end.line + } + + /// Returns true if and only if this span is empty. That is, it points to + /// a single position in the concrete syntax of a regular expression. + pub fn is_empty(&self) -> bool { + self.start.offset == self.end.offset + } +} + +impl Position { + /// Create a new position with the given information. + /// + /// `offset` is the absolute offset of the position, starting at `0` from + /// the beginning of the regular expression pattern string. + /// + /// `line` is the line number, starting at `1`. + /// + /// `column` is the approximate column number, starting at `1`. + pub fn new(offset: usize, line: usize, column: usize) -> Position { + Position { offset, line, column } + } +} + +/// An abstract syntax tree for a singular expression along with comments +/// found. +/// +/// Comments are not stored in the tree itself to avoid complexity. Each +/// comment contains a span of precisely where it occurred in the original +/// regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct WithComments { + /// The actual ast. + pub ast: Ast, + /// All comments found in the original regular expression. + pub comments: Vec, +} + +/// A comment from a regular expression with an associated span. +/// +/// A regular expression can only contain comments when the `x` flag is +/// enabled. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Comment { + /// The span of this comment, including the beginning `#` and ending `\n`. + pub span: Span, + /// The comment text, starting with the first character following the `#` + /// and ending with the last character preceding the `\n`. + pub comment: String, +} + +/// An abstract syntax tree for a single regular expression. +/// +/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap +/// space proportional to the size of the `Ast`. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the `Ast`. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum Ast { + /// An empty regex that matches everything. + Empty(Box), + /// A set of flags, e.g., `(?is)`. + Flags(Box), + /// A single character literal, which includes escape sequences. + Literal(Box), + /// The "any character" class. + Dot(Box), + /// A single zero-width assertion. + Assertion(Box), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box), + /// A repetition operator applied to an arbitrary regular expression. + Repetition(Box), + /// A grouped regular expression. + Group(Box), + /// An alternation of regular expressions. + Alternation(Box), + /// A concatenation of regular expressions. + Concat(Box), +} + +impl Ast { + /// Create an "empty" AST item. + pub fn empty(span: Span) -> Ast { + Ast::Empty(Box::new(span)) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast::Flags(Box::new(e)) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast::Literal(Box::new(e)) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast::Dot(Box::new(span)) + } + + /// Create a "assertion" AST item. + pub fn assertion(e: Assertion) -> Ast { + Ast::Assertion(Box::new(e)) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast::ClassUnicode(Box::new(e)) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast::ClassPerl(Box::new(e)) + } + + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast::ClassBracketed(Box::new(e)) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast::Repetition(Box::new(e)) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast::Group(Box::new(e)) + } + + /// Create a "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast::Alternation(Box::new(e)) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast::Concat(Box::new(e)) + } + + /// Return the span of this abstract syntax tree. + pub fn span(&self) -> &Span { + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + } + } + + /// Return true if and only if this Ast is empty. + pub fn is_empty(&self) -> bool { + match *self { + Ast::Empty(_) => true, + _ => false, + } + } + + /// Returns true if and only if this AST has any (including possibly empty) + /// subexpressions. + fn has_subexprs(&self) -> bool { + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, + } + } +} + +/// Print a display representation of this Ast. +/// +/// This does not preserve any of the original whitespace formatting that may +/// have originally been present in the concrete syntax from which this Ast +/// was generated. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Ast`. +impl core::fmt::Display for Ast { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use crate::ast::print::Printer; + Printer::new().print(self, f) + } +} + +/// An alternation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Alternation { + /// The span of this alternation. + pub span: Span, + /// The alternate regular expressions. + pub asts: Vec, +} + +impl Alternation { + /// Return this alternation as an AST. + /// + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::alternation(self), + } + } +} + +/// A concatenation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Concat { + /// The span of this concatenation. + pub span: Span, + /// The concatenation regular expressions. + pub asts: Vec, +} + +impl Concat { + /// Return this concatenation as an AST. + /// + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::concat(self), + } + } +} + +/// A single literal expression. +/// +/// A literal corresponds to a single Unicode scalar value. Literals may be +/// represented in their literal form, e.g., `a` or in their escaped form, +/// e.g., `\x61`. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Literal { + /// The span of this literal. + pub span: Span, + /// The kind of this literal. + pub kind: LiteralKind, + /// The Unicode scalar value corresponding to this literal. + pub c: char, +} + +impl Literal { + /// If this literal was written as a `\x` hex escape, then this returns + /// the corresponding byte value. Otherwise, this returns `None`. + pub fn byte(&self) -> Option { + match self.kind { + LiteralKind::HexFixed(HexLiteralKind::X) => { + u8::try_from(self.c).ok() + } + _ => None, + } + } +} + +/// The kind of a single literal expression. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum LiteralKind { + /// The literal is written verbatim, e.g., `a` or `☃`. + Verbatim, + /// The literal is written as an escape because it is otherwise a special + /// regex meta character, e.g., `\*` or `\[`. + Meta, + /// The literal is written as an escape despite the fact that the escape is + /// unnecessary, e.g., `\%` or `\/`. + Superfluous, + /// The literal is written as an octal escape, e.g., `\141`. + Octal, + /// The literal is written as a hex code with a fixed number of digits + /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or + /// `\U00000061`. + HexFixed(HexLiteralKind), + /// The literal is written as a hex code with a bracketed number of + /// digits. The only restriction is that the bracketed hex code must refer + /// to a valid Unicode scalar value. + HexBrace(HexLiteralKind), + /// The literal is written as a specially recognized escape, e.g., `\f` + /// or `\n`. + Special(SpecialLiteralKind), +} + +/// The type of a special literal. +/// +/// A special literal is a special escape sequence recognized by the regex +/// parser, e.g., `\f` or `\n`. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum SpecialLiteralKind { + /// Bell, spelled `\a` (`\x07`). + Bell, + /// Form feed, spelled `\f` (`\x0C`). + FormFeed, + /// Tab, spelled `\t` (`\x09`). + Tab, + /// Line feed, spelled `\n` (`\x0A`). + LineFeed, + /// Carriage return, spelled `\r` (`\x0D`). + CarriageReturn, + /// Vertical tab, spelled `\v` (`\x0B`). + VerticalTab, + /// Space, spelled `\ ` (`\x20`). Note that this can only appear when + /// parsing in verbose mode. + Space, +} + +/// The type of a Unicode hex literal. +/// +/// Note that all variants behave the same when used with brackets. They only +/// differ when used without brackets in the number of hex digits that must +/// follow. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum HexLiteralKind { + /// A `\x` prefix. When used without brackets, this form is limited to + /// two digits. + X, + /// A `\u` prefix. When used without brackets, this form is limited to + /// four digits. + UnicodeShort, + /// A `\U` prefix. When used without brackets, this form is limited to + /// eight digits. + UnicodeLong, +} + +impl HexLiteralKind { + /// The number of digits that must be used with this literal form when + /// used without brackets. When used with brackets, there is no + /// restriction on the number of digits. + pub fn digits(&self) -> u32 { + match *self { + HexLiteralKind::X => 2, + HexLiteralKind::UnicodeShort => 4, + HexLiteralKind::UnicodeLong => 8, + } + } +} + +/// A Perl character class. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassPerl { + /// The span of this class. + pub span: Span, + /// The kind of Perl class. + pub kind: ClassPerlKind, + /// Whether the class is negated or not. e.g., `\d` is not negated but + /// `\D` is. + pub negated: bool, +} + +/// The available Perl character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ClassPerlKind { + /// Decimal numbers. + Digit, + /// Whitespace. + Space, + /// Word characters. + Word, +} + +/// An ASCII character class. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassAscii { + /// The span of this class. + pub span: Span, + /// The kind of ASCII class. + pub kind: ClassAsciiKind, + /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated + /// but `[[:^alpha:]]` is. + pub negated: bool, +} + +/// The available ASCII character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ClassAsciiKind { + /// `[0-9A-Za-z]` + Alnum, + /// `[A-Za-z]` + Alpha, + /// `[\x00-\x7F]` + Ascii, + /// `[ \t]` + Blank, + /// `[\x00-\x1F\x7F]` + Cntrl, + /// `[0-9]` + Digit, + /// `[!-~]` + Graph, + /// `[a-z]` + Lower, + /// `[ -~]` + Print, + /// `[!-/:-@\[-`{-~]` + Punct, + /// `[\t\n\v\f\r ]` + Space, + /// `[A-Z]` + Upper, + /// `[0-9A-Za-z_]` + Word, + /// `[0-9A-Fa-f]` + Xdigit, +} + +impl ClassAsciiKind { + /// Return the corresponding ClassAsciiKind variant for the given name. + /// + /// The name given should correspond to the lowercase version of the + /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. + /// + /// If no variant with the corresponding name exists, then `None` is + /// returned. + pub fn from_name(name: &str) -> Option { + use self::ClassAsciiKind::*; + match name { + "alnum" => Some(Alnum), + "alpha" => Some(Alpha), + "ascii" => Some(Ascii), + "blank" => Some(Blank), + "cntrl" => Some(Cntrl), + "digit" => Some(Digit), + "graph" => Some(Graph), + "lower" => Some(Lower), + "print" => Some(Print), + "punct" => Some(Punct), + "space" => Some(Space), + "upper" => Some(Upper), + "word" => Some(Word), + "xdigit" => Some(Xdigit), + _ => None, + } + } +} + +/// A Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassUnicode { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. + /// + /// Note: be careful when using this attribute. This specifically refers + /// to whether the class is written as `\p` or `\P`, where the latter + /// is `negated = true`. However, it also possible to write something like + /// `\P{scx!=Katakana}` which is actually equivalent to + /// `\p{scx=Katakana}` and is therefore not actually negated even though + /// `negated = true` here. To test whether this class is truly negated + /// or not, use the `is_negated` method. + pub negated: bool, + /// The kind of Unicode class. + pub kind: ClassUnicodeKind, +} + +impl ClassUnicode { + /// Returns true if this class has been negated. + /// + /// Note that this takes the Unicode op into account, if it's present. + /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. + pub fn is_negated(&self) -> bool { + match self.kind { + ClassUnicodeKind::NamedValue { + op: ClassUnicodeOpKind::NotEqual, + .. + } => !self.negated, + _ => self.negated, + } + } +} + +/// The available forms of Unicode character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeKind { + /// A one letter abbreviated class, e.g., `\pN`. + OneLetter(char), + /// A binary property, general category or script. The string may be + /// empty. + Named(String), + /// A property name and an associated value. + NamedValue { + /// The type of Unicode op used to associate `name` with `value`. + op: ClassUnicodeOpKind, + /// The property name (which may be empty). + name: String, + /// The property value (which may be empty). + value: String, + }, +} + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for ClassUnicodeKind { + fn arbitrary( + u: &mut arbitrary::Unstructured, + ) -> arbitrary::Result { + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + { + use alloc::string::ToString; + + use super::unicode_tables::{ + property_names::PROPERTY_NAMES, + property_values::PROPERTY_VALUES, + }; + + match u.choose_index(3)? { + 0 => { + let all = PROPERTY_VALUES + .iter() + .flat_map(|e| e.1.iter()) + .filter(|(name, _)| name.len() == 1) + .count(); + let idx = u.choose_index(all)?; + let value = PROPERTY_VALUES + .iter() + .flat_map(|e| e.1.iter()) + .take(idx + 1) + .last() + .unwrap() + .0 + .chars() + .next() + .unwrap(); + Ok(ClassUnicodeKind::OneLetter(value)) + } + 1 => { + let all = PROPERTY_VALUES + .iter() + .map(|e| e.1.len()) + .sum::() + + PROPERTY_NAMES.len(); + let idx = u.choose_index(all)?; + let name = PROPERTY_VALUES + .iter() + .flat_map(|e| e.1.iter()) + .chain(PROPERTY_NAMES) + .map(|(_, e)| e) + .take(idx + 1) + .last() + .unwrap(); + Ok(ClassUnicodeKind::Named(name.to_string())) + } + 2 => { + let all = PROPERTY_VALUES + .iter() + .map(|e| e.1.len()) + .sum::(); + let idx = u.choose_index(all)?; + let (prop, value) = PROPERTY_VALUES + .iter() + .flat_map(|e| { + e.1.iter().map(|(_, value)| (e.0, value)) + }) + .take(idx + 1) + .last() + .unwrap(); + Ok(ClassUnicodeKind::NamedValue { + op: u.arbitrary()?, + name: prop.to_string(), + value: value.to_string(), + }) + } + _ => unreachable!("index chosen is impossible"), + } + } + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + { + match u.choose_index(3)? { + 0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)), + 1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)), + 2 => Ok(ClassUnicodeKind::NamedValue { + op: u.arbitrary()?, + name: u.arbitrary()?, + value: u.arbitrary()?, + }), + _ => unreachable!("index chosen is impossible"), + } + } + } + + fn size_hint(depth: usize) -> (usize, Option) { + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + { + arbitrary::size_hint::and_all(&[ + usize::size_hint(depth), + usize::size_hint(depth), + arbitrary::size_hint::or( + (0, Some(0)), + ClassUnicodeOpKind::size_hint(depth), + ), + ]) + } + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + { + arbitrary::size_hint::and( + usize::size_hint(depth), + arbitrary::size_hint::or_all(&[ + char::size_hint(depth), + String::size_hint(depth), + arbitrary::size_hint::and_all(&[ + String::size_hint(depth), + String::size_hint(depth), + ClassUnicodeOpKind::size_hint(depth), + ]), + ]), + ) + } + } +} + +/// The type of op used in a Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ClassUnicodeOpKind { + /// A property set to a specific value, e.g., `\p{scx=Katakana}`. + Equal, + /// A property set to a specific value using a colon, e.g., + /// `\p{scx:Katakana}`. + Colon, + /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. + NotEqual, +} + +impl ClassUnicodeOpKind { + /// Whether the op is an equality op or not. + pub fn is_equal(&self) -> bool { + match *self { + ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true, + _ => false, + } + } +} + +/// A bracketed character class, e.g., `[a-z0-9]`. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassBracketed { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. e.g., `[a]` is not negated but + /// `[^a]` is. + pub negated: bool, + /// The type of this set. A set is either a normal union of things, e.g., + /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. + pub kind: ClassSet, +} + +/// A character class set. +/// +/// This type corresponds to the internal structure of a bracketed character +/// class. That is, every bracketed character is one of two types: a union of +/// items (literals, ranges, other bracketed classes) or a tree of binary set +/// operations. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ClassSet { + /// An item, which can be a single literal, range, nested character class + /// or a union of items. + Item(ClassSetItem), + /// A single binary operation (i.e., &&, -- or ~~). + BinaryOp(ClassSetBinaryOp), +} + +impl ClassSet { + /// Build a set from a union. + pub fn union(ast: ClassSetUnion) -> ClassSet { + ClassSet::Item(ClassSetItem::Union(ast)) + } + + /// Return the span of this character class set. + pub fn span(&self) -> &Span { + match *self { + ClassSet::Item(ref x) => x.span(), + ClassSet::BinaryOp(ref x) => &x.span, + } + } + + /// Return true if and only if this class set is empty. + fn is_empty(&self) -> bool { + match *self { + ClassSet::Item(ClassSetItem::Empty(_)) => true, + _ => false, + } + } +} + +/// A single component of a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ClassSetItem { + /// An empty item. + /// + /// Note that a bracketed character class cannot contain a single empty + /// item. Empty items can appear when using one of the binary operators. + /// For example, `[&&]` is the intersection of two empty classes. + Empty(Span), + /// A single literal. + Literal(Literal), + /// A range between two literals. + Range(ClassSetRange), + /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. + Ascii(ClassAscii), + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(Box), + /// A union of items. + Union(ClassSetUnion), +} + +impl ClassSetItem { + /// Return the span of this character class set item. + pub fn span(&self) -> &Span { + match *self { + ClassSetItem::Empty(ref span) => span, + ClassSetItem::Literal(ref x) => &x.span, + ClassSetItem::Range(ref x) => &x.span, + ClassSetItem::Ascii(ref x) => &x.span, + ClassSetItem::Perl(ref x) => &x.span, + ClassSetItem::Unicode(ref x) => &x.span, + ClassSetItem::Bracketed(ref x) => &x.span, + ClassSetItem::Union(ref x) => &x.span, + } + } +} + +/// A single character class range in a set. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassSetRange { + /// The span of this range. + pub span: Span, + /// The start of this range. + pub start: Literal, + /// The end of this range. + pub end: Literal, +} + +impl ClassSetRange { + /// Returns true if and only if this character class range is valid. + /// + /// The only case where a range is invalid is if its start is greater than + /// its end. + pub fn is_valid(&self) -> bool { + self.start.c <= self.end.c + } +} + +/// A union of items inside a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassSetUnion { + /// The span of the items in this operation. e.g., the `a-z0-9` in + /// `[^a-z0-9]` + pub span: Span, + /// The sequence of items that make up this union. + pub items: Vec, +} + +impl ClassSetUnion { + /// Push a new item in this union. + /// + /// The ending position of this union's span is updated to the ending + /// position of the span of the item given. If the union is empty, then + /// the starting position of this union is set to the starting position + /// of this item. + /// + /// In other words, if you only use this method to add items to a union + /// and you set the spans on each item correctly, then you should never + /// need to adjust the span of the union directly. + pub fn push(&mut self, item: ClassSetItem) { + if self.items.is_empty() { + self.span.start = item.span().start; + } + self.span.end = item.span().end; + self.items.push(item); + } + + /// Return this union as a character class set item. + /// + /// If this union contains zero items, then an empty union is + /// returned. If this concatenation contains exactly 1 item, then the + /// corresponding item is returned. Otherwise, ClassSetItem::Union is + /// returned. + pub fn into_item(mut self) -> ClassSetItem { + match self.items.len() { + 0 => ClassSetItem::Empty(self.span), + 1 => self.items.pop().unwrap(), + _ => ClassSetItem::Union(self), + } + } +} + +/// A Unicode character class set operation. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct ClassSetBinaryOp { + /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. + pub span: Span, + /// The type of this set operation. + pub kind: ClassSetBinaryOpKind, + /// The left hand side of the operation. + pub lhs: Box, + /// The right hand side of the operation. + pub rhs: Box, +} + +/// The type of a Unicode character class set operation. +/// +/// Note that this doesn't explicitly represent union since there is no +/// explicit union operator. Concatenation inside a character class corresponds +/// to the union operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum ClassSetBinaryOpKind { + /// The intersection of two sets, e.g., `\pN&&[a-z]`. + Intersection, + /// The difference of two sets, e.g., `\pN--[0-9]`. + Difference, + /// The symmetric difference of two sets. The symmetric difference is the + /// set of elements belonging to one but not both sets. + /// e.g., `[\pL~~[:ascii:]]`. + SymmetricDifference, +} + +/// A single zero-width assertion. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Assertion { + /// The span of this assertion. + pub span: Span, + /// The assertion kind, e.g., `\b` or `^`. + pub kind: AssertionKind, +} + +/// An assertion kind. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum AssertionKind { + /// `^` + StartLine, + /// `$` + EndLine, + /// `\A` + StartText, + /// `\z` + EndText, + /// `\b` + WordBoundary, + /// `\B` + NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, +} + +/// A repetition operation applied to a regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Repetition { + /// The span of this operation. + pub span: Span, + /// The actual operation. + pub op: RepetitionOp, + /// Whether this operation was applied greedily or not. + pub greedy: bool, + /// The regular expression under repetition. + pub ast: Box, +} + +/// The repetition operator itself. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct RepetitionOp { + /// The span of this operator. This includes things like `+`, `*?` and + /// `{m,n}`. + pub span: Span, + /// The type of operation. + pub kind: RepetitionKind, +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum RepetitionKind { + /// `?` + ZeroOrOne, + /// `*` + ZeroOrMore, + /// `+` + OneOrMore, + /// `{m,n}` + Range(RepetitionRange), +} + +/// A range repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum RepetitionRange { + /// `{m}` + Exactly(u32), + /// `{m,}` + AtLeast(u32), + /// `{m,n}` + Bounded(u32, u32), +} + +impl RepetitionRange { + /// Returns true if and only if this repetition range is valid. + /// + /// The only case where a repetition range is invalid is if it is bounded + /// and its start is greater than its end. + pub fn is_valid(&self) -> bool { + match *self { + RepetitionRange::Bounded(s, e) if s > e => false, + _ => true, + } + } +} + +/// A grouped regular expression. +/// +/// This includes both capturing and non-capturing groups. This does **not** +/// include flag-only groups like `(?is)`, but does contain any group that +/// contains a sub-expression, e.g., `(a)`, `(?Pa)`, `(?:a)` and +/// `(?is:a)`. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Group { + /// The span of this group. + pub span: Span, + /// The kind of this group. + pub kind: GroupKind, + /// The regular expression in this group. + pub ast: Box, +} + +impl Group { + /// If this group is non-capturing, then this returns the (possibly empty) + /// set of flags. Otherwise, `None` is returned. + pub fn flags(&self) -> Option<&Flags> { + match self.kind { + GroupKind::NonCapturing(ref flags) => Some(flags), + _ => None, + } + } + + /// Returns true if and only if this group is capturing. + pub fn is_capturing(&self) -> bool { + match self.kind { + GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true, + GroupKind::NonCapturing(_) => false, + } + } + + /// Returns the capture index of this group, if this is a capturing group. + /// + /// This returns a capture index precisely when `is_capturing` is `true`. + pub fn capture_index(&self) -> Option { + match self.kind { + GroupKind::CaptureIndex(i) => Some(i), + GroupKind::CaptureName { ref name, .. } => Some(name.index), + GroupKind::NonCapturing(_) => None, + } + } +} + +/// The kind of a group. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum GroupKind { + /// `(a)` + CaptureIndex(u32), + /// `(?a)` or `(?Pa)` + CaptureName { + /// True if the `?P<` syntax is used and false if the `?<` syntax is used. + starts_with_p: bool, + /// The capture name. + name: CaptureName, + }, + /// `(?:a)` and `(?i:a)` + NonCapturing(Flags), +} + +/// A capture name. +/// +/// This corresponds to the name itself between the angle brackets in, e.g., +/// `(?Pexpr)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CaptureName { + /// The span of this capture name. + pub span: Span, + /// The capture name. + pub name: String, + /// The capture index. + pub index: u32, +} + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for CaptureName { + fn arbitrary( + u: &mut arbitrary::Unstructured, + ) -> arbitrary::Result { + let len = u.arbitrary_len::()?; + if len == 0 { + return Err(arbitrary::Error::NotEnoughData); + } + let mut name: String = String::new(); + for _ in 0..len { + let ch: char = u.arbitrary()?; + let cp = u32::from(ch); + let ascii_letter_offset = u8::try_from(cp % 26).unwrap(); + let ascii_letter = b'a' + ascii_letter_offset; + name.push(char::from(ascii_letter)); + } + Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? }) + } + + fn size_hint(depth: usize) -> (usize, Option) { + arbitrary::size_hint::and_all(&[ + Span::size_hint(depth), + usize::size_hint(depth), + u32::size_hint(depth), + ]) + } +} + +/// A group of flags that is not applied to a particular regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct SetFlags { + /// The span of these flags, including the grouping parentheses. + pub span: Span, + /// The actual sequence of flags. + pub flags: Flags, +} + +/// A group of flags. +/// +/// This corresponds only to the sequence of flags themselves, e.g., `is-u`. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Flags { + /// The span of this group of flags. + pub span: Span, + /// A sequence of flag items. Each item is either a flag or a negation + /// operator. + pub items: Vec, +} + +impl Flags { + /// Add the given item to this sequence of flags. + /// + /// If the item was added successfully, then `None` is returned. If the + /// given item is a duplicate, then `Some(i)` is returned, where + /// `items[i].kind == item.kind`. + pub fn add_item(&mut self, item: FlagsItem) -> Option { + for (i, x) in self.items.iter().enumerate() { + if x.kind == item.kind { + return Some(i); + } + } + self.items.push(item); + None + } + + /// Returns the state of the given flag in this set. + /// + /// If the given flag is in the set but is negated, then `Some(false)` is + /// returned. + /// + /// If the given flag is in the set and is not negated, then `Some(true)` + /// is returned. + /// + /// Otherwise, `None` is returned. + pub fn flag_state(&self, flag: Flag) -> Option { + let mut negated = false; + for x in &self.items { + match x.kind { + FlagsItemKind::Negation => { + negated = true; + } + FlagsItemKind::Flag(ref xflag) if xflag == &flag => { + return Some(!negated); + } + _ => {} + } + } + None + } +} + +/// A single item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct FlagsItem { + /// The span of this item. + pub span: Span, + /// The kind of this item. + pub kind: FlagsItemKind, +} + +/// The kind of an item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum FlagsItemKind { + /// A negation operator applied to all subsequent flags in the enclosing + /// group. + Negation, + /// A single flag in a group. + Flag(Flag), +} + +impl FlagsItemKind { + /// Returns true if and only if this item is a negation operator. + pub fn is_negation(&self) -> bool { + match *self { + FlagsItemKind::Negation => true, + _ => false, + } + } +} + +/// A single flag. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum Flag { + /// `i` + CaseInsensitive, + /// `m` + MultiLine, + /// `s` + DotMatchesNewLine, + /// `U` + SwapGreed, + /// `u` + Unicode, + /// `R` + CRLF, + /// `x` + IgnoreWhitespace, +} + +/// A custom `Drop` impl is used for `Ast` such that it uses constant stack +/// space but heap space proportional to the depth of the `Ast`. +impl Drop for Ast { + fn drop(&mut self) { + use core::mem; + + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | Ast::ClassBracketed(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, + _ => {} + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_ast = || Ast::empty(empty_span()); + let mut stack = vec![mem::replace(self, empty_ast())]; + while let Some(mut ast) = stack.pop() { + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | Ast::ClassBracketed(_) => {} + Ast::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Group(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Alternation(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + Ast::Concat(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + } + } + } +} + +/// A custom `Drop` impl is used for `ClassSet` such that it uses constant +/// stack space but heap space proportional to the depth of the `ClassSet`. +impl Drop for ClassSet { + fn drop(&mut self) { + use core::mem; + + match *self { + ClassSet::Item(ref item) => match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => return, + ClassSetItem::Bracketed(ref x) => { + if x.kind.is_empty() { + return; + } + } + ClassSetItem::Union(ref x) => { + if x.items.is_empty() { + return; + } + } + }, + ClassSet::BinaryOp(ref op) => { + if op.lhs.is_empty() && op.rhs.is_empty() { + return; + } + } + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); + let mut stack = vec![mem::replace(self, empty_set())]; + while let Some(mut set) = stack.pop() { + match set { + ClassSet::Item(ref mut item) => match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => {} + ClassSetItem::Bracketed(ref mut x) => { + stack.push(mem::replace(&mut x.kind, empty_set())); + } + ClassSetItem::Union(ref mut x) => { + stack.extend(x.items.drain(..).map(ClassSet::Item)); + } + }, + ClassSet::BinaryOp(ref mut op) => { + stack.push(mem::replace(&mut op.lhs, empty_set())); + stack.push(mem::replace(&mut op.rhs, empty_set())); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We use a thread with an explicit stack size to test that our destructor + // for Ast can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let span = || Span::splat(Position::new(0, 0, 0)); + let mut ast = Ast::empty(span()); + for i in 0..200 { + ast = Ast::group(Group { + span: span(), + kind: GroupKind::CaptureIndex(i), + ast: Box::new(ast), + }); + } + assert!(!ast.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + // + // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD) + // will just barf with very small stack sizes. So we bump this up a bit + // to give more room to breath. When I did this, I confirmed that if + // I remove the custom `Drop` impl for `Ast`, then this test does + // indeed still fail with a stack overflow. (At the time of writing, I + // had to bump it all the way up to 32K before the test would pass even + // without the custom `Drop` impl. So 16K seems like a safe number + // here.) + // + // See: https://github.com/rust-lang/regex/issues/967 + thread::Builder::new() + .stack_size(16 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. + #[test] + fn ast_size() { + let max = 2 * core::mem::size_of::(); + let size = core::mem::size_of::(); + assert!( + size <= max, + "Ast size of {} bytes is bigger than suggested max {}", + size, + max + ); + } +} diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs new file mode 100644 index 0000000..593b14f --- /dev/null +++ b/vendor/regex-syntax/src/ast/parse.rs @@ -0,0 +1,6326 @@ +/*! +This module provides a regular expression parser. +*/ + +use core::{ + borrow::Borrow, + cell::{Cell, RefCell}, + mem, +}; + +use alloc::{ + boxed::Box, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::{self, Ast, Position, Span}, + either::Either, + is_escapeable_character, is_meta_character, +}; + +type Result = core::result::Result; + +/// A primitive is an expression with no sub-expressions. This includes +/// literals, assertions and non-set character classes. This representation +/// is used as intermediate state in the parser. +/// +/// This does not include ASCII character classes, since they can only appear +/// within a set character class. +#[derive(Clone, Debug, Eq, PartialEq)] +enum Primitive { + Literal(ast::Literal), + Assertion(ast::Assertion), + Dot(Span), + Perl(ast::ClassPerl), + Unicode(ast::ClassUnicode), +} + +impl Primitive { + /// Return the span of this primitive. + fn span(&self) -> &Span { + match *self { + Primitive::Literal(ref x) => &x.span, + Primitive::Assertion(ref x) => &x.span, + Primitive::Dot(ref span) => span, + Primitive::Perl(ref x) => &x.span, + Primitive::Unicode(ref x) => &x.span, + } + } + + /// Convert this primitive into a proper AST. + fn into_ast(self) -> Ast { + match self { + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class_perl(cls), + Primitive::Unicode(cls) => Ast::class_unicode(cls), + } + } + + /// Convert this primitive into an item in a character class. + /// + /// If this primitive is not a legal item (i.e., an assertion or a dot), + /// then return an error. + fn into_class_set_item>( + self, + p: &ParserI<'_, P>, + ) -> Result { + use self::Primitive::*; + use crate::ast::ClassSetItem; + + match self { + Literal(lit) => Ok(ClassSetItem::Literal(lit)), + Perl(cls) => Ok(ClassSetItem::Perl(cls)), + Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), + } + } + + /// Convert this primitive into a literal in a character class. In + /// particular, literals are the only valid items that can appear in + /// ranges. + /// + /// If this primitive is not a legal item (i.e., a class, assertion or a + /// dot), then return an error. + fn into_class_literal>( + self, + p: &ParserI<'_, P>, + ) -> Result { + use self::Primitive::*; + + match self { + Literal(lit) => Ok(lit), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), + } + } +} + +/// Returns true if the given character is a hexadecimal digit. +fn is_hex(c: char) -> bool { + ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') +} + +/// Returns true if the given character is a valid in a capture group name. +/// +/// If `first` is true, then `c` is treated as the first character in the +/// group name (which must be alphabetic or underscore). +fn is_capture_char(c: char, first: bool) -> bool { + if first { + c == '_' || c.is_alphabetic() + } else { + c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() + } +} + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +#[derive(Clone, Debug)] +pub struct ParserBuilder { + ignore_whitespace: bool, + nest_limit: u32, + octal: bool, +} + +impl Default for ParserBuilder { + fn default() -> ParserBuilder { + ParserBuilder::new() + } +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder { + ignore_whitespace: false, + nest_limit: 250, + octal: false, + } + } + + /// Build a parser from this configuration with the given pattern. + pub fn build(&self) -> Parser { + Parser { + pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), + capture_index: Cell::new(0), + nest_limit: self.nest_limit, + octal: self.octal, + initial_ignore_whitespace: self.ignore_whitespace, + ignore_whitespace: Cell::new(self.ignore_whitespace), + comments: RefCell::new(vec![]), + stack_group: RefCell::new(vec![]), + stack_class: RefCell::new(vec![]), + capture_names: RefCell::new(vec![]), + scratch: RefCell::new(String::new()), + } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.octal = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ignore_whitespace = yes; + self + } +} + +/// A regular expression parser. +/// +/// This parses a string representation of a regular expression into an +/// abstract syntax tree. The size of the tree is proportional to the length +/// of the regular expression pattern. +/// +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. +#[derive(Clone, Debug)] +pub struct Parser { + /// The current position of the parser. + pos: Cell, + /// The current capture index. + capture_index: Cell, + /// The maximum number of open parens/brackets allowed. If the parser + /// exceeds this number, then an error is returned. + nest_limit: u32, + /// Whether to support octal syntax or not. When `false`, the parser will + /// return an error helpfully pointing out that backreferences are not + /// supported. + octal: bool, + /// The initial setting for `ignore_whitespace` as provided by + /// `ParserBuilder`. It is used when resetting the parser's state. + initial_ignore_whitespace: bool, + /// Whether whitespace should be ignored. When enabled, comments are + /// also permitted. + ignore_whitespace: Cell, + /// A list of comments, in order of appearance. + comments: RefCell>, + /// A stack of grouped sub-expressions, including alternations. + stack_group: RefCell>, + /// A stack of nested character classes. This is only non-empty when + /// parsing a class. + stack_class: RefCell>, + /// A sorted sequence of capture names. This is used to detect duplicate + /// capture names and report an error if one is detected. + capture_names: RefCell>, + /// A scratch buffer used in various places. Mostly this is used to + /// accumulate relevant characters from parts of a pattern. + scratch: RefCell, +} + +/// ParserI is the internal parser implementation. +/// +/// We use this separate type so that we can carry the provided pattern string +/// along with us. In particular, a `Parser` internal state is not tied to any +/// one pattern, but `ParserI` is. +/// +/// This type also lets us use `ParserI<&Parser>` in production code while +/// retaining the convenience of `ParserI` for tests, which sometimes +/// work against the internal interface of the parser. +#[derive(Clone, Debug)] +struct ParserI<'s, P> { + /// The parser state/configuration. + parser: P, + /// The full regular expression provided by the user. + pattern: &'s str, +} + +/// GroupState represents a single stack frame while parsing nested groups +/// and alternations. Each frame records the state up to an opening parenthesis +/// or a alternating bracket `|`. +#[derive(Clone, Debug)] +enum GroupState { + /// This state is pushed whenever an opening group is found. + Group { + /// The concatenation immediately preceding the opening group. + concat: ast::Concat, + /// The group that has been opened. Its sub-AST is always empty. + group: ast::Group, + /// Whether this group has the `x` flag enabled or not. + ignore_whitespace: bool, + }, + /// This state is pushed whenever a new alternation branch is found. If + /// an alternation branch is found and this state is at the top of the + /// stack, then this state should be modified to include the new + /// alternation. + Alternation(ast::Alternation), +} + +/// ClassState represents a single stack frame while parsing character classes. +/// Each frame records the state up to an intersection, difference, symmetric +/// difference or nested class. +/// +/// Note that a parser's character class stack is only non-empty when parsing +/// a character class. In all other cases, it is empty. +#[derive(Clone, Debug)] +enum ClassState { + /// This state is pushed whenever an opening bracket is found. + Open { + /// The union of class items immediately preceding this class. + union: ast::ClassSetUnion, + /// The class that has been opened. Typically this just corresponds + /// to the `[`, but it can also include `[^` since `^` indicates + /// negation of the class. + set: ast::ClassBracketed, + }, + /// This state is pushed when a operator is seen. When popped, the stored + /// set becomes the left hand side of the operator. + Op { + /// The type of the operation, i.e., &&, -- or ~~. + kind: ast::ClassSetBinaryOpKind, + /// The left-hand side of the operator. + lhs: ast::ClassSet, + }, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with either the `parse` or `parse_with_comments` + /// methods. The parse methods return an abstract syntax tree. + /// + /// To set configuration options on the parser, use [`ParserBuilder`]. + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into an abstract syntax tree. + pub fn parse(&mut self, pattern: &str) -> Result { + ParserI::new(self, pattern).parse() + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + pub fn parse_with_comments( + &mut self, + pattern: &str, + ) -> Result { + ParserI::new(self, pattern).parse_with_comments() + } + + /// Reset the internal state of a parser. + /// + /// This is called at the beginning of every parse. This prevents the + /// parser from running with inconsistent state (say, if a previous + /// invocation returned an error and the parser is reused). + fn reset(&self) { + // These settings should be in line with the construction + // in `ParserBuilder::build`. + self.pos.set(Position { offset: 0, line: 1, column: 1 }); + self.ignore_whitespace.set(self.initial_ignore_whitespace); + self.comments.borrow_mut().clear(); + self.stack_group.borrow_mut().clear(); + self.stack_class.borrow_mut().clear(); + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Build an internal parser from a parser configuration and a pattern. + fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { + ParserI { parser, pattern } + } + + /// Return a reference to the parser state. + fn parser(&self) -> &Parser { + self.parser.borrow() + } + + /// Return a reference to the pattern being parsed. + fn pattern(&self) -> &str { + self.pattern + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { + ast::Error { kind, pattern: self.pattern().to_string(), span } + } + + /// Return the current offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn offset(&self) -> usize { + self.parser().pos.get().offset + } + + /// Return the current line number of the parser. + /// + /// The line number starts at `1`. + fn line(&self) -> usize { + self.parser().pos.get().line + } + + /// Return the current column of the parser. + /// + /// The column number starts at `1` and is reset whenever a `\n` is seen. + fn column(&self) -> usize { + self.parser().pos.get().column + } + + /// Return the next capturing index. Each subsequent call increments the + /// internal index. + /// + /// The span given should correspond to the location of the opening + /// parenthesis. + /// + /// If the capture limit is exceeded, then an error is returned. + fn next_capture_index(&self, span: Span) -> Result { + let current = self.parser().capture_index.get(); + let i = current.checked_add(1).ok_or_else(|| { + self.error(span, ast::ErrorKind::CaptureLimitExceeded) + })?; + self.parser().capture_index.set(i); + Ok(i) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { + let mut names = self.parser().capture_names.borrow_mut(); + match names + .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) + { + Err(i) => { + names.insert(i, cap.clone()); + Ok(()) + } + Ok(i) => Err(self.error( + cap.span, + ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, + )), + } + } + + /// Return whether the parser should ignore whitespace or not. + fn ignore_whitespace(&self) -> bool { + self.parser().ignore_whitespace.get() + } + + /// Return the character at the current position of the parser. + /// + /// This panics if the current position does not point to a valid char. + fn char(&self) -> char { + self.char_at(self.offset()) + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..] + .chars() + .next() + .unwrap_or_else(|| panic!("expected char at offset {}", i)) + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// If the end of the input has been reached, then `false` is returned. + fn bump(&self) -> bool { + if self.is_eof() { + return false; + } + let Position { mut offset, mut line, mut column } = self.pos(); + if self.char() == '\n' { + line = line.checked_add(1).unwrap(); + column = 1; + } else { + column = column.checked_add(1).unwrap(); + } + offset += self.char().len_utf8(); + self.parser().pos.set(Position { offset, line, column }); + self.pattern()[self.offset()..].chars().next().is_some() + } + + /// If the substring starting at the current position of the parser has + /// the given prefix, then bump the parser to the character immediately + /// following the prefix and return true. Otherwise, don't bump the parser + /// and return false. + fn bump_if(&self, prefix: &str) -> bool { + if self.pattern()[self.offset()..].starts_with(prefix) { + for _ in 0..prefix.chars().count() { + self.bump(); + } + true + } else { + false + } + } + + /// Returns true if and only if the parser is positioned at a look-around + /// prefix. The conditions under which this returns true must always + /// correspond to a regular expression that would otherwise be consider + /// invalid. + /// + /// This should only be called immediately after parsing the opening of + /// a group or a set of flags. + fn is_lookaround_prefix(&self) -> bool { + self.bump_if("?=") + || self.bump_if("?!") + || self.bump_if("?<=") + || self.bump_if("? bool { + if !self.bump() { + return false; + } + self.bump_space(); + !self.is_eof() + } + + /// If the `x` flag is enabled (i.e., whitespace insensitivity with + /// comments), then this will advance the parser through all whitespace + /// and comments to the next non-whitespace non-comment byte. + /// + /// If the `x` flag is disabled, then this is a no-op. + /// + /// This should be used selectively throughout the parser where + /// arbitrary whitespace is permitted when the `x` flag is enabled. For + /// example, `{ 5 , 6}` is equivalent to `{5,6}`. + fn bump_space(&self) { + if !self.ignore_whitespace() { + return; + } + while !self.is_eof() { + if self.char().is_whitespace() { + self.bump(); + } else if self.char() == '#' { + let start = self.pos(); + let mut comment_text = String::new(); + self.bump(); + while !self.is_eof() { + let c = self.char(); + self.bump(); + if c == '\n' { + break; + } + comment_text.push(c); + } + let comment = ast::Comment { + span: Span::new(start, self.pos()), + comment: comment_text, + }; + self.parser().comments.borrow_mut().push(comment); + } else { + break; + } + } + } + + /// Peek at the next character in the input without advancing the parser. + /// + /// If the input has been exhausted, then this returns `None`. + fn peek(&self) -> Option { + if self.is_eof() { + return None; + } + self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() + } + + /// Like peek, but will ignore spaces when the parser is in whitespace + /// insensitive mode. + fn peek_space(&self) -> Option { + if !self.ignore_whitespace() { + return self.peek(); + } + if self.is_eof() { + return None; + } + let mut start = self.offset() + self.char().len_utf8(); + let mut in_comment = false; + for (i, c) in self.pattern()[start..].char_indices() { + if c.is_whitespace() { + continue; + } else if !in_comment && c == '#' { + in_comment = true; + } else if in_comment && c == '\n' { + in_comment = false; + } else { + start += i; + break; + } + } + self.pattern()[start..].chars().next() + } + + /// Returns true if the next call to `bump` would return false. + fn is_eof(&self) -> bool { + self.offset() == self.pattern().len() + } + + /// Return the current position of the parser, which includes the offset, + /// line and column. + fn pos(&self) -> Position { + self.parser().pos.get() + } + + /// Create a span at the current position of the parser. Both the start + /// and end of the span are set. + fn span(&self) -> Span { + Span::splat(self.pos()) + } + + /// Create a span that covers the current character. + fn span_char(&self) -> Span { + let mut next = Position { + offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), + line: self.line(), + column: self.column().checked_add(1).unwrap(), + }; + if self.char() == '\n' { + next.line += 1; + next.column = 1; + } + Span::new(self.pos(), next) + } + + /// Parse and push a single alternation on to the parser's internal stack. + /// If the top of the stack already has an alternation, then add to that + /// instead of pushing a new one. + /// + /// The concatenation given corresponds to a single alternation branch. + /// The concatenation returned starts the next branch and is empty. + /// + /// This assumes the parser is currently positioned at `|` and will advance + /// the parser to the character following `|`. + #[inline(never)] + fn push_alternate(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '|'); + concat.span.end = self.pos(); + self.push_or_add_alternation(concat); + self.bump(); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + + /// Pushes or adds the given branch of an alternation to the parser's + /// internal stack of state. + fn push_or_add_alternation(&self, concat: ast::Concat) { + use self::GroupState::*; + + let mut stack = self.parser().stack_group.borrow_mut(); + if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { + alts.asts.push(concat.into_ast()); + return; + } + stack.push(Alternation(ast::Alternation { + span: Span::new(concat.span.start, self.pos()), + asts: vec![concat.into_ast()], + })); + } + + /// Parse and push a group AST (and its parent concatenation) on to the + /// parser's internal stack. Return a fresh concatenation corresponding + /// to the group's sub-AST. + /// + /// If a set of flags was found (with no group), then the concatenation + /// is returned with that set of flags added. + /// + /// This assumes that the parser is currently positioned on the opening + /// parenthesis. It advances the parser to the character at the start + /// of the sub-expression (or adjoining expression). + /// + /// If there was a problem parsing the start of the group, then an error + /// is returned. + #[inline(never)] + fn push_group(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '('); + match self.parse_group()? { + Either::Left(set) => { + let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); + if let Some(v) = ignore { + self.parser().ignore_whitespace.set(v); + } + + concat.asts.push(Ast::flags(set)); + Ok(concat) + } + Either::Right(group) => { + let old_ignore_whitespace = self.ignore_whitespace(); + let new_ignore_whitespace = group + .flags() + .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) + .unwrap_or(old_ignore_whitespace); + self.parser().stack_group.borrow_mut().push( + GroupState::Group { + concat, + group, + ignore_whitespace: old_ignore_whitespace, + }, + ); + self.parser().ignore_whitespace.set(new_ignore_whitespace); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + } + } + + /// Pop a group AST from the parser's internal stack and set the group's + /// AST to the given concatenation. Return the concatenation containing + /// the group. + /// + /// This assumes that the parser is currently positioned on the closing + /// parenthesis and advances the parser to the character following the `)`. + /// + /// If no such group could be popped, then an unopened group error is + /// returned. + #[inline(never)] + fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + use self::GroupState::*; + + assert_eq!(self.char(), ')'); + let mut stack = self.parser().stack_group.borrow_mut(); + let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack + .pop() + { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, None) + } + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, Some(alt)) + } + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { + return Err(self + .error(self.span_char(), ast::ErrorKind::GroupUnopened)); + } + }; + self.parser().ignore_whitespace.set(ignore_whitespace); + group_concat.span.end = self.pos(); + self.bump(); + group.span.end = self.pos(); + match alt { + Some(mut alt) => { + alt.span.end = group_concat.span.end; + alt.asts.push(group_concat.into_ast()); + group.ast = Box::new(alt.into_ast()); + } + None => { + group.ast = Box::new(group_concat.into_ast()); + } + } + prior_concat.asts.push(Ast::group(group)); + Ok(prior_concat) + } + + /// Pop the last state from the parser's internal stack, if it exists, and + /// add the given concatenation to it. There either must be no state or a + /// single alternation item on the stack. Any other scenario produces an + /// error. + /// + /// This assumes that the parser has advanced to the end. + #[inline(never)] + fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + concat.span.end = self.pos(); + let mut stack = self.parser().stack_group.borrow_mut(); + let ast = match stack.pop() { + None => Ok(concat.into_ast()), + Some(GroupState::Alternation(mut alt)) => { + alt.span.end = self.pos(); + alt.asts.push(concat.into_ast()); + Ok(Ast::alternation(alt)) + } + Some(GroupState::Group { group, .. }) => { + return Err( + self.error(group.span, ast::ErrorKind::GroupUnclosed) + ); + } + }; + // If we try to pop again, there should be nothing. + match stack.pop() { + None => ast, + Some(GroupState::Alternation(_)) => { + // This unreachable is unfortunate. This case can't happen + // because the only way we can be here is if there were two + // `GroupState::Alternation`s adjacent in the parser's stack, + // which we guarantee to never happen because we never push a + // `GroupState::Alternation` if one is already at the top of + // the stack. + unreachable!() + } + Some(GroupState::Group { group, .. }) => { + Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) + } + } + } + + /// Parse the opening of a character class and push the current class + /// parsing context onto the parser's stack. This assumes that the parser + /// is positioned at an opening `[`. The given union should correspond to + /// the union of set items built up before seeing the `[`. + /// + /// If there was a problem parsing the opening of the class, then an error + /// is returned. Otherwise, a new union of set items for the class is + /// returned (which may be populated with either a `]` or a `-`). + #[inline(never)] + fn push_class_open( + &self, + parent_union: ast::ClassSetUnion, + ) -> Result { + assert_eq!(self.char(), '['); + + let (nested_set, nested_union) = self.parse_set_class_open()?; + self.parser() + .stack_class + .borrow_mut() + .push(ClassState::Open { union: parent_union, set: nested_set }); + Ok(nested_union) + } + + /// Parse the end of a character class set and pop the character class + /// parser stack. The union given corresponds to the last union built + /// before seeing the closing `]`. The union returned corresponds to the + /// parent character class set with the nested class added to it. + /// + /// This assumes that the parser is positioned at a `]` and will advance + /// the parser to the byte immediately following the `]`. + /// + /// If the stack is empty after popping, then this returns the final + /// "top-level" character class AST (where a "top-level" character class + /// is one that is not nested inside any other character class). + /// + /// If there is no corresponding opening bracket on the parser's stack, + /// then an error is returned. + #[inline(never)] + fn pop_class( + &self, + nested_union: ast::ClassSetUnion, + ) -> Result> { + assert_eq!(self.char(), ']'); + + let item = ast::ClassSet::Item(nested_union.into_item()); + let prevset = self.pop_class_op(item); + let mut stack = self.parser().stack_class.borrow_mut(); + match stack.pop() { + None => { + // We can never observe an empty stack: + // + // 1) We are guaranteed to start with a non-empty stack since + // the character class parser is only initiated when it sees + // a `[`. + // 2) If we ever observe an empty stack while popping after + // seeing a `]`, then we signal the character class parser + // to terminate. + panic!("unexpected empty character class stack") + } + Some(ClassState::Op { .. }) => { + // This panic is unfortunate, but this case is impossible + // since we already popped the Op state if one exists above. + // Namely, every push to the class parser stack is guarded by + // whether an existing Op is already on the top of the stack. + // If it is, the existing Op is modified. That is, the stack + // can never have consecutive Op states. + panic!("unexpected ClassState::Op") + } + Some(ClassState::Open { mut union, mut set }) => { + self.bump(); + set.span.end = self.pos(); + set.kind = prevset; + if stack.is_empty() { + Ok(Either::Right(set)) + } else { + union.push(ast::ClassSetItem::Bracketed(Box::new(set))); + Ok(Either::Left(union)) + } + } + } + } + + /// Return an "unclosed class" error whose span points to the most + /// recently opened class. + /// + /// This should only be called while parsing a character class. + #[inline(never)] + fn unclosed_class_error(&self) -> ast::Error { + for state in self.parser().stack_class.borrow().iter().rev() { + if let ClassState::Open { ref set, .. } = *state { + return self.error(set.span, ast::ErrorKind::ClassUnclosed); + } + } + // We are guaranteed to have a non-empty stack with at least + // one open bracket, so we should never get here. + panic!("no open character class found") + } + + /// Push the current set of class items on to the class parser's stack as + /// the left hand side of the given operator. + /// + /// A fresh set union is returned, which should be used to build the right + /// hand side of this operator. + #[inline(never)] + fn push_class_op( + &self, + next_kind: ast::ClassSetBinaryOpKind, + next_union: ast::ClassSetUnion, + ) -> ast::ClassSetUnion { + let item = ast::ClassSet::Item(next_union.into_item()); + let new_lhs = self.pop_class_op(item); + self.parser() + .stack_class + .borrow_mut() + .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); + ast::ClassSetUnion { span: self.span(), items: vec![] } + } + + /// Pop a character class set from the character class parser stack. If the + /// top of the stack is just an item (not an operation), then return the + /// given set unchanged. If the top of the stack is an operation, then the + /// given set will be used as the rhs of the operation on the top of the + /// stack. In that case, the binary operation is returned as a set. + #[inline(never)] + fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { + let mut stack = self.parser().stack_class.borrow_mut(); + let (kind, lhs) = match stack.pop() { + Some(ClassState::Op { kind, lhs }) => (kind, lhs), + Some(state @ ClassState::Open { .. }) => { + stack.push(state); + return rhs; + } + None => unreachable!(), + }; + let span = Span::new(lhs.span().start, rhs.span().end); + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Parse the regular expression into an abstract syntax tree. + fn parse(&self) -> Result { + self.parse_with_comments().map(|astc| astc.ast) + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + fn parse_with_comments(&self) -> Result { + assert_eq!(self.offset(), 0, "parser can only be used once"); + self.parser().reset(); + let mut concat = ast::Concat { span: self.span(), asts: vec![] }; + loop { + self.bump_space(); + if self.is_eof() { + break; + } + match self.char() { + '(' => concat = self.push_group(concat)?, + ')' => concat = self.pop_group(concat)?, + '|' => concat = self.push_alternate(concat)?, + '[' => { + let class = self.parse_set_class()?; + concat.asts.push(Ast::class_bracketed(class)); + } + '?' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::ZeroOrOne, + )?; + } + '*' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::ZeroOrMore, + )?; + } + '+' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::OneOrMore, + )?; + } + '{' => { + concat = self.parse_counted_repetition(concat)?; + } + _ => concat.asts.push(self.parse_primitive()?.into_ast()), + } + } + let ast = self.pop_group_end(concat)?; + NestLimiter::new(self).check(&ast)?; + Ok(ast::WithComments { + ast, + comments: mem::replace( + &mut *self.parser().comments.borrow_mut(), + vec![], + ), + }) + } + + /// Parses an uncounted repetition operation. An uncounted repetition + /// operator includes ?, * and +, but does not include the {m,n} syntax. + /// The given `kind` should correspond to the operator observed by the + /// caller. + /// + /// This assumes that the parser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + #[inline(never)] + fn parse_uncounted_repetition( + &self, + mut concat: ast::Concat, + kind: ast::RepetitionKind, + ) -> Result { + assert!( + self.char() == '?' || self.char() == '*' || self.char() == '+' + ); + let op_start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + }; + match ast { + Ast::Empty(_) | Ast::Flags(_) => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + _ => {} + } + let mut greedy = true; + if self.bump() && self.char() == '?' { + greedy = false; + self.bump(); + } + concat.asts.push(Ast::repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: Span::new(op_start, self.pos()), + kind, + }, + greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the {m,n} syntax, and does not include the ?, * or + + /// operators. + /// + /// This assumes that the parser is currently positioned at the opening `{` + /// and advances the parser to the first character after the operator. + /// (Note that the operator may include a single additional `?`, which + /// makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + #[inline(never)] + fn parse_counted_repetition( + &self, + mut concat: ast::Concat, + ) -> Result { + assert!(self.char() == '{'); + let start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + }; + match ast { + Ast::Empty(_) | Ast::Flags(_) => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + _ => {} + } + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + let count_start = specialize_err( + self.parse_decimal(), + ast::ErrorKind::DecimalEmpty, + ast::ErrorKind::RepetitionCountDecimalEmpty, + )?; + let mut range = ast::RepetitionRange::Exactly(count_start); + if self.is_eof() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() != '}' { + let count_end = specialize_err( + self.parse_decimal(), + ast::ErrorKind::DecimalEmpty, + ast::ErrorKind::RepetitionCountDecimalEmpty, + )?; + range = ast::RepetitionRange::Bounded(count_start, count_end); + } else { + range = ast::RepetitionRange::AtLeast(count_start); + } + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' { + greedy = false; + self.bump(); + } + + let op_span = Span::new(start, self.pos()); + if !range.is_valid() { + return Err( + self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) + ); + } + concat.asts.push(Ast::repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: op_span, + kind: ast::RepetitionKind::Range(range), + }, + greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parse a group (which contains a sub-expression) or a set of flags. + /// + /// If a group was found, then it is returned with an empty AST. If a set + /// of flags is found, then that set is returned. + /// + /// The parser should be positioned at the opening parenthesis. + /// + /// This advances the parser to the character before the start of the + /// sub-expression (in the case of a group) or to the closing parenthesis + /// immediately following the set of flags. + /// + /// # Errors + /// + /// If flags are given and incorrectly specified, then a corresponding + /// error is returned. + /// + /// If a capture name is given and it is incorrectly specified, then a + /// corresponding error is returned. + #[inline(never)] + fn parse_group(&self) -> Result> { + assert_eq!(self.char(), '('); + let open_span = self.span_char(); + self.bump(); + self.bump_space(); + if self.is_lookaround_prefix() { + return Err(self.error( + Span::new(open_span.start, self.span().end), + ast::ErrorKind::UnsupportedLookAround, + )); + } + let inner_span = self.span(); + let mut starts_with_p = true; + if self.bump_if("?P<") || { + starts_with_p = false; + self.bump_if("?<") + } { + let capture_index = self.next_capture_index(open_span)?; + let name = self.parse_capture_name(capture_index)?; + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureName { starts_with_p, name }, + ast: Box::new(Ast::empty(self.span())), + })) + } else if self.bump_if("?") { + if self.is_eof() { + return Err( + self.error(open_span, ast::ErrorKind::GroupUnclosed) + ); + } + let flags = self.parse_flags()?; + let char_end = self.char(); + self.bump(); + if char_end == ')' { + // We don't allow empty flags, e.g., `(?)`. We instead + // interpret it as a repetition operator missing its argument. + if flags.items.is_empty() { + return Err(self.error( + inner_span, + ast::ErrorKind::RepetitionMissing, + )); + } + Ok(Either::Left(ast::SetFlags { + span: Span { end: self.pos(), ..open_span }, + flags, + })) + } else { + assert_eq!(char_end, ':'); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::NonCapturing(flags), + ast: Box::new(Ast::empty(self.span())), + })) + } + } else { + let capture_index = self.next_capture_index(open_span)?; + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureIndex(capture_index), + ast: Box::new(Ast::empty(self.span())), + })) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. + /// + /// The caller must provide the capture index of the group for this name. + #[inline(never)] + fn parse_capture_name( + &self, + capture_index: u32, + ) -> Result { + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupNameInvalid, + )); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start.offset..end.offset]; + if name.is_empty() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::GroupNameEmpty, + )); + } + let capname = ast::CaptureName { + span: Span::new(start, end), + name: name.to_string(), + index: capture_index, + }; + self.add_capture_name(&capname)?; + Ok(capname) + } + + /// Parse a sequence of flags starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. + #[inline(never)] + fn parse_flags(&self) -> Result { + let mut flags = ast::Flags { span: self.span(), items: vec![] }; + let mut last_was_negation = None; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = Some(self.span_char()); + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Negation, + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagRepeatedNegation { + original: flags.items[i].span, + }, + )); + } + } else { + last_was_negation = None; + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Flag(self.parse_flag()?), + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagDuplicate { + original: flags.items[i].span, + }, + )); + } + } + if !self.bump() { + return Err( + self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) + ); + } + } + if let Some(span) = last_was_negation { + return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); + } + flags.span.end = self.pos(); + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. + #[inline(never)] + fn parse_flag(&self) -> Result { + match self.char() { + 'i' => Ok(ast::Flag::CaseInsensitive), + 'm' => Ok(ast::Flag::MultiLine), + 's' => Ok(ast::Flag::DotMatchesNewLine), + 'U' => Ok(ast::Flag::SwapGreed), + 'u' => Ok(ast::Flag::Unicode), + 'R' => Ok(ast::Flag::CRLF), + 'x' => Ok(ast::Flag::IgnoreWhitespace), + _ => { + Err(self + .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) + } + } + } + + /// Parse a primitive AST. e.g., A literal, non-set character class or + /// assertion. + /// + /// This assumes that the parser expects a primitive at the current + /// location. i.e., All other non-primitive cases have been handled. + /// For example, if the parser's position is at `|`, then `|` will be + /// treated as a literal (e.g., inside a character class). + /// + /// This advances the parser to the first character immediately following + /// the primitive. + fn parse_primitive(&self) -> Result { + match self.char() { + '\\' => self.parse_escape(), + '.' => { + let ast = Primitive::Dot(self.span_char()); + self.bump(); + Ok(ast) + } + '^' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::StartLine, + }); + self.bump(); + Ok(ast) + } + '$' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::EndLine, + }); + self.bump(); + Ok(ast) + } + c => { + let ast = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c, + }); + self.bump(); + Ok(ast) + } + } + } + + /// Parse an escape sequence as a primitive AST. + /// + /// This assumes the parser is positioned at the start of the escape + /// sequence, i.e., `\`. It advances the parser to the first position + /// immediately following the escape sequence. + #[inline(never)] + fn parse_escape(&self) -> Result { + assert_eq!(self.char(), '\\'); + let start = self.pos(); + if !self.bump() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let c = self.char(); + // Put some of the more complicated routines into helpers. + match c { + '0'..='7' => { + if !self.parser().octal { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + let mut lit = self.parse_octal(); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + '8'..='9' if !self.parser().octal => { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + 'x' | 'u' | 'U' => { + let mut lit = self.parse_hex()?; + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + 'p' | 'P' => { + let mut cls = self.parse_unicode_class()?; + cls.span.start = start; + return Ok(Primitive::Unicode(cls)); + } + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + let mut cls = self.parse_perl_class(); + cls.span.start = start; + return Ok(Primitive::Perl(cls)); + } + _ => {} + } + + // Handle all of the one letter sequences inline. + self.bump(); + let span = Span::new(start, self.pos()); + if is_meta_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Meta, + c, + })); + } + if is_escapeable_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Superfluous, + c, + })); + } + let special = |kind, c| { + Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Special(kind), + c, + })) + }; + match c { + 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), + 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), + 't' => special(ast::SpecialLiteralKind::Tab, '\t'), + 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), + 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), + 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), + 'A' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::StartText, + })), + 'z' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::EndText, + })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } + 'B' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::NotWordBoundary, + })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), + _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), + } + } + + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. + fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + + /// Parse an octal representation of a Unicode codepoint up to 3 digits + /// long. This expects the parser to be positioned at the first octal + /// digit and advances the parser to the first character immediately + /// following the octal number. This also assumes that parsing octal + /// escapes is enabled. + /// + /// Assuming the preconditions are met, this routine can never fail. + #[inline(never)] + fn parse_octal(&self) -> ast::Literal { + assert!(self.parser().octal); + assert!('0' <= self.char() && self.char() <= '7'); + let start = self.pos(); + // Parse up to two more digits. + while self.bump() + && '0' <= self.char() + && self.char() <= '7' + && self.pos().offset - start.offset <= 2 + {} + let end = self.pos(); + let octal = &self.pattern()[start.offset..end.offset]; + // Parsing the octal should never fail since the above guarantees a + // valid number. + let codepoint = + u32::from_str_radix(octal, 8).expect("valid octal number"); + // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no + // invalid Unicode scalar values. + let c = char::from_u32(codepoint).expect("Unicode scalar value"); + ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::Octal, + c, + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. + #[inline(never)] + fn parse_hex(&self) -> Result { + assert!( + self.char() == 'x' || self.char() == 'u' || self.char() == 'U' + ); + + let hex_kind = match self.char() { + 'x' => ast::HexLiteralKind::X, + 'u' => ast::HexLiteralKind::UnicodeShort, + _ => ast::HexLiteralKind::UnicodeLong, + }; + if !self.bump_and_bump_space() { + return Err( + self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) + ); + } + if self.char() == '{' { + self.parse_hex_brace(hex_kind) + } else { + self.parse_hex_digits(hex_kind) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). + #[inline(never)] + fn parse_hex_digits( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let start = self.pos(); + for i in 0..kind.digits() { + if i > 0 && !self.bump_and_bump_space() { + return Err(self + .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); + } + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + let end = self.pos(); + let hex = scratch.as_str(); + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::HexFixed(kind), + c, + }), + } + } + + /// Parse a hex representation of any Unicode scalar value. This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. + #[inline(never)] + fn parse_hex_brace( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let brace_pos = self.pos(); + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let end = self.pos(); + let hex = scratch.as_str(); + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if hex.is_empty() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeHexEmpty, + )); + } + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, self.pos()), + kind: ast::LiteralKind::HexBrace(kind), + c, + }), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. + fn parse_decimal(&self) -> Result { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + while !self.is_eof() && self.char().is_whitespace() { + self.bump(); + } + let start = self.pos(); + while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + let span = Span::new(start, self.pos()); + while !self.is_eof() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); + } + match u32::from_str_radix(digits, 10).ok() { + Some(n) => Ok(n), + None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), + } + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges, but can also contain nested character classes of + /// any type (sans `.`). + /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. + #[inline(never)] + fn parse_set_class(&self) -> Result { + assert_eq!(self.char(), '['); + + let mut union = + ast::ClassSetUnion { span: self.span(), items: vec![] }; + loop { + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + match self.char() { + '[' => { + // If we've already parsed the opening bracket, then + // attempt to treat this as the beginning of an ASCII + // class. If ASCII class parsing fails, then the parser + // backs up to `[`. + if !self.parser().stack_class.borrow().is_empty() { + if let Some(cls) = self.maybe_parse_ascii_class() { + union.push(ast::ClassSetItem::Ascii(cls)); + continue; + } + } + union = self.push_class_open(union)?; + } + ']' => match self.pop_class(union)? { + Either::Left(nested_union) => { + union = nested_union; + } + Either::Right(class) => return Ok(class), + }, + '&' if self.peek() == Some('&') => { + assert!(self.bump_if("&&")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Intersection, + union, + ); + } + '-' if self.peek() == Some('-') => { + assert!(self.bump_if("--")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Difference, + union, + ); + } + '~' if self.peek() == Some('~') => { + assert!(self.bump_if("~~")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::SymmetricDifference, + union, + ); + } + _ => { + union.push(self.parse_set_class_range()?); + } + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like \w or \p{Greek}. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + #[inline(never)] + fn parse_set_class_range(&self) -> Result { + let prim1 = self.parse_set_class_item()?; + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. + if self.char() != '-' + || self.peek_space() == Some(']') + || self.peek_space() == Some('-') + { + return prim1.into_class_set_item(self); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. + if !self.bump_and_bump_space() { + return Err(self.unclosed_class_error()); + } + let prim2 = self.parse_set_class_item()?; + let range = ast::ClassSetRange { + span: Span::new(prim1.span().start, prim2.span().end), + start: prim1.into_class_literal(self)?, + end: prim2.into_class_literal(self)?, + }; + if !range.is_valid() { + return Err( + self.error(range.span, ast::ErrorKind::ClassRangeInvalid) + ); + } + Ok(ast::ClassSetItem::Range(range)) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + #[inline(never)] + fn parse_set_class_item(&self) -> Result { + if self.char() == '\\' { + self.parse_escape() + } else { + let x = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: self.char(), + }); + self.bump(); + Ok(x) + } + } + + /// Parses the opening of a character class set. This includes the opening + /// bracket along with `^` if present to indicate negation. This also + /// starts parsing the opening set of unioned items if applicable, since + /// there are special rules applied to certain characters in the opening + /// of a character class. For example, `[^]]` is the class of all + /// characters not equal to `]`. (`]` would need to be escaped in any other + /// position.) Similarly for `-`. + /// + /// In all cases, the op inside the returned `ast::ClassBracketed` is an + /// empty union. This empty union should be replaced with the actual item + /// when it is popped from the parser's stack. + /// + /// This assumes the parser is positioned at the opening `[` and advances + /// the parser to the first non-special byte of the character class. + /// + /// An error is returned if EOF is found. + #[inline(never)] + fn parse_set_class_open( + &self, + ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { + assert_eq!(self.char(), '['); + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + + let negated = if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + true + }; + // Accept any number of `-` as literal `-`. + let mut union = + ast::ClassSetUnion { span: self.span(), items: vec![] }; + while self.char() == '-' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: '-', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::ClassUnclosed, + )); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. + if union.items.is_empty() && self.char() == ']' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: ']', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + let set = ast::ClassBracketed { + span: Span::new(start, self.pos()), + negated, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: Span::new(union.span.start, union.span.start), + items: vec![], + }), + }; + Ok((set, union)) + } + + /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. + /// + /// If no valid ASCII character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding ASCII class is returned. + #[inline(never)] + fn maybe_parse_ascii_class(&self) -> Option { + // ASCII character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an ASCII character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. + // + // However, if one types an incorrect ASCII character class, e.g., + // `[[:loower:]]`, then we treat that as a normal nested character + // class containing the characters `:elorw`. One might argue that we + // should return an error instead since the repeated colons give away + // the intent to write an ASCII class. But what if the user typed + // `[[:lower]]` instead? How can we tell that was intended to be an + // ASCII class and not just a normal nested class? + // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense + // of better failure modes. + assert_eq!(self.char(), '['); + // If parsing fails, then we back up the parser to this starting point. + let start = self.pos(); + let mut negated = false; + if !self.bump() || self.char() != ':' { + self.parser().pos.set(start); + return None; + } + if !self.bump() { + self.parser().pos.set(start); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + self.parser().pos.set(start); + return None; + } + } + let name_start = self.offset(); + while self.char() != ':' && self.bump() {} + if self.is_eof() { + self.parser().pos.set(start); + return None; + } + let name = &self.pattern()[name_start..self.offset()]; + if !self.bump_if(":]") { + self.parser().pos.set(start); + return None; + } + let kind = match ast::ClassAsciiKind::from_name(name) { + Some(kind) => kind, + None => { + self.parser().pos.set(start); + return None; + } + }; + Some(ast::ClassAscii { + span: Span::new(start, self.pos()), + kind, + negated, + }) + } + + /// Parse a Unicode class in either the single character notation, `\pN` + /// or the multi-character bracketed notation, `\p{Greek}`. This assumes + /// the parser is positioned at the `p` (or `P` for negation) and will + /// advance the parser to the character immediately following the class. + /// + /// Note that this does not check whether the class name is valid or not. + #[inline(never)] + fn parse_unicode_class(&self) -> Result { + assert!(self.char() == 'p' || self.char() == 'P'); + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let negated = self.char() == 'P'; + if !self.bump_and_bump_space() { + return Err( + self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) + ); + } + let (start, kind) = if self.char() == '{' { + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); + } + assert_eq!(self.char(), '}'); + self.bump(); + + let name = scratch.as_str(); + if let Some(i) = name.find("!=") { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: name[..i].to_string(), + value: name[i + 2..].to_string(), + }, + ) + } else if let Some(i) = name.find(':') { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: name[..i].to_string(), + value: name[i + 1..].to_string(), + }, + ) + } else if let Some(i) = name.find('=') { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: name[..i].to_string(), + value: name[i + 1..].to_string(), + }, + ) + } else { + (start, ast::ClassUnicodeKind::Named(name.to_string())) + } + } else { + let start = self.pos(); + let c = self.char(); + if c == '\\' { + return Err(self.error( + self.span_char(), + ast::ErrorKind::UnicodeClassInvalid, + )); + } + self.bump_and_bump_space(); + let kind = ast::ClassUnicodeKind::OneLetter(c); + (start, kind) + }; + Ok(ast::ClassUnicode { + span: Span::new(start, self.pos()), + negated, + kind, + }) + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the + /// parser is currently at a valid character class name and will be + /// advanced to the character immediately following the class. + #[inline(never)] + fn parse_perl_class(&self) -> ast::ClassPerl { + let c = self.char(); + let span = self.span_char(); + self.bump(); + let (negated, kind) = match c { + 'd' => (false, ast::ClassPerlKind::Digit), + 'D' => (true, ast::ClassPerlKind::Digit), + 's' => (false, ast::ClassPerlKind::Space), + 'S' => (true, ast::ClassPerlKind::Space), + 'w' => (false, ast::ClassPerlKind::Word), + 'W' => (true, ast::ClassPerlKind::Word), + c => panic!("expected valid Perl class but got '{}'", c), + }; + ast::ClassPerl { span, kind, negated } + } +} + +/// A type that traverses a fully parsed Ast and checks whether its depth +/// exceeds the specified nesting limit. If it does, then an error is returned. +#[derive(Debug)] +struct NestLimiter<'p, 's, P> { + /// The parser that is checking the nest limit. + p: &'p ParserI<'s, P>, + /// The current depth while walking an Ast. + depth: u32, +} + +impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { + fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { + NestLimiter { p, depth: 0 } + } + + #[inline(never)] + fn check(self, ast: &Ast) -> Result<()> { + ast::visit(ast, self) + } + + fn increment_depth(&mut self, span: &Span) -> Result<()> { + let new = self.depth.checked_add(1).ok_or_else(|| { + self.p.error( + span.clone(), + ast::ErrorKind::NestLimitExceeded(u32::MAX), + ) + })?; + let limit = self.p.parser().nest_limit; + if new > limit { + return Err(self.p.error( + span.clone(), + ast::ErrorKind::NestLimitExceeded(limit), + )); + } + self.depth = new; + Ok(()) + } + + fn decrement_depth(&mut self) { + // Assuming the correctness of the visitor, this should never drop + // below 0. + self.depth = self.depth.checked_sub(1).unwrap(); + } +} + +impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { + type Output = (); + type Err = ast::Error; + + fn finish(self) -> Result<()> { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + let span = match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { + // These are all base cases, so we don't increment depth. + return Ok(()); + } + Ast::ClassBracketed(ref x) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + }; + self.increment_depth(span) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { + // These are all base cases, so we don't decrement depth. + Ok(()) + } + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => { + self.decrement_depth(); + Ok(()) + } + } + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + let span = match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Literal(_) + | ast::ClassSetItem::Range(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => { + // These are all base cases, so we don't increment depth. + return Ok(()); + } + ast::ClassSetItem::Bracketed(ref x) => &x.span, + ast::ClassSetItem::Union(ref x) => &x.span, + }; + self.increment_depth(span) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Literal(_) + | ast::ClassSetItem::Range(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => { + // These are all base cases, so we don't decrement depth. + Ok(()) + } + ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => { + self.decrement_depth(); + Ok(()) + } + } + } + + fn visit_class_set_binary_op_pre( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<()> { + self.increment_depth(&ast.span) + } + + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<()> { + self.decrement_depth(); + Ok(()) + } +} + +/// When the result is an error, transforms the ast::ErrorKind from the source +/// Result into another one. This function is used to return clearer error +/// messages when possible. +fn specialize_err( + result: Result, + from: ast::ErrorKind, + to: ast::ErrorKind, +) -> Result { + if let Err(e) = result { + if e.kind == from { + Err(ast::Error { kind: to, pattern: e.pattern, span: e.span }) + } else { + Err(e) + } + } else { + result + } +} + +#[cfg(test)] +mod tests { + use core::ops::Range; + + use alloc::format; + + use crate::ast::{self, Ast, Position, Span}; + + use super::*; + + // Our own assert_eq, which has slightly better formatting (but honestly + // still kind of crappy). + macro_rules! assert_eq { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + if !(*left_val == *right_val) { + panic!( + "assertion failed: `(left == right)`\n\n\ + left: `{:?}`\nright: `{:?}`\n\n", + left_val, right_val + ) + } + } + } + }}; + } + + // We create these errors to compare with real ast::Errors in the tests. + // We define equality between TestError and ast::Error to disregard the + // pattern string in ast::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: ast::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &ast::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for ast::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn s(str: &str) -> String { + str.to_string() + } + + fn parser(pattern: &str) -> ParserI<'_, Parser> { + ParserI::new(Parser::new(), pattern) + } + + fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { + let parser = ParserBuilder::new().octal(true).build(); + ParserI::new(parser, pattern) + } + + fn parser_nest_limit( + pattern: &str, + nest_limit: u32, + ) -> ParserI<'_, Parser> { + let p = ParserBuilder::new().nest_limit(nest_limit).build(); + ParserI::new(p, pattern) + } + + fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { + let p = ParserBuilder::new().ignore_whitespace(true).build(); + ParserI::new(p, pattern) + } + + /// Short alias for creating a new span. + fn nspan(start: Position, end: Position) -> Span { + Span::new(start, end) + } + + /// Short alias for creating a new position. + fn npos(offset: usize, line: usize, column: usize) -> Position { + Position::new(offset, line, column) + } + + /// Create a new span from the given offset range. This assumes a single + /// line and sets the columns based on the offsets. i.e., This only works + /// out of the box for ASCII, which is fine for most tests. + fn span(range: Range) -> Span { + let start = Position::new(range.start, 1, range.start + 1); + let end = Position::new(range.end, 1, range.end + 1); + Span::new(start, end) + } + + /// Create a new span for the corresponding byte range in the given string. + fn span_range(subject: &str, range: Range) -> Span { + let start = Position { + offset: range.start, + line: 1 + subject[..range.start].matches('\n').count(), + column: 1 + subject[..range.start] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.start].chars().count()), + }; + let end = Position { + offset: range.end, + line: 1 + subject[..range.end].matches('\n').count(), + column: 1 + subject[..range.end] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.end].chars().count()), + }; + Span::new(start, end) + } + + /// Create a verbatim literal starting at the given position. + fn lit(c: char, start: usize) -> Ast { + lit_with(c, span(start..start + c.len_utf8())) + } + + /// Create a meta literal starting at the given position. + fn meta_lit(c: char, span: Span) -> Ast { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + } + + /// Create a verbatim literal with the given span. + fn lit_with(c: char, span: Span) -> Ast { + Ast::literal(ast::Literal { + span, + kind: ast::LiteralKind::Verbatim, + c, + }) + } + + /// Create a concatenation with the given range. + fn concat(range: Range, asts: Vec) -> Ast { + concat_with(span(range), asts) + } + + /// Create a concatenation with the given span. + fn concat_with(span: Span, asts: Vec) -> Ast { + Ast::concat(ast::Concat { span, asts }) + } + + /// Create an alternation with the given span. + fn alt(range: Range, asts: Vec) -> Ast { + Ast::alternation(ast::Alternation { span: span(range), asts }) + } + + /// Create a capturing group with the given span. + fn group(range: Range, index: u32, ast: Ast) -> Ast { + Ast::group(ast::Group { + span: span(range), + kind: ast::GroupKind::CaptureIndex(index), + ast: Box::new(ast), + }) + } + + /// Create an ast::SetFlags. + /// + /// The given pattern should be the full pattern string. The range given + /// should correspond to the byte offsets where the flag set occurs. + /// + /// If negated is true, then the set is interpreted as beginning with a + /// negation. + fn flag_set( + pat: &str, + range: Range, + flag: ast::Flag, + negated: bool, + ) -> Ast { + let mut items = vec![ast::FlagsItem { + span: span_range(pat, (range.end - 2)..(range.end - 1)), + kind: ast::FlagsItemKind::Flag(flag), + }]; + if negated { + items.insert( + 0, + ast::FlagsItem { + span: span_range(pat, (range.start + 2)..(range.end - 2)), + kind: ast::FlagsItemKind::Negation, + }, + ); + } + Ast::flags(ast::SetFlags { + span: span_range(pat, range.clone()), + flags: ast::Flags { + span: span_range(pat, (range.start + 2)..(range.end - 1)), + items, + }, + }) + } + + #[test] + fn parse_nest_limit() { + // A nest limit of 0 still allows some types of regexes. + assert_eq!( + parser_nest_limit("", 0).parse(), + Ok(Ast::empty(span(0..0))) + ); + assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); + + // Test repetition operations, which require one level of nesting. + assert_eq!( + parser_nest_limit("a+", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("a+", 1).parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser_nest_limit("(a)+", 1).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("a+*", 1).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("a+*", 2).parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })), + })) + ); + + // Test concatenations. A concatenation requires one level of nesting. + assert_eq!( + parser_nest_limit("ab", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("ab", 1).parse(), + Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])) + ); + assert_eq!( + parser_nest_limit("abc", 1).parse(), + Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])) + ); + + // Test alternations. An alternation requires one level of nesting. + assert_eq!( + parser_nest_limit("a|b", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("a|b", 1).parse(), + Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])) + ); + assert_eq!( + parser_nest_limit("a|b|c", 1).parse(), + Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])) + ); + + // Test character classes. Classes form their own mini-recursive + // syntax! + assert_eq!( + parser_nest_limit("[a]", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("[a]", 1).parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( + ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: 'a', + } + )), + })) + ); + assert_eq!( + parser_nest_limit("[ab]", 1).parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::NestLimitExceeded(2), + } + ); + assert_eq!( + parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(3), + } + ); + assert_eq!( + parser_nest_limit("[a--b]", 1).parse().unwrap_err(), + TestError { + span: span(1..5), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(2), + } + ); + } + + #[test] + fn parse_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. +bar +# This is comment 4."; + let astc = parser(pat).parse_with_comments().unwrap(); + assert_eq!( + astc.ast, + concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('f', span_range(pat, 26..27)), + lit_with('o', span_range(pat, 27..28)), + lit_with('o', span_range(pat, 28..29)), + lit_with('b', span_range(pat, 74..75)), + lit_with('a', span_range(pat, 75..76)), + lit_with('r', span_range(pat, 76..77)), + ] + ) + ); + assert_eq!( + astc.comments, + vec![ + ast::Comment { + span: span_range(pat, 5..26), + comment: s(" This is comment 1."), + }, + ast::Comment { + span: span_range(pat, 30..51), + comment: s(" This is comment 2."), + }, + ast::Comment { + span: span_range(pat, 53..74), + comment: s(" This is comment 3."), + }, + ast::Comment { + span: span_range(pat, 78..98), + comment: s(" This is comment 4."), + }, + ] + ); + } + + #[test] + fn parse_holistic() { + assert_eq!(parser("]").parse(), Ok(lit(']', 0))); + assert_eq!( + parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), + Ok(concat( + 0..36, + vec![ + meta_lit('\\', span(0..2)), + meta_lit('.', span(2..4)), + meta_lit('+', span(4..6)), + meta_lit('*', span(6..8)), + meta_lit('?', span(8..10)), + meta_lit('(', span(10..12)), + meta_lit(')', span(12..14)), + meta_lit('|', span(14..16)), + meta_lit('[', span(16..18)), + meta_lit(']', span(18..20)), + meta_lit('{', span(20..22)), + meta_lit('}', span(22..24)), + meta_lit('^', span(24..26)), + meta_lit('$', span(26..28)), + meta_lit('#', span(28..30)), + meta_lit('&', span(30..32)), + meta_lit('-', span(32..34)), + meta_lit('~', span(34..36)), + ] + )) + ); + } + + #[test] + fn parse_ignore_whitespace() { + // Test that basic whitespace insensitivity works. + let pat = "(?x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + nspan(npos(0, 1, 1), npos(7, 1, 8)), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + ] + )) + ); + + // Test that we can toggle whitespace insensitivity. + let pat = "(?x)a b(?-x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + nspan(npos(0, 1, 1), npos(15, 1, 16)), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), + lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), + lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), + lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), + ] + )) + ); + + // Test that nesting whitespace insensitive flags works. + let pat = "a (?x:a )a "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..11), + vec![ + lit_with('a', span_range(pat, 0..1)), + lit_with(' ', span_range(pat, 1..2)), + Ast::group(ast::Group { + span: span_range(pat, 2..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 4..5), + items: vec![ast::FlagsItem { + span: span_range(pat, 4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::IgnoreWhitespace + ), + },], + }), + ast: Box::new(lit_with('a', span_range(pat, 6..7))), + }), + lit_with('a', span_range(pat, 9..10)), + lit_with(' ', span_range(pat, 10..11)), + ] + )) + ); + + // Test that whitespace after an opening paren is insignificant. + let pat = "(?x)( ?P a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + } + }, + ast: Box::new(lit_with('a', span_range(pat, 14..15))), + }), + ] + )) + ); + let pat = "(?x)( a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(lit_with('a', span_range(pat, 7..8))), + }), + ] + )) + ); + let pat = "(?x)( ?: a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 8..8), + items: vec![], + }), + ast: Box::new(lit_with('a', span_range(pat, 11..12))), + }), + ] + )) + ); + let pat = r"(?x)\x { 53 }"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::literal(ast::Literal { + span: span(4..13), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::X + ), + c: 'S', + }), + ] + )) + ); + + // Test that whitespace after an escape is OK. + let pat = r"(?x)\ "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::literal(ast::Literal { + span: span_range(pat, 4..6), + kind: ast::LiteralKind::Superfluous, + c: ' ', + }), + ] + )) + ); + } + + #[test] + fn parse_newlines() { + let pat = ".\n."; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..3), + vec![ + Ast::dot(span_range(pat, 0..1)), + lit_with('\n', span_range(pat, 1..2)), + Ast::dot(span_range(pat, 2..3)), + ] + )) + ); + + let pat = "foobar\nbaz\nquux\n"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), + lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), + lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), + lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), + lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), + lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), + lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), + lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), + lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), + lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), + lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), + lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), + lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), + lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), + ] + )) + ); + } + + #[test] + fn parse_uncounted_repetition() { + assert_eq!( + parser(r"a*").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a+").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a??").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a?b").parse(), + Ok(concat( + 0..3, + vec![ + Ast::repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }), + lit('b', 2), + ] + )) + ); + assert_eq!( + parser(r"a??b").parse(), + Ok(concat( + 0..4, + vec![ + Ast::repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }), + lit('b', 3), + ] + )) + ); + assert_eq!( + parser(r"ab?").parse(), + Ok(concat( + 0..3, + vec![ + lit('a', 0), + Ast::repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ] + )) + ); + assert_eq!( + parser(r"(ab)?").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(4..5), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(group( + 0..4, + 1, + concat(1..3, vec![lit('a', 1), lit('b', 2),]) + )), + })) + ); + assert_eq!( + parser(r"|a?").parse(), + Ok(alt( + 0..3, + vec![ + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 1)), + }), + ] + )) + ); + + assert_eq!( + parser(r"*").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?i)*").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(*)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?:?)").parse().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"+").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"?").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|*").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|+").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|?").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + } + + #[test] + fn parse_counted_repetition() { + assert_eq!( + parser(r"a{5}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..4), + op: ast::RepetitionOp { + span: span(1..4), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5,}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::AtLeast(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5}?").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"ab{5}").parse(), + Ok(concat( + 0..5, + vec![ + lit('a', 0), + Ast::repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ] + )) + ); + assert_eq!( + parser(r"ab{5}c").parse(), + Ok(concat( + 0..6, + vec![ + lit('a', 0), + Ast::repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + lit('c', 5), + ] + )) + ); + + assert_eq!( + parser(r"a{ 5 }").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{ 5 , 9 }").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..10), + op: ast::RepetitionOp { + span: span(1..10), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser_ignore_whitespace(r"a{5,9} ?").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..8), + op: ast::RepetitionOp { + span: span(1..8), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); + + assert_eq!( + parser(r"(?i){0}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?m){1,1}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"a{]}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{1,]}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{a").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{9999999999}").parse().unwrap_err(), + TestError { + span: span(2..12), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + assert_eq!( + parser(r"a{9").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{9,a").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{9,9999999999}").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + assert_eq!( + parser(r"a{9,").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{9,11").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{2,1}").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountInvalid, + } + ); + assert_eq!( + parser(r"{5}").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|{5}").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + } + + #[test] + fn parse_alternate() { + assert_eq!( + parser(r"a|b").parse(), + Ok(Ast::alternation(ast::Alternation { + span: span(0..3), + asts: vec![lit('a', 0), lit('b', 2)], + })) + ); + assert_eq!( + parser(r"(a|b)").parse(), + Ok(group( + 0..5, + 1, + Ast::alternation(ast::Alternation { + span: span(1..4), + asts: vec![lit('a', 1), lit('b', 3)], + }) + )) + ); + + assert_eq!( + parser(r"a|b|c").parse(), + Ok(Ast::alternation(ast::Alternation { + span: span(0..5), + asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], + })) + ); + assert_eq!( + parser(r"ax|by|cz").parse(), + Ok(Ast::alternation(ast::Alternation { + span: span(0..8), + asts: vec![ + concat(0..2, vec![lit('a', 0), lit('x', 1)]), + concat(3..5, vec![lit('b', 3), lit('y', 4)]), + concat(6..8, vec![lit('c', 6), lit('z', 7)]), + ], + })) + ); + assert_eq!( + parser(r"(ax|by|cz)").parse(), + Ok(group( + 0..10, + 1, + Ast::alternation(ast::Alternation { + span: span(1..9), + asts: vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + concat(4..6, vec![lit('b', 4), lit('y', 5)]), + concat(7..9, vec![lit('c', 7), lit('z', 8)]), + ], + }) + )) + ); + assert_eq!( + parser(r"(ax|(by|(cz)))").parse(), + Ok(group( + 0..14, + 1, + alt( + 1..13, + vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + group( + 4..13, + 2, + alt( + 5..12, + vec![ + concat( + 5..7, + vec![lit('b', 5), lit('y', 6)] + ), + group( + 8..12, + 3, + concat( + 9..11, + vec![lit('c', 9), lit('z', 10),] + ) + ), + ] + ) + ), + ] + ) + )) + ); + + assert_eq!( + parser(r"|").parse(), + Ok(alt( + 0..1, + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] + )) + ); + assert_eq!( + parser(r"||").parse(), + Ok(alt( + 0..2, + vec![ + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), + ] + )) + ); + assert_eq!( + parser(r"a|").parse(), + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) + ); + assert_eq!( + parser(r"|a").parse(), + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) + ); + + assert_eq!( + parser(r"(|)").parse(), + Ok(group( + 0..3, + 1, + alt( + 1..2, + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] + ) + )) + ); + assert_eq!( + parser(r"(a|)").parse(), + Ok(group( + 0..4, + 1, + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) + )) + ); + assert_eq!( + parser(r"(|a)").parse(), + Ok(group( + 0..4, + 1, + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) + )) + ); + + assert_eq!( + parser(r"a|b)").parse().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::GroupUnopened, + } + ); + assert_eq!( + parser(r"(a|b").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + } + + #[test] + fn parse_unsupported_lookaround() { + assert_eq!( + parser(r"(?=a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?!a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?<=a)").parse().unwrap_err(), + TestError { + span: span(0..4), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?z)").parse(), + Ok(Ast::group(ast::Group { + span: span(0..7), + kind: ast::GroupKind::CaptureName { + starts_with_p: false, + name: ast::CaptureName { + span: span(3..4), + name: s("a"), + index: 1, + } + }, + ast: Box::new(lit('z', 5)), + })) + ); + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::group(ast::Group { + span: span(0..8), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + } + }, + ast: Box::new(lit('z', 6)), + })) + ); + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + } + }, + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + } + }, + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + } + }, + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::group(ast::Group { + span: span(0..11), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + } + }, + ast: Box::new(lit('z', 9)), + })) + ); + + assert_eq!( + parser("(?P)").parse(), + Ok(Ast::group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 7), + ), + name: s("a¾"), + index: 1, + } + }, + ast: Box::new(Ast::empty(Span::new( + Position::new(8, 1, 8), + Position::new(8, 1, 8), + ))), + })) + ); + assert_eq!( + parser("(?P<名字>)").parse(), + Ok(Ast::group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(12, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(10, 1, 7), + ), + name: s("名字"), + index: 1, + } + }, + ast: Box::new(Ast::empty(Span::new( + Position::new(11, 1, 8), + Position::new(11, 1, 8), + ))), + })) + ); + + assert_eq!( + parser("(?P<").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + } + ); + assert_eq!( + parser("(?P<>z)").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameEmpty, + } + ); + assert_eq!( + parser("(?Py)(?Pz)").parse().unwrap_err(), + TestError { + span: span(12..13), + kind: ast::ErrorKind::GroupNameDuplicate { + original: span(4..5), + }, + } + ); + assert_eq!( + parser("(?P<5>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<5a>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾a>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<☃>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 7), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + } + + #[test] + fn parse_flags() { + assert_eq!( + parser("i:").parse_flags(), + Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + }) + ); + assert_eq!( + parser("i)").parse_flags(), + Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + }) + ); + + assert_eq!( + parser("isU:").parse_flags(), + Ok(ast::Flags { + span: span(0..3), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + + assert_eq!( + parser("-isU:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + assert_eq!( + parser("i-sU:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + assert_eq!( + parser("i-sR:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), + }, + ], + }) + ); + + assert_eq!( + parser("isU").parse_flags().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::FlagUnexpectedEof, + } + ); + assert_eq!( + parser("isUa:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("isUi:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, + } + ); + assert_eq!( + parser("i-sU-i:").parse_flags().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::FlagRepeatedNegation { + original: span(1..2), + }, + } + ); + assert_eq!( + parser("-)").parse_flags().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + assert_eq!( + parser("i-)").parse_flags().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + assert_eq!( + parser("iU-)").parse_flags().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + } + + #[test] + fn parse_flag() { + assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); + assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); + assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); + assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); + assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); + assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); + + assert_eq!( + parser("a").parse_flag().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("☃").parse_flag().unwrap_err(), + TestError { + span: span_range("☃", 0..3), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + } + + #[test] + fn parse_primitive_non_escape() { + assert_eq!( + parser(r".").parse_primitive(), + Ok(Primitive::Dot(span(0..1))) + ); + assert_eq!( + parser(r"^").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::StartLine, + })) + ); + assert_eq!( + parser(r"$").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::EndLine, + })) + ); + + assert_eq!( + parser(r"a").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: 'a', + })) + ); + assert_eq!( + parser(r"|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: '|', + })) + ); + assert_eq!( + parser(r"☃").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span_range("☃", 0..3), + kind: ast::LiteralKind::Verbatim, + c: '☃', + })) + ); + } + + #[test] + fn parse_escape() { + assert_eq!( + parser(r"\|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Meta, + c: '|', + })) + ); + let specials = &[ + (r"\a", '\x07', ast::SpecialLiteralKind::Bell), + (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), + (r"\t", '\t', ast::SpecialLiteralKind::Tab), + (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), + (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), + (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), + ]; + for &(pat, c, ref kind) in specials { + assert_eq!( + parser(pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Special(kind.clone()), + c, + })) + ); + } + assert_eq!( + parser(r"\A").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::StartText, + })) + ); + assert_eq!( + parser(r"\z").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::EndText, + })) + ); + assert_eq!( + parser(r"\b").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })) + ); + assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); + assert_eq!( + parser(r"\B").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::NotWordBoundary, + })) + ); + + // We also support superfluous escapes in most cases now too. + for c in ['!', '@', '%', '"', '\'', '/', ' '] { + let pat = format!(r"\{}", c); + assert_eq!( + parser(&pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Superfluous, + c, + })) + ); + } + + // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This + // gives flexibility for future evolution. + assert_eq!( + parser(r"\e").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + assert_eq!( + parser(r"\y").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. + assert_eq!( + parser(r"\b{").parse_escape().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + assert_eq!( + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), + TestError { + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. + assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. + assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, + } + ); + + // An unfinished escape is illegal. + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + } + + #[test] + fn parse_unsupported_backreference() { + assert_eq!( + parser(r"\0").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + } + ); + assert_eq!( + parser(r"\9").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + } + ); + } + + #[test] + fn parse_octal() { + for i in 0..511 { + let pat = format!(r"\{:o}", i); + assert_eq!( + parser_octal(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::Octal, + c: char::from_u32(i).unwrap(), + })) + ); + } + assert_eq!( + parser_octal(r"\778").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + })) + ); + assert_eq!( + parser_octal(r"\7777").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + })) + ); + assert_eq!( + parser_octal(r"\778").parse(), + Ok(Ast::concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }), + Ast::literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: '8', + }), + ], + })) + ); + assert_eq!( + parser_octal(r"\7777").parse(), + Ok(Ast::concat(ast::Concat { + span: span(0..5), + asts: vec![ + Ast::literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }), + Ast::literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: '7', + }), + ], + })) + ); + + assert_eq!( + parser_octal(r"\8").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_hex_two() { + for i in 0..256 { + let pat = format!(r"\x{:02x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), + c: char::from_u32(i).unwrap(), + })) + ); + } + + assert_eq!( + parser(r"\xF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\xG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\xFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + } + + #[test] + fn parse_hex_four() { + for i in 0..65536 { + let c = match char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\u{:04x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeShort + ), + c, + })) + ); + } + + assert_eq!( + parser(r"\uF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\uG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uD800").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + } + + #[test] + fn parse_hex_eight() { + for i in 0..65536 { + let c = match char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\U{:08x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeLong + ), + c, + })) + ); + } + + assert_eq!( + parser(r"\UF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\UG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(6..7), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(7..8), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(8..9), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(9..10), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + } + + #[test] + fn parse_hex_brace() { + assert_eq!( + parser(r"\u{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeShort + ), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\U{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeLong + ), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{26C4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{10fFfF}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..10), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '\u{10FFFF}', + })) + ); + + assert_eq!( + parser(r"\x").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{FF").parse_escape().unwrap_err(), + TestError { + span: span(2..5), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{}").parse_escape().unwrap_err(), + TestError { + span: span(2..4), + kind: ast::ErrorKind::EscapeHexEmpty, + } + ); + assert_eq!( + parser(r"\x{FGF}").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..9), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + assert_eq!( + parser(r"\x{D800}").parse_escape().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + assert_eq!( + parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..12), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + } + + #[test] + fn parse_decimal() { + assert_eq!(parser("123").parse_decimal(), Ok(123)); + assert_eq!(parser("0").parse_decimal(), Ok(0)); + assert_eq!(parser("01").parse_decimal(), Ok(1)); + + assert_eq!( + parser("-1").parse_decimal().unwrap_err(), + TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } + ); + assert_eq!( + parser("").parse_decimal().unwrap_err(), + TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } + ); + assert_eq!( + parser("9999999999").parse_decimal().unwrap_err(), + TestError { + span: span(0..10), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + } + + #[test] + fn parse_set_class() { + fn union(span: Span, items: Vec) -> ast::ClassSet { + ast::ClassSet::union(ast::ClassSetUnion { span, items }) + } + + fn intersection( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind: ast::ClassSetBinaryOpKind::Intersection, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn difference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind: ast::ClassSetBinaryOpKind::Difference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn symdifference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind: ast::ClassSetBinaryOpKind::SymmetricDifference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { + ast::ClassSet::Item(item) + } + + fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { + ast::ClassSetItem::Ascii(cls) + } + + fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { + ast::ClassSetItem::Unicode(cls) + } + + fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { + ast::ClassSetItem::Perl(cls) + } + + fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { + ast::ClassSetItem::Bracketed(Box::new(cls)) + } + + fn lit(span: Span, c: char) -> ast::ClassSetItem { + ast::ClassSetItem::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Verbatim, + c, + }) + } + + fn empty(span: Span) -> ast::ClassSetItem { + ast::ClassSetItem::Empty(span) + } + + fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { + let pos1 = Position { + offset: span.start.offset + start.len_utf8(), + column: span.start.column + 1, + ..span.start + }; + let pos2 = Position { + offset: span.end.offset - end.len_utf8(), + column: span.end.column - 1, + ..span.end + }; + ast::ClassSetItem::Range(ast::ClassSetRange { + span, + start: ast::Literal { + span: Span { end: pos1, ..span }, + kind: ast::LiteralKind::Verbatim, + c: start, + }, + end: ast::Literal { + span: Span { start: pos2, ..span }, + kind: ast::LiteralKind::Verbatim, + c: end, + }, + }) + } + + fn alnum(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated } + } + + fn lower(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated } + } + + assert_eq!( + parser("[[:alnum:]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..11), + negated: false, + kind: itemset(item_ascii(alnum(span(1..10), false))), + })) + ); + assert_eq!( + parser("[[[:alnum:]]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..13), + negated: false, + kind: itemset(item_bracket(ast::ClassBracketed { + span: span(1..12), + negated: false, + kind: itemset(item_ascii(alnum(span(2..11), false))), + })), + })) + ); + assert_eq!( + parser("[[:alnum:]&&[:lower:]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: intersection( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })) + ); + assert_eq!( + parser("[[:alnum:]--[:lower:]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: difference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })) + ); + assert_eq!( + parser("[[:alnum:]~~[:lower:]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: symdifference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })) + ); + + assert_eq!( + parser("[a]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), 'a')), + })) + ); + assert_eq!( + parser(r"[a\]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union( + span(1..4), + vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Meta, + c: ']', + }), + ] + ), + })) + ); + assert_eq!( + parser(r"[a\-z]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union( + span(1..5), + vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Meta, + c: '-', + }), + lit(span(4..5), 'z'), + ] + ), + })) + ); + assert_eq!( + parser("[ab]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] + ), + })) + ); + assert_eq!( + parser("[a-]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] + ), + })) + ); + assert_eq!( + parser("[-a]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] + ), + })) + ); + assert_eq!( + parser(r"[\pL]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(item_unicode(ast::ClassUnicode { + span: span(1..4), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('L'), + })), + })) + ); + assert_eq!( + parser(r"[\w]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + })) + ); + assert_eq!( + parser(r"[a\wz]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union( + span(1..5), + vec![ + lit(span(1..2), 'a'), + item_perl(ast::ClassPerl { + span: span(2..4), + kind: ast::ClassPerlKind::Word, + negated: false, + }), + lit(span(4..5), 'z'), + ] + ), + })) + ); + + assert_eq!( + parser("[a-z]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(range(span(1..4), 'a', 'z')), + })) + ); + assert_eq!( + parser("[a-cx-z]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..8), + negated: false, + kind: union( + span(1..7), + vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ] + ), + })) + ); + assert_eq!( + parser(r"[\w&&a-cx-z]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + union( + span(5..11), + vec![ + range(span(5..8), 'a', 'c'), + range(span(8..11), 'x', 'z'), + ] + ), + ), + })) + ); + assert_eq!( + parser(r"[a-cx-z&&\w]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + union( + span(1..7), + vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ] + ), + itemset(item_perl(ast::ClassPerl { + span: span(9..11), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + ), + })) + ); + assert_eq!( + parser(r"[a--b--c]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: difference( + span(1..8), + difference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + })) + ); + assert_eq!( + parser(r"[a~~b~~c]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: symdifference( + span(1..8), + symdifference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + })) + ); + assert_eq!( + parser(r"[\^&&^]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Meta, + c: '^', + })), + itemset(lit(span(5..6), '^')), + ), + })) + ); + assert_eq!( + parser(r"[\&&&&]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Meta, + c: '&', + })), + itemset(lit(span(5..6), '&')), + ), + })) + ); + assert_eq!( + parser(r"[&&&&]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: intersection( + span(1..5), + intersection( + span(1..3), + itemset(empty(span(1..1))), + itemset(empty(span(3..3))), + ), + itemset(empty(span(5..5))), + ), + })) + ); + + let pat = "[☃-⛄]"; + assert_eq!( + parser(pat).parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span_range(pat, 0..9), + negated: false, + kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { + span: span_range(pat, 1..8), + start: ast::Literal { + span: span_range(pat, 1..4), + kind: ast::LiteralKind::Verbatim, + c: '☃', + }, + end: ast::Literal { + span: span_range(pat, 5..8), + kind: ast::LiteralKind::Verbatim, + c: '⛄', + }, + })), + })) + ); + + assert_eq!( + parser(r"[]]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), ']')), + })) + ); + assert_eq!( + parser(r"[]\[]").parse(), + Ok(Ast::class_bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union( + span(1..4), + vec![ + lit(span(1..2), ']'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Meta, + c: '[', + }), + ] + ), + })) + ); + assert_eq!( + parser(r"[\[]]").parse(), + Ok(concat( + 0..5, + vec![ + Ast::class_bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(ast::ClassSetItem::Literal( + ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Meta, + c: '[', + } + )), + }), + Ast::literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ] + )) + ); + + assert_eq!( + parser("[").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[-]").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[[:alnum:]").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser(r"[\b]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + } + ); + assert_eq!( + parser(r"[\w-a]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassRangeLiteral, + } + ); + assert_eq!( + parser(r"[a-\w]").parse().unwrap_err(), + TestError { + span: span(3..5), + kind: ast::ErrorKind::ClassRangeLiteral, + } + ); + assert_eq!( + parser(r"[z-a]").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::ClassRangeInvalid, + } + ); + + assert_eq!( + parser_ignore_whitespace("[a ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser_ignore_whitespace("[a- ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + } + + #[test] + fn parse_set_class_open() { + assert_eq!(parser("[a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..1), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = + ast::ClassSetUnion { span: span(4..4), items: vec![] }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = + ast::ClassSetUnion { span: span(4..4), items: vec![] }; + Ok((set, union)) + } + ); + assert_eq!(parser("[-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ - a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + assert_eq!(parser("[--a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!(parser("[]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ] a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + }); + assert_eq!(parser("[-]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + + assert_eq!( + parser("[").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser_ignore_whitespace("[ ") + .parse_set_class_open() + .unwrap_err(), + TestError { + span: span(0..5), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[^").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[]").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[-").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[--").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + + // See: https://github.com/rust-lang/regex/issues/792 + assert_eq!( + parser("(?x)[-#]").parse_with_comments().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + } + + #[test] + fn maybe_parse_ascii_class() { + assert_eq!( + parser(r"[:alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + }) + ); + assert_eq!( + parser(r"[:alnum:]A").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + }) + ); + assert_eq!( + parser(r"[:^alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..10), + kind: ast::ClassAsciiKind::Alnum, + negated: true, + }) + ); + + let p = parser(r"[:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:^"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[^:alnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + } + + #[test] + fn parse_unicode_class() { + assert_eq!( + parser(r"\pN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })) + ); + assert_eq!( + parser(r"\PN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: true, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })) + ); + assert_eq!( + parser(r"\p{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("N")), + })) + ); + assert_eq!( + parser(r"\P{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: true, + kind: ast::ClassUnicodeKind::Named(s("N")), + })) + ); + assert_eq!( + parser(r"\p{Greek}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })) + ); + + assert_eq!( + parser(r"\p{scx:Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + assert_eq!( + parser(r"\p{scx=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + assert_eq!( + parser(r"\p{scx!=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..17), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + + assert_eq!( + parser(r"\p{:}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s(""), + value: s(""), + }, + })) + ); + assert_eq!( + parser(r"\p{=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s(""), + value: s(""), + }, + })) + ); + assert_eq!( + parser(r"\p{!=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..6), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s(""), + value: s(""), + }, + })) + ); + + assert_eq!( + parser(r"\p").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{N").parse_escape().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{Greek").parse_escape().unwrap_err(), + TestError { + span: span(8..8), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + + assert_eq!( + parser(r"\pNz").parse(), + Ok(Ast::concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::class_unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + }), + Ast::literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + assert_eq!( + parser(r"\p{Greek}z").parse(), + Ok(Ast::concat(ast::Concat { + span: span(0..10), + asts: vec![ + Ast::class_unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + }), + Ast::literal(ast::Literal { + span: span(9..10), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + assert_eq!( + parser(r"\p\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + assert_eq!( + parser(r"\P\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + } + + #[test] + fn parse_perl_class() { + assert_eq!( + parser(r"\d").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })) + ); + assert_eq!( + parser(r"\D").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: true, + })) + ); + assert_eq!( + parser(r"\s").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: false, + })) + ); + assert_eq!( + parser(r"\S").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: true, + })) + ); + assert_eq!( + parser(r"\w").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: false, + })) + ); + assert_eq!( + parser(r"\W").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: true, + })) + ); + + assert_eq!( + parser(r"\d").parse(), + Ok(Ast::class_perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })) + ); + assert_eq!( + parser(r"\dz").parse(), + Ok(Ast::concat(ast::Concat { + span: span(0..3), + asts: vec![ + Ast::class_perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + }), + Ast::literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + } + + // This tests a bug fix where the nest limit checker wasn't decrementing + // its depth during post-traversal, which causes long regexes to trip + // the default limit too aggressively. + #[test] + fn regression_454_nest_too_big() { + let pattern = r#" + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4} + "#; + assert!(parser_nest_limit(pattern, 50).parse().is_ok()); + } + + // This tests that we treat a trailing `-` in a character class as a + // literal `-` even when whitespace mode is enabled and there is whitespace + // after the trailing `-`. + #[test] + fn regression_455_trailing_dash_ignore_whitespace() { + assert!(parser("(?x)[ / - ]").parse().is_ok()); + assert!(parser("(?x)[ a - ]").parse().is_ok()); + assert!(parser( + "(?x)[ + a + - ] + " + ) + .parse() + .is_ok()); + assert!(parser( + "(?x)[ + a # wat + - ] + " + ) + .parse() + .is_ok()); + + assert!(parser("(?x)[ / -").parse().is_err()); + assert!(parser("(?x)[ / - ").parse().is_err()); + assert!(parser( + "(?x)[ + / - + " + ) + .parse() + .is_err()); + assert!(parser( + "(?x)[ + / - # wat + " + ) + .parse() + .is_err()); + } +} diff --git a/vendor/regex-syntax/src/ast/print.rs b/vendor/regex-syntax/src/ast/print.rs new file mode 100644 index 0000000..1ceb3c7 --- /dev/null +++ b/vendor/regex-syntax/src/ast/print.rs @@ -0,0 +1,577 @@ +/*! +This module provides a regular expression printer for `Ast`. +*/ + +use core::fmt; + +use crate::ast::{ + self, + visitor::{self, Visitor}, + Ast, +}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression abstract syntax tree. +/// +/// A printer converts an abstract syntax tree (AST) to a regular expression +/// pattern string. This particular printer uses constant stack space and heap +/// space proportional to the size of the AST. +/// +/// This printer will not necessarily preserve the original formatting of the +/// regular expression pattern string. For example, all whitespace and comments +/// are ignored. +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. + pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { + visitor::visit(ast, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer { + wtr: W, +} + +impl Visitor for Writer { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), + _ => Ok(()), + } + } + + fn visit_post(&mut self, ast: &Ast) -> fmt::Result { + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), + } + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + match *ast { + ast::ClassSetItem::Bracketed(ref x) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + use crate::ast::ClassSetItem::*; + + match *ast { + Empty(_) => Ok(()), + Literal(ref x) => self.fmt_literal(x), + Range(ref x) => { + self.fmt_literal(&x.start)?; + self.wtr.write_str("-")?; + self.fmt_literal(&x.end)?; + Ok(()) + } + Ascii(ref x) => self.fmt_class_ascii(x), + Unicode(ref x) => self.fmt_class_unicode(x), + Perl(ref x) => self.fmt_class_perl(x), + Bracketed(ref x) => self.fmt_class_bracketed_post(x), + Union(_) => Ok(()), + } + } + + fn visit_class_set_binary_op_in( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + self.fmt_class_set_binary_op_kind(&ast.kind) + } +} + +impl Writer { + fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { + use crate::ast::GroupKind::*; + match ast.kind { + CaptureIndex(_) => self.wtr.write_str("("), + CaptureName { ref name, starts_with_p } => { + let start = if starts_with_p { "(?P<" } else { "(?<" }; + self.wtr.write_str(start)?; + self.wtr.write_str(&name.name)?; + self.wtr.write_str(">")?; + Ok(()) + } + NonCapturing(ref flags) => { + self.wtr.write_str("(?")?; + self.fmt_flags(flags)?; + self.wtr.write_str(":")?; + Ok(()) + } + } + } + + fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result { + self.wtr.write_str(")") + } + + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { + use crate::ast::RepetitionKind::*; + match ast.op.kind { + ZeroOrOne if ast.greedy => self.wtr.write_str("?"), + ZeroOrOne => self.wtr.write_str("??"), + ZeroOrMore if ast.greedy => self.wtr.write_str("*"), + ZeroOrMore => self.wtr.write_str("*?"), + OneOrMore if ast.greedy => self.wtr.write_str("+"), + OneOrMore => self.wtr.write_str("+?"), + Range(ref x) => { + self.fmt_repetition_range(x)?; + if !ast.greedy { + self.wtr.write_str("?")?; + } + Ok(()) + } + } + } + + fn fmt_repetition_range( + &mut self, + ast: &ast::RepetitionRange, + ) -> fmt::Result { + use crate::ast::RepetitionRange::*; + match *ast { + Exactly(x) => write!(self.wtr, "{{{}}}", x), + AtLeast(x) => write!(self.wtr, "{{{},}}", x), + Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y), + } + } + + fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result { + use crate::ast::LiteralKind::*; + + match ast.kind { + Verbatim => self.wtr.write_char(ast.c), + Meta | Superfluous => write!(self.wtr, r"\{}", ast.c), + Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)), + HexFixed(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{:02X}", u32::from(ast.c)) + } + HexFixed(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{:04X}", u32::from(ast.c)) + } + HexFixed(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{:08X}", u32::from(ast.c)) + } + HexBrace(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c)) + } + HexBrace(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c)) + } + HexBrace(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c)) + } + Special(ast::SpecialLiteralKind::Bell) => { + self.wtr.write_str(r"\a") + } + Special(ast::SpecialLiteralKind::FormFeed) => { + self.wtr.write_str(r"\f") + } + Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"), + Special(ast::SpecialLiteralKind::LineFeed) => { + self.wtr.write_str(r"\n") + } + Special(ast::SpecialLiteralKind::CarriageReturn) => { + self.wtr.write_str(r"\r") + } + Special(ast::SpecialLiteralKind::VerticalTab) => { + self.wtr.write_str(r"\v") + } + Special(ast::SpecialLiteralKind::Space) => { + self.wtr.write_str(r"\ ") + } + } + } + + fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result { + use crate::ast::AssertionKind::*; + match ast.kind { + StartLine => self.wtr.write_str("^"), + EndLine => self.wtr.write_str("$"), + StartText => self.wtr.write_str(r"\A"), + EndText => self.wtr.write_str(r"\z"), + WordBoundary => self.wtr.write_str(r"\b"), + NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), + } + } + + fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result { + self.wtr.write_str("(?")?; + self.fmt_flags(&ast.flags)?; + self.wtr.write_str(")")?; + Ok(()) + } + + fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result { + use crate::ast::{Flag, FlagsItemKind}; + + for item in &ast.items { + match item.kind { + FlagsItemKind::Negation => self.wtr.write_str("-"), + FlagsItemKind::Flag(ref flag) => match *flag { + Flag::CaseInsensitive => self.wtr.write_str("i"), + Flag::MultiLine => self.wtr.write_str("m"), + Flag::DotMatchesNewLine => self.wtr.write_str("s"), + Flag::SwapGreed => self.wtr.write_str("U"), + Flag::Unicode => self.wtr.write_str("u"), + Flag::CRLF => self.wtr.write_str("R"), + Flag::IgnoreWhitespace => self.wtr.write_str("x"), + }, + }?; + } + Ok(()) + } + + fn fmt_class_bracketed_pre( + &mut self, + ast: &ast::ClassBracketed, + ) -> fmt::Result { + if ast.negated { + self.wtr.write_str("[^") + } else { + self.wtr.write_str("[") + } + } + + fn fmt_class_bracketed_post( + &mut self, + _ast: &ast::ClassBracketed, + ) -> fmt::Result { + self.wtr.write_str("]") + } + + fn fmt_class_set_binary_op_kind( + &mut self, + ast: &ast::ClassSetBinaryOpKind, + ) -> fmt::Result { + use crate::ast::ClassSetBinaryOpKind::*; + match *ast { + Intersection => self.wtr.write_str("&&"), + Difference => self.wtr.write_str("--"), + SymmetricDifference => self.wtr.write_str("~~"), + } + } + + fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result { + use crate::ast::ClassPerlKind::*; + match ast.kind { + Digit if ast.negated => self.wtr.write_str(r"\D"), + Digit => self.wtr.write_str(r"\d"), + Space if ast.negated => self.wtr.write_str(r"\S"), + Space => self.wtr.write_str(r"\s"), + Word if ast.negated => self.wtr.write_str(r"\W"), + Word => self.wtr.write_str(r"\w"), + } + } + + fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result { + use crate::ast::ClassAsciiKind::*; + match ast.kind { + Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"), + Alnum => self.wtr.write_str("[:alnum:]"), + Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"), + Alpha => self.wtr.write_str("[:alpha:]"), + Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"), + Ascii => self.wtr.write_str("[:ascii:]"), + Blank if ast.negated => self.wtr.write_str("[:^blank:]"), + Blank => self.wtr.write_str("[:blank:]"), + Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"), + Cntrl => self.wtr.write_str("[:cntrl:]"), + Digit if ast.negated => self.wtr.write_str("[:^digit:]"), + Digit => self.wtr.write_str("[:digit:]"), + Graph if ast.negated => self.wtr.write_str("[:^graph:]"), + Graph => self.wtr.write_str("[:graph:]"), + Lower if ast.negated => self.wtr.write_str("[:^lower:]"), + Lower => self.wtr.write_str("[:lower:]"), + Print if ast.negated => self.wtr.write_str("[:^print:]"), + Print => self.wtr.write_str("[:print:]"), + Punct if ast.negated => self.wtr.write_str("[:^punct:]"), + Punct => self.wtr.write_str("[:punct:]"), + Space if ast.negated => self.wtr.write_str("[:^space:]"), + Space => self.wtr.write_str("[:space:]"), + Upper if ast.negated => self.wtr.write_str("[:^upper:]"), + Upper => self.wtr.write_str("[:upper:]"), + Word if ast.negated => self.wtr.write_str("[:^word:]"), + Word => self.wtr.write_str("[:word:]"), + Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"), + Xdigit => self.wtr.write_str("[:xdigit:]"), + } + } + + fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result { + use crate::ast::ClassUnicodeKind::*; + use crate::ast::ClassUnicodeOpKind::*; + + if ast.negated { + self.wtr.write_str(r"\P")?; + } else { + self.wtr.write_str(r"\p")?; + } + match ast.kind { + OneLetter(c) => self.wtr.write_char(c), + Named(ref x) => write!(self.wtr, "{{{}}}", x), + NamedValue { op: Equal, ref name, ref value } => { + write!(self.wtr, "{{{}={}}}", name, value) + } + NamedValue { op: Colon, ref name, ref value } => { + write!(self.wtr, "{{{}:{}}}", name, value) + } + NamedValue { op: NotEqual, ref name, ref value } => { + write!(self.wtr, "{{{}!={}}}", name, value) + } + } + } +} + +#[cfg(test)] +mod tests { + use alloc::string::String; + + use crate::ast::parse::ParserBuilder; + + use super::*; + + fn roundtrip(given: &str) { + roundtrip_with(|b| b, given); + } + + fn roundtrip_with(mut f: F, given: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let ast = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&ast, &mut dst).unwrap(); + assert_eq!(given, dst); + } + + #[test] + fn print_literal() { + roundtrip("a"); + roundtrip(r"\["); + roundtrip_with(|b| b.octal(true), r"\141"); + roundtrip(r"\x61"); + roundtrip(r"\x7F"); + roundtrip(r"\u0061"); + roundtrip(r"\U00000061"); + roundtrip(r"\x{61}"); + roundtrip(r"\x{7F}"); + roundtrip(r"\u{61}"); + roundtrip(r"\U{61}"); + + roundtrip(r"\a"); + roundtrip(r"\f"); + roundtrip(r"\t"); + roundtrip(r"\n"); + roundtrip(r"\r"); + roundtrip(r"\v"); + roundtrip(r"(?x)\ "); + } + + #[test] + fn print_dot() { + roundtrip("."); + } + + #[test] + fn print_concat() { + roundtrip("ab"); + roundtrip("abcde"); + roundtrip("a(bcd)ef"); + } + + #[test] + fn print_alternation() { + roundtrip("a|b"); + roundtrip("a|b|c|d|e"); + roundtrip("|a|b|c|d|e"); + roundtrip("|a|b|c|d|e|"); + roundtrip("a(b|c|d)|e|f"); + } + + #[test] + fn print_assertion() { + roundtrip(r"^"); + roundtrip(r"$"); + roundtrip(r"\A"); + roundtrip(r"\z"); + roundtrip(r"\b"); + roundtrip(r"\B"); + } + + #[test] + fn print_repetition() { + roundtrip("a?"); + roundtrip("a??"); + roundtrip("a*"); + roundtrip("a*?"); + roundtrip("a+"); + roundtrip("a+?"); + roundtrip("a{5}"); + roundtrip("a{5}?"); + roundtrip("a{5,}"); + roundtrip("a{5,}?"); + roundtrip("a{5,10}"); + roundtrip("a{5,10}?"); + } + + #[test] + fn print_flags() { + roundtrip("(?i)"); + roundtrip("(?-i)"); + roundtrip("(?s-i)"); + roundtrip("(?-si)"); + roundtrip("(?siUmux)"); + } + + #[test] + fn print_group() { + roundtrip("(?i:a)"); + roundtrip("(?Pa)"); + roundtrip("(?a)"); + roundtrip("(a)"); + } + + #[test] + fn print_class() { + roundtrip(r"[abc]"); + roundtrip(r"[a-z]"); + roundtrip(r"[^a-z]"); + roundtrip(r"[a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[a-z0-9---]"); + roundtrip(r"[a-z&&m-n]"); + roundtrip(r"[[a-z&&m-n]]"); + roundtrip(r"[a-z--m-n]"); + roundtrip(r"[a-z~~m-n]"); + roundtrip(r"[a-z[0-9]]"); + roundtrip(r"[a-z[^0-9]]"); + + roundtrip(r"\d"); + roundtrip(r"\D"); + roundtrip(r"\s"); + roundtrip(r"\S"); + roundtrip(r"\w"); + roundtrip(r"\W"); + + roundtrip(r"[[:alnum:]]"); + roundtrip(r"[[:^alnum:]]"); + roundtrip(r"[[:alpha:]]"); + roundtrip(r"[[:^alpha:]]"); + roundtrip(r"[[:ascii:]]"); + roundtrip(r"[[:^ascii:]]"); + roundtrip(r"[[:blank:]]"); + roundtrip(r"[[:^blank:]]"); + roundtrip(r"[[:cntrl:]]"); + roundtrip(r"[[:^cntrl:]]"); + roundtrip(r"[[:digit:]]"); + roundtrip(r"[[:^digit:]]"); + roundtrip(r"[[:graph:]]"); + roundtrip(r"[[:^graph:]]"); + roundtrip(r"[[:lower:]]"); + roundtrip(r"[[:^lower:]]"); + roundtrip(r"[[:print:]]"); + roundtrip(r"[[:^print:]]"); + roundtrip(r"[[:punct:]]"); + roundtrip(r"[[:^punct:]]"); + roundtrip(r"[[:space:]]"); + roundtrip(r"[[:^space:]]"); + roundtrip(r"[[:upper:]]"); + roundtrip(r"[[:^upper:]]"); + roundtrip(r"[[:word:]]"); + roundtrip(r"[[:^word:]]"); + roundtrip(r"[[:xdigit:]]"); + roundtrip(r"[[:^xdigit:]]"); + + roundtrip(r"\pL"); + roundtrip(r"\PL"); + roundtrip(r"\p{L}"); + roundtrip(r"\P{L}"); + roundtrip(r"\p{X=Y}"); + roundtrip(r"\P{X=Y}"); + roundtrip(r"\p{X:Y}"); + roundtrip(r"\P{X:Y}"); + roundtrip(r"\p{X!=Y}"); + roundtrip(r"\P{X!=Y}"); + } +} diff --git a/vendor/regex-syntax/src/ast/visitor.rs b/vendor/regex-syntax/src/ast/visitor.rs new file mode 100644 index 0000000..c1bb24d --- /dev/null +++ b/vendor/regex-syntax/src/ast/visitor.rs @@ -0,0 +1,522 @@ +use alloc::{vec, vec::Vec}; + +use crate::ast::{self, Ast}; + +/// A trait for visiting an abstract syntax tree (AST) in depth first order. +/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on an abstract syntax tree without necessarily using recursion. +/// In particular, this permits callers to do case analysis with constant stack +/// usage, which can be important since the size of an abstract syntax tree +/// may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`] function. +/// +/// Note that the abstract syntax tree for a regular expression is quite +/// complex. Unless you specifically need it, you might be able to use the much +/// simpler [high-level intermediate representation](crate::hir::Hir) and its +/// [corresponding `Visitor` trait](crate::hir::Visitor) instead. +pub trait Visitor { + /// The result of visiting an AST. + type Output; + /// An error that visiting an AST might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the AST or an error. + fn finish(self) -> Result; + + /// This method is called before beginning traversal of the AST. + fn start(&mut self) {} + + /// This method is called on an `Ast` before descending into child `Ast` + /// nodes. + fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Ast` after descending all of its child + /// `Ast` nodes. + fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an + /// [`Alternation`](ast::Alternation). + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of a concatenation. + fn visit_concat_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) + /// before descending into child nodes. + fn visit_class_set_item_pre( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) + /// after descending into child nodes. + fn visit_class_set_item_post( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into + /// child nodes. + fn visit_class_set_binary_op_pre( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child + /// nodes. + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between the left hand and right hand child nodes + /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp). + fn visit_class_set_binary_op_in( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Ast` while calling the +/// appropriate methods provided by the [`Visitor`] trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Ast` without using a stack size proportional to the depth +/// of the `Ast`. Namely, this method will instead use constant stack size, but +/// will use heap space proportional to the size of the `Ast`. This may be +/// desirable in cases where the size of `Ast` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit(ast: &Ast, visitor: V) -> Result { + HeapVisitor::new().visit(ast, visitor) +} + +/// HeapVisitor visits every item in an `Ast` recursively using constant stack +/// size and a heap size proportional to the size of the `Ast`. +struct HeapVisitor<'a> { + /// A stack of `Ast` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Ast, Frame<'a>)>, + /// Similar to the `Ast` stack above, but is used only for character + /// classes. In particular, character classes embed their own mini + /// recursive syntax. + stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Ast`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a ast::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a ast::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, +} + +/// Represents a single stack frame while performing structural induction over +/// a character class. +enum ClassFrame<'a> { + /// The stack frame used while visiting every child node of a union of + /// character class items. + Union { + /// The child node we are currently visiting. + head: &'a ast::ClassSetItem, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [ast::ClassSetItem], + }, + /// The stack frame used while a binary class operation. + Binary { op: &'a ast::ClassSetBinaryOp }, + /// A stack frame allocated just before descending into a binary operator's + /// left hand child node. + BinaryLHS { + op: &'a ast::ClassSetBinaryOp, + lhs: &'a ast::ClassSet, + rhs: &'a ast::ClassSet, + }, + /// A stack frame allocated just before descending into a binary operator's + /// right hand child node. + BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet }, +} + +/// A representation of the inductive step when performing structural induction +/// over a character class. +/// +/// Note that there is no analogous explicit type for the inductive step for +/// `Ast` nodes because the inductive step is just an `Ast`. For character +/// classes, the inductive step can produce one of two possible child nodes: +/// an item or a binary operation. (An item cannot be a binary operation +/// because that would imply binary operations can be unioned in the concrete +/// syntax, which is not possible.) +enum ClassInduct<'a> { + Item(&'a ast::ClassSetItem), + BinaryOp(&'a ast::ClassSetBinaryOp), +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![], stack_class: vec![] } + } + + fn visit( + &mut self, + mut ast: &'a Ast, + mut visitor: V, + ) -> Result { + self.stack.clear(); + self.stack_class.clear(); + + visitor.start(); + loop { + visitor.visit_pre(ast)?; + if let Some(x) = self.induct(ast, &mut visitor)? { + let child = x.child(); + self.stack.push((ast, x)); + ast = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(ast)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + match x { + Frame::Alternation { .. } => { + visitor.visit_alternation_in()?; + } + Frame::Concat { .. } => { + visitor.visit_concat_in()?; + } + _ => {} + } + ast = x.child(); + self.stack.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this AST, so we can post visit it now. + visitor.visit_post(post_ast)?; + } + } + } + + /// Build a stack frame for the given AST if one is needed (which occurs if + /// and only if there are child nodes in the AST). Otherwise, return None. + /// + /// If this visits a class, then the underlying visitor implementation may + /// return an error which will be passed on here. + fn induct( + &mut self, + ast: &'a Ast, + visitor: &mut V, + ) -> Result>, V::Err> { + Ok(match *ast { + Ast::ClassBracketed(ref x) => { + self.visit_class(x, visitor)?; + None + } + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { + Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) + } + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => Some(Frame::Alternation { + head: &x.asts[0], + tail: &x.asts[1..], + }), + _ => None, + }) + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } + + fn visit_class( + &mut self, + ast: &'a ast::ClassBracketed, + visitor: &mut V, + ) -> Result<(), V::Err> { + let mut ast = ClassInduct::from_bracketed(ast); + loop { + self.visit_class_pre(&ast, visitor)?; + if let Some(x) = self.induct_class(&ast) { + let child = x.child(); + self.stack_class.push((ast, x)); + ast = child; + continue; + } + self.visit_class_post(&ast, visitor)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack_class.pop() { + None => return Ok(()), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a union or a binary op, then we might have + // additional inductive steps to process. + if let Some(x) = self.pop_class(frame) { + if let ClassFrame::BinaryRHS { ref op, .. } = x { + visitor.visit_class_set_binary_op_in(op)?; + } + ast = x.child(); + self.stack_class.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this class node, so we can post visit it now. + self.visit_class_post(&post_ast, visitor)?; + } + } + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_pre( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + visitor.visit_class_set_item_pre(item)?; + } + ClassInduct::BinaryOp(op) => { + visitor.visit_class_set_binary_op_pre(op)?; + } + } + Ok(()) + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_post( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + visitor.visit_class_set_item_post(item)?; + } + ClassInduct::BinaryOp(op) => { + visitor.visit_class_set_binary_op_post(op)?; + } + } + Ok(()) + } + + /// Build a stack frame for the given class node if one is needed (which + /// occurs if and only if there are child nodes). Otherwise, return None. + fn induct_class(&self, ast: &ClassInduct<'a>) -> Option> { + match *ast { + ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => { + match x.kind { + ast::ClassSet::Item(ref item) => { + Some(ClassFrame::Union { head: item, tail: &[] }) + } + ast::ClassSet::BinaryOp(ref op) => { + Some(ClassFrame::Binary { op }) + } + } + } + ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => { + if x.items.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &x.items[0], + tail: &x.items[1..], + }) + } + } + ClassInduct::BinaryOp(op) => { + Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop_class(&self, induct: ClassFrame<'a>) -> Option> { + match induct { + ClassFrame::Union { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &tail[0], + tail: &tail[1..], + }) + } + } + ClassFrame::Binary { .. } => None, + ClassFrame::BinaryLHS { op, rhs, .. } => { + Some(ClassFrame::BinaryRHS { op, rhs }) + } + ClassFrame::BinaryRHS { .. } => None, + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child AST node to visit. + fn child(&self) -> &'a Ast { + match *self { + Frame::Repetition(rep) => &rep.ast, + Frame::Group(group) => &group.ast, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} + +impl<'a> ClassFrame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child class node to visit. + fn child(&self) -> ClassInduct<'a> { + match *self { + ClassFrame::Union { head, .. } => ClassInduct::Item(head), + ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op), + ClassFrame::BinaryLHS { ref lhs, .. } => { + ClassInduct::from_set(lhs) + } + ClassFrame::BinaryRHS { ref rhs, .. } => { + ClassInduct::from_set(rhs) + } + } + } +} + +impl<'a> ClassInduct<'a> { + fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> { + ClassInduct::from_set(&ast.kind) + } + + fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> { + match *ast { + ast::ClassSet::Item(ref item) => ClassInduct::Item(item), + ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op), + } + } +} + +impl<'a> core::fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let x = match *self { + ClassFrame::Union { .. } => "Union", + ClassFrame::Binary { .. } => "Binary", + ClassFrame::BinaryLHS { .. } => "BinaryLHS", + ClassFrame::BinaryRHS { .. } => "BinaryRHS", + }; + write!(f, "{}", x) + } +} + +impl<'a> core::fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let x = match *self { + ClassInduct::Item(it) => match *it { + ast::ClassSetItem::Empty(_) => "Item(Empty)", + ast::ClassSetItem::Literal(_) => "Item(Literal)", + ast::ClassSetItem::Range(_) => "Item(Range)", + ast::ClassSetItem::Ascii(_) => "Item(Ascii)", + ast::ClassSetItem::Perl(_) => "Item(Perl)", + ast::ClassSetItem::Unicode(_) => "Item(Unicode)", + ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)", + ast::ClassSetItem::Union(_) => "Item(Union)", + }, + ClassInduct::BinaryOp(it) => match it.kind { + ast::ClassSetBinaryOpKind::Intersection => { + "BinaryOp(Intersection)" + } + ast::ClassSetBinaryOpKind::Difference => { + "BinaryOp(Difference)" + } + ast::ClassSetBinaryOpKind::SymmetricDifference => { + "BinaryOp(SymmetricDifference)" + } + }, + }; + write!(f, "{}", x) + } +} diff --git a/vendor/regex-syntax/src/debug.rs b/vendor/regex-syntax/src/debug.rs new file mode 100644 index 0000000..a0b051b --- /dev/null +++ b/vendor/regex-syntax/src/debug.rs @@ -0,0 +1,107 @@ +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct Byte(pub(crate) u8); + +impl core::fmt::Debug for Byte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... + if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// A type that provides a human readable debug impl for arbitrary bytes. +/// +/// This generally works best when the bytes are presumed to be mostly UTF-8, +/// but will work for anything. +/// +/// N.B. This is copied nearly verbatim from regex-automata. Sigh. +pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]); + +impl<'a> core::fmt::Debug for Bytes<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8_decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(byte) => { + write!(f, r"\x{:02x}", byte)?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + // ASCII control characters except \0, \n, \r, \t + '\x01'..='\x08' + | '\x0b' + | '\x0c' + | '\x0e'..='\x19' + | '\x7f' => { + write!(f, "\\x{:02x}", u32::from(ch))?; + } + '\n' | '\r' | '\t' | _ => { + write!(f, "{}", ch.escape_debug())?; + } + } + } + write!(f, "\"")?; + Ok(()) + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +pub(crate) fn utf8_decode(bytes: &[u8]) -> Option> { + fn len(byte: u8) -> Option { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } + } + + if bytes.is_empty() { + return None; + } + let len = match len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} diff --git a/vendor/regex-syntax/src/either.rs b/vendor/regex-syntax/src/either.rs new file mode 100644 index 0000000..7ae41e4 --- /dev/null +++ b/vendor/regex-syntax/src/either.rs @@ -0,0 +1,8 @@ +/// A simple binary sum type. +/// +/// This is occasionally useful in an ad hoc fashion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Either { + Left(Left), + Right(Right), +} diff --git a/vendor/regex-syntax/src/error.rs b/vendor/regex-syntax/src/error.rs new file mode 100644 index 0000000..98869c4 --- /dev/null +++ b/vendor/regex-syntax/src/error.rs @@ -0,0 +1,311 @@ +use alloc::{ + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ast, hir}; + +/// This error type encompasses any error that can be returned by this crate. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Error { + /// An error that occurred while translating concrete syntax into abstract + /// syntax (AST). + Parse(ast::Error), + /// An error that occurred while translating abstract syntax into a high + /// level intermediate representation (HIR). + Translate(hir::Error), +} + +impl From for Error { + fn from(err: ast::Error) -> Error { + Error::Parse(err) + } +} + +impl From for Error { + fn from(err: hir::Error) -> Error { + Error::Translate(err) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + Error::Parse(ref x) => x.fmt(f), + Error::Translate(ref x) => x.fmt(f), + } + } +} + +/// A helper type for formatting nice error messages. +/// +/// This type is responsible for reporting regex parse errors in a nice human +/// readable format. Most of its complexity is from interspersing notational +/// markers pointing out the position where an error occurred. +#[derive(Debug)] +pub struct Formatter<'e, E> { + /// The original regex pattern in which the error occurred. + pattern: &'e str, + /// The error kind. It must impl fmt::Display. + err: &'e E, + /// The primary span of the error. + span: &'e ast::Span, + /// An auxiliary and optional span, in case the error needs to point to + /// two locations (e.g., when reporting a duplicate capture group name). + aux_span: Option<&'e ast::Span>, +} + +impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> { + fn from(err: &'e ast::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: err.auxiliary_span(), + } + } +} + +impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { + fn from(err: &'e hir::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: None, + } + } +} + +impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let spans = Spans::from_formatter(self); + if self.pattern.contains('\n') { + let divider = repeat_char('~', 79); + + writeln!(f, "regex parse error:")?; + writeln!(f, "{}", divider)?; + let notated = spans.notate(); + write!(f, "{}", notated)?; + writeln!(f, "{}", divider)?; + // If we have error spans that cover multiple lines, then we just + // note the line numbers. + if !spans.multi_line.is_empty() { + let mut notes = vec![]; + for span in &spans.multi_line { + notes.push(format!( + "on line {} (column {}) through line {} (column {})", + span.start.line, + span.start.column, + span.end.line, + span.end.column - 1 + )); + } + writeln!(f, "{}", notes.join("\n"))?; + } + write!(f, "error: {}", self.err)?; + } else { + writeln!(f, "regex parse error:")?; + let notated = Spans::from_formatter(self).notate(); + write!(f, "{}", notated)?; + write!(f, "error: {}", self.err)?; + } + Ok(()) + } +} + +/// This type represents an arbitrary number of error spans in a way that makes +/// it convenient to notate the regex pattern. ("Notate" means "point out +/// exactly where the error occurred in the regex pattern.") +/// +/// Technically, we can only ever have two spans given our current error +/// structure. However, after toiling with a specific algorithm for handling +/// two spans, it became obvious that an algorithm to handle an arbitrary +/// number of spans was actually much simpler. +struct Spans<'p> { + /// The original regex pattern string. + pattern: &'p str, + /// The total width that should be used for line numbers. The width is + /// used for left padding the line numbers for alignment. + /// + /// A value of `0` means line numbers should not be displayed. That is, + /// the pattern is itself only one line. + line_number_width: usize, + /// All error spans that occur on a single line. This sequence always has + /// length equivalent to the number of lines in `pattern`, where the index + /// of the sequence represents a line number, starting at `0`. The spans + /// in each line are sorted in ascending order. + by_line: Vec>, + /// All error spans that occur over one or more lines. That is, the start + /// and end position of the span have different line numbers. The spans are + /// sorted in ascending order. + multi_line: Vec, +} + +impl<'p> Spans<'p> { + /// Build a sequence of spans from a formatter. + fn from_formatter<'e, E: core::fmt::Display>( + fmter: &'p Formatter<'e, E>, + ) -> Spans<'p> { + let mut line_count = fmter.pattern.lines().count(); + // If the pattern ends with a `\n` literal, then our line count is + // off by one, since a span can occur immediately after the last `\n`, + // which is consider to be an additional line. + if fmter.pattern.ends_with('\n') { + line_count += 1; + } + let line_number_width = + if line_count <= 1 { 0 } else { line_count.to_string().len() }; + let mut spans = Spans { + pattern: &fmter.pattern, + line_number_width, + by_line: vec![vec![]; line_count], + multi_line: vec![], + }; + spans.add(fmter.span.clone()); + if let Some(span) = fmter.aux_span { + spans.add(span.clone()); + } + spans + } + + /// Add the given span to this sequence, putting it in the right place. + fn add(&mut self, span: ast::Span) { + // This is grossly inefficient since we sort after each add, but right + // now, we only ever add two spans at most. + if span.is_one_line() { + let i = span.start.line - 1; // because lines are 1-indexed + self.by_line[i].push(span); + self.by_line[i].sort(); + } else { + self.multi_line.push(span); + self.multi_line.sort(); + } + } + + /// Notate the pattern string with carents (`^`) pointing at each span + /// location. This only applies to spans that occur within a single line. + fn notate(&self) -> String { + let mut notated = String::new(); + for (i, line) in self.pattern.lines().enumerate() { + if self.line_number_width > 0 { + notated.push_str(&self.left_pad_line_number(i + 1)); + notated.push_str(": "); + } else { + notated.push_str(" "); + } + notated.push_str(line); + notated.push('\n'); + if let Some(notes) = self.notate_line(i) { + notated.push_str(¬es); + notated.push('\n'); + } + } + notated + } + + /// Return notes for the line indexed at `i` (zero-based). If there are no + /// spans for the given line, then `None` is returned. Otherwise, an + /// appropriately space padded string with correctly positioned `^` is + /// returned, accounting for line numbers. + fn notate_line(&self, i: usize) -> Option { + let spans = &self.by_line[i]; + if spans.is_empty() { + return None; + } + let mut notes = String::new(); + for _ in 0..self.line_number_padding() { + notes.push(' '); + } + let mut pos = 0; + for span in spans { + for _ in pos..(span.start.column - 1) { + notes.push(' '); + pos += 1; + } + let note_len = span.end.column.saturating_sub(span.start.column); + for _ in 0..core::cmp::max(1, note_len) { + notes.push('^'); + pos += 1; + } + } + Some(notes) + } + + /// Left pad the given line number with spaces such that it is aligned with + /// other line numbers. + fn left_pad_line_number(&self, n: usize) -> String { + let n = n.to_string(); + let pad = self.line_number_width.checked_sub(n.len()).unwrap(); + let mut result = repeat_char(' ', pad); + result.push_str(&n); + result + } + + /// Return the line number padding beginning at the start of each line of + /// the pattern. + /// + /// If the pattern is only one line, then this returns a fixed padding + /// for visual indentation. + fn line_number_padding(&self) -> usize { + if self.line_number_width == 0 { + 4 + } else { + 2 + self.line_number_width + } + } +} + +fn repeat_char(c: char, count: usize) -> String { + core::iter::repeat(c).take(count).collect() +} + +#[cfg(test)] +mod tests { + use alloc::string::ToString; + + use crate::ast::parse::Parser; + + fn assert_panic_message(pattern: &str, expected_msg: &str) { + let result = Parser::new().parse(pattern); + match result { + Ok(_) => { + panic!("regex should not have parsed"); + } + Err(err) => { + assert_eq!(err.to_string(), expected_msg.trim()); + } + } + } + + // See: https://github.com/rust-lang/regex/issues/464 + #[test] + fn regression_464() { + let err = Parser::new().parse("a{\n").unwrap_err(); + // This test checks that the error formatter doesn't panic. + assert!(!err.to_string().is_empty()); + } + + // See: https://github.com/rust-lang/regex/issues/545 + #[test] + fn repetition_quantifier_expects_a_valid_decimal() { + assert_panic_message( + r"\\u{[^}]*}", + r#" +regex parse error: + \\u{[^}]*} + ^ +error: repetition quantifier expects a valid decimal +"#, + ); + } +} diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs new file mode 100644 index 0000000..e063390 --- /dev/null +++ b/vendor/regex-syntax/src/hir/interval.rs @@ -0,0 +1,581 @@ +use core::{char, cmp, fmt::Debug, slice}; + +use alloc::vec::Vec; + +use crate::unicode; + +// This module contains an *internal* implementation of interval sets. +// +// The primary invariant that interval sets guards is canonical ordering. That +// is, every interval set contains an ordered sequence of intervals where +// no two intervals are overlapping or adjacent. While this invariant is +// occasionally broken within the implementation, it should be impossible for +// callers to observe it. +// +// Since case folding (as implemented below) breaks that invariant, we roll +// that into this API even though it is a little out of place in an otherwise +// generic interval set. (Hence the reason why the `unicode` module is imported +// here.) +// +// Some of the implementation complexity here is a result of me wanting to +// preserve the sequential representation without using additional memory. +// In many cases, we do use linear extra memory, but it is at most 2x and it +// is amortized. If we relaxed the memory requirements, this implementation +// could become much simpler. The extra memory is honestly probably OK, but +// character classes (especially of the Unicode variety) can become quite +// large, and it would be nice to keep regex compilation snappy even in debug +// builds. (In the past, I have been careless with this area of code and it has +// caused slow regex compilations in debug mode, so this isn't entirely +// unwarranted.) +// +// Tests on this are relegated to the public API of HIR in src/hir.rs. + +#[derive(Clone, Debug)] +pub struct IntervalSet { + /// A sorted set of non-overlapping ranges. + ranges: Vec, + /// While not required at all for correctness, we keep track of whether an + /// interval set has been case folded or not. This helps us avoid doing + /// redundant work if, for example, a set has already been cased folded. + /// And note that whether a set is folded or not is preserved through + /// all of the pairwise set operations. That is, if both interval sets + /// have been case folded, then any of difference, union, intersection or + /// symmetric difference all produce a case folded set. + /// + /// Note that when this is true, it *must* be the case that the set is case + /// folded. But when it's false, the set *may* be case folded. In other + /// words, we only set this to true when we know it to be case, but we're + /// okay with it being false if it would otherwise be costly to determine + /// whether it should be true. This means code cannot assume that a false + /// value necessarily indicates that the set is not case folded. + /// + /// Bottom line: this is a performance optimization. + folded: bool, +} + +impl Eq for IntervalSet {} + +// We implement PartialEq manually so that we don't consider the set's internal +// 'folded' property to be part of its identity. The 'folded' property is +// strictly an optimization. +impl PartialEq for IntervalSet { + fn eq(&self, other: &IntervalSet) -> bool { + self.ranges.eq(&other.ranges) + } +} + +impl IntervalSet { + /// Create a new set from a sequence of intervals. Each interval is + /// specified as a pair of bounds, where both bounds are inclusive. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new>(intervals: T) -> IntervalSet { + let ranges: Vec = intervals.into_iter().collect(); + // An empty set is case folded. + let folded = ranges.is_empty(); + let mut set = IntervalSet { ranges, folded }; + set.canonicalize(); + set + } + + /// Add a new interval to this set. + pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); + // We don't know whether the new interval added here is considered + // case folded, so we conservatively assume that the entire set is + // no longer case folded if it was previously. + self.folded = false; + } + + /// Return an iterator over all intervals in this set. + /// + /// The iterator yields intervals in ascending order. + pub fn iter(&self) -> IntervalSetIter<'_, I> { + IntervalSetIter(self.ranges.iter()) + } + + /// Return an immutable slice of intervals in this set. + /// + /// The sequence returned is in canonical ordering. + pub fn intervals(&self) -> &[I] { + &self.ranges + } + + /// Expand this interval set such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// This returns an error if the necessary case mapping data is not + /// available. + pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + if self.folded { + return Ok(()); + } + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + if let Err(err) = range.case_fold_simple(&mut self.ranges) { + self.canonicalize(); + return Err(err); + } + } + self.canonicalize(); + self.folded = true; + Ok(()) + } + + /// Union this set with the given set, in place. + pub fn union(&mut self, other: &IntervalSet) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + self.folded = self.folded && other.folded; + } + + /// Intersect this set with the given set, in place. + pub fn intersect(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() { + return; + } + if other.ranges.is_empty() { + self.ranges.clear(); + // An empty set is case folded. + self.folded = true; + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the intersection to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + let mut ita = 0..drain_end; + let mut itb = 0..other.ranges.len(); + let mut a = ita.next().unwrap(); + let mut b = itb.next().unwrap(); + loop { + if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { + self.ranges.push(ab); + } + let (it, aorb) = + if self.ranges[a].upper() < other.ranges[b].upper() { + (&mut ita, &mut a) + } else { + (&mut itb, &mut b) + }; + match it.next() { + Some(v) => *aorb = v, + None => break, + } + } + self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; + } + + /// Subtract the given set from this set, in place. + pub fn difference(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() || other.ranges.is_empty() { + return; + } + + // This algorithm is (to me) surprisingly complex. A search of the + // interwebs indicate that this is a potentially interesting problem. + // Folks seem to suggest interval or segment trees, but I'd like to + // avoid the overhead (both runtime and conceptual) of that. + // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // + // Remember, we can assume the canonical format invariant here, which + // says that all ranges are sorted, not overlapping and not adjacent in + // each class. + let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + + // This part is tricky and was non-obvious to me without looking + // at explicit examples (see the tests). The trickiness stems from + // two things: 1) subtracting a range from another range could + // yield two ranges and 2) after subtracting a range, it's possible + // that future ranges can have an impact. The loop below advances + // the `b` ranges until they can't possible impact the current + // range. + // + // For example, if our `a` range is `a-t` and our next three `b` + // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply + // subtraction three times before moving on to the next `a` range. + let mut range = self.ranges[a]; + while b < other.ranges.len() + && !range.is_intersection_empty(&other.ranges[b]) + { + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; + } + (Some(range1), None) | (None, Some(range1)) => range1, + (Some(range1), Some(range2)) => { + self.ranges.push(range1); + range2 + } + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; + } + // Otherwise, the next `b` range might apply to the current + // `a` range. + b += 1; + } + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + } + self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; + } + + /// Compute the symmetric difference of the two sets, in place. + /// + /// This computes the symmetric difference of two interval sets. This + /// removes all elements in this set that are also in the given set, + /// but also adds all elements from the given set that aren't in this + /// set. That is, the set will contain all elements in either set, + /// but will not contain any elements that are in both sets. + pub fn symmetric_difference(&mut self, other: &IntervalSet) { + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); + } + + /// Negate this interval set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + if self.ranges.is_empty() { + let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); + self.ranges.push(I::create(min, max)); + // The set containing everything must case folded. + self.folded = true; + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // We do checked arithmetic below because of the canonical ordering + // invariant. + if self.ranges[0].lower() > I::Bound::min_value() { + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); + } + self.ranges.drain(..drain_end); + // We don't need to update whether this set is folded or not, because + // it is conservatively preserved through negation. Namely, if a set + // is not folded, then it is possible that its negation is folded, for + // example, [^☃]. But we're fine with assuming that the set is not + // folded in that case. (`folded` permits false negatives but not false + // positives.) + // + // But what about when a set is folded, is its negation also + // necessarily folded? Yes. Because if a set is folded, then for every + // character in the set, it necessarily included its equivalence class + // of case folded characters. Negating it in turn means that all + // equivalence classes in the set are negated, and any equivalence + // class that was previously not in the set is now entirely in the set. + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + let range = self.ranges[oldi]; + self.ranges.push(range); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// An iterator over intervals. +#[derive(Debug)] +pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>); + +impl<'a, I> Iterator for IntervalSetIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option<&'a I> { + self.0.next() + } +} + +pub trait Interval: + Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord +{ + type Bound: Bound; + + fn lower(&self) -> Self::Bound; + fn upper(&self) -> Self::Bound; + fn set_lower(&mut self, bound: Self::Bound); + fn set_upper(&mut self, bound: Self::Bound); + fn case_fold_simple( + &self, + intervals: &mut Vec, + ) -> Result<(), unicode::CaseFoldError>; + + /// Create a new interval. + fn create(lower: Self::Bound, upper: Self::Bound) -> Self { + let mut int = Self::default(); + if lower <= upper { + int.set_lower(lower); + int.set_upper(upper); + } else { + int.set_lower(upper); + int.set_upper(lower); + } + int + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &Self) -> Option { + if !self.is_contiguous(other) { + return None; + } + let lower = cmp::min(self.lower(), other.lower()); + let upper = cmp::max(self.upper(), other.upper()); + Some(Self::create(lower, upper)) + } + + /// Intersect this range with the given range and return the result. + /// + /// If the intersection is empty, then this returns `None`. + fn intersect(&self, other: &Self) -> Option { + let lower = cmp::max(self.lower(), other.lower()); + let upper = cmp::min(self.upper(), other.upper()); + if lower <= upper { + Some(Self::create(lower, upper)) + } else { + None + } + } + + /// Subtract the given range from this range and return the resulting + /// ranges. + /// + /// If subtraction would result in an empty range, then no ranges are + /// returned. + fn difference(&self, other: &Self) -> (Option, Option) { + if self.is_subset(other) { + return (None, None); + } + if self.is_intersection_empty(other) { + return (Some(self.clone()), None); + } + let add_lower = other.lower() > self.lower(); + let add_upper = other.upper() < self.upper(); + // We know this because !self.is_subset(other) and the ranges have + // a non-empty intersection. + assert!(add_lower || add_upper); + let mut ret = (None, None); + if add_lower { + let upper = other.lower().decrement(); + ret.0 = Some(Self::create(self.lower(), upper)); + } + if add_upper { + let lower = other.upper().increment(); + let range = Self::create(lower, self.upper()); + if ret.0.is_none() { + ret.0 = Some(range); + } else { + ret.1 = Some(range); + } + } + ret + } + + /// Compute the symmetric difference the given range from this range. This + /// returns the union of the two ranges minus its intersection. + fn symmetric_difference( + &self, + other: &Self, + ) -> (Option, Option) { + let union = match self.union(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(union) => union, + }; + let intersection = match self.intersect(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(intersection) => intersection, + }; + union.difference(&intersection) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &Self) -> bool { + let lower1 = self.lower().as_u32(); + let upper1 = self.upper().as_u32(); + let lower2 = other.lower().as_u32(); + let upper2 = other.upper().as_u32(); + cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + cmp::max(lower1, lower2) > cmp::min(upper1, upper2) + } + + /// Returns true if and only if this range is a subset of the other range. + fn is_subset(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + (lower2 <= lower1 && lower1 <= upper2) + && (lower2 <= upper1 && upper1 <= upper2) + } +} + +pub trait Bound: + Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord +{ + fn min_value() -> Self; + fn max_value() -> Self; + fn as_u32(self) -> u32; + fn increment(self) -> Self; + fn decrement(self) -> Self; +} + +impl Bound for u8 { + fn min_value() -> Self { + u8::MIN + } + fn max_value() -> Self { + u8::MAX + } + fn as_u32(self) -> u32 { + u32::from(self) + } + fn increment(self) -> Self { + self.checked_add(1).unwrap() + } + fn decrement(self) -> Self { + self.checked_sub(1).unwrap() + } +} + +impl Bound for char { + fn min_value() -> Self { + '\x00' + } + fn max_value() -> Self { + '\u{10FFFF}' + } + fn as_u32(self) -> u32 { + u32::from(self) + } + + fn increment(self) -> Self { + match self { + '\u{D7FF}' => '\u{E000}', + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), + } + } + + fn decrement(self) -> Self { + match self { + '\u{E000}' => '\u{D7FF}', + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), + } + } +} + +// Tests for interval sets are written in src/hir.rs against the public API. diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs new file mode 100644 index 0000000..a5a3737 --- /dev/null +++ b/vendor/regex-syntax/src/hir/literal.rs @@ -0,0 +1,3214 @@ +/*! +Provides literal extraction from `Hir` expressions. + +An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a +[`Seq`] of [`Literal`]s. + +The purpose of literal extraction is generally to provide avenues for +optimizing regex searches. The main idea is that substring searches can be an +order of magnitude faster than a regex search. Therefore, if one can execute +a substring search to find candidate match locations and only run the regex +search at those locations, then it is possible for huge improvements in +performance to be realized. + +With that said, literal optimizations are generally a black art because even +though substring search is generally faster, if the number of candidates +produced is high, then it can create a lot of overhead by ping-ponging between +the substring search and the regex search. + +Here are some heuristics that might be used to help increase the chances of +effective literal optimizations: + +* Stick to small [`Seq`]s. If you search for too many literals, it's likely +to lead to substring search that is only a little faster than a regex search, +and thus the overhead of using literal optimizations in the first place might +make things slower overall. +* The literals in your [`Seq`] shouldn't be too short. In general, longer is +better. A sequence corresponding to single bytes that occur frequently in the +haystack, for example, is probably a bad literal optimization because it's +likely to produce many false positive candidates. Longer literals are less +likely to match, and thus probably produce fewer false positives. +* If it's possible to estimate the approximate frequency of each byte according +to some pre-computed background distribution, it is possible to compute a score +of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider +skipping the literal optimization and just use the regex engine. + +(It should be noted that there are always pathological cases that can make +any kind of literal optimization be a net slower result. This is why it +might be a good idea to be conservative, or to even provide a means for +literal optimizations to be dynamically disabled if they are determined to be +ineffective according to some measure.) + +You're encouraged to explore the methods on [`Seq`], which permit shrinking +the size of sequences in a preference-order preserving fashion. + +Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely, +an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types, +so it is possible to implement your own extractor. For example, for n-grams +or "inner" literals (i.e., not prefix or suffix literals). The `Extractor` +is mostly responsible for the case analysis over `Hir` expressions. Much of +the "trickier" parts are how to combine literal sequences, and that is all +implemented on [`Seq`]. +*/ + +use core::{cmp, mem, num::NonZeroUsize}; + +use alloc::{vec, vec::Vec}; + +use crate::hir::{self, Hir}; + +/// Extracts prefix or suffix literal sequences from [`Hir`] expressions. +/// +/// Literal extraction is based on the following observations: +/// +/// * Many regexes start with one or a small number of literals. +/// * Substring search for literals is often much faster (sometimes by an order +/// of magnitude) than a regex search. +/// +/// Thus, in many cases, one can search for literals to find candidate starting +/// locations of a match, and then only run the full regex engine at each such +/// location instead of over the full haystack. +/// +/// The main downside of literal extraction is that it can wind up causing a +/// search to be slower overall. For example, if there are many matches or if +/// there are many candidates that don't ultimately lead to a match, then a +/// lot of overhead will be spent in shuffing back-and-forth between substring +/// search and the regex engine. This is the fundamental reason why literal +/// optimizations for regex patterns is sometimes considered a "black art." +/// +/// # Look-around assertions +/// +/// Literal extraction treats all look-around assertions as-if they match every +/// empty string. So for example, the regex `\bquux\b` will yield a sequence +/// containing a single exact literal `quux`. However, not all occurrences +/// of `quux` correspond to a match a of the regex. For example, `\bquux\b` +/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word +/// boundary. +/// +/// In effect, if your regex contains look-around assertions, then a match of +/// an exact literal does not necessarily mean the regex overall matches. So +/// you may still need to run the regex engine in such cases to confirm the +/// match. +/// +/// The precise guarantee you get from a literal sequence is: if every literal +/// in the sequence is exact and the original regex contains zero look-around +/// assertions, then a preference-order multi-substring search of those +/// literals will precisely match a preference-order search of the original +/// regex. +/// +/// # Example +/// +/// This shows how to extract prefixes: +/// +/// ``` +/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; +/// +/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; +/// let got = Extractor::new().extract(&hir); +/// // All literals returned are "inexact" because none of them reach the +/// // match state. +/// let expected = Seq::from_iter([ +/// Literal::inexact("ax"), +/// Literal::inexact("ay"), +/// Literal::inexact("az"), +/// Literal::inexact("bx"), +/// Literal::inexact("by"), +/// Literal::inexact("bz"), +/// Literal::inexact("cx"), +/// Literal::inexact("cy"), +/// Literal::inexact("cz"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// This shows how to extract suffixes: +/// +/// ``` +/// use regex_syntax::{ +/// hir::literal::{Extractor, ExtractKind, Literal, Seq}, +/// parse, +/// }; +/// +/// let hir = parse(r"foo|[A-Z]+bar")?; +/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); +/// // Since 'foo' gets to a match state, it is considered exact. But 'bar' +/// // does not because of the '[A-Z]+', and thus is marked inexact. +/// let expected = Seq::from_iter([ +/// Literal::exact("foo"), +/// Literal::inexact("bar"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Extractor { + kind: ExtractKind, + limit_class: usize, + limit_repeat: usize, + limit_literal_len: usize, + limit_total: usize, +} + +impl Extractor { + /// Create a new extractor with a default configuration. + /// + /// The extractor can be optionally configured before calling + /// [`Extractor::extract`] to get a literal sequence. + pub fn new() -> Extractor { + Extractor { + kind: ExtractKind::Prefix, + limit_class: 10, + limit_repeat: 10, + limit_literal_len: 100, + limit_total: 250, + } + } + + /// Execute the extractor and return a sequence of literals. + pub fn extract(&self, hir: &Hir) -> Seq { + use crate::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + Seq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Capture(hir::Capture { ref sub, .. }) => self.extract(sub), + Concat(ref hirs) => match self.kind { + ExtractKind::Prefix => self.extract_concat(hirs.iter()), + ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), + }, + Alternation(ref hirs) => { + // Unlike concat, we always union starting from the beginning, + // since the beginning corresponds to the highest preference, + // which doesn't change based on forwards vs reverse. + self.extract_alternation(hirs.iter()) + } + } + } + + /// Set the kind of literal sequence to extract from an [`Hir`] expression. + /// + /// The default is to extract prefixes, but suffixes can be selected + /// instead. The contract for prefixes is that every match of the + /// corresponding `Hir` must start with one of the literals in the sequence + /// returned. Moreover, the _order_ of the sequence returned corresponds to + /// the preference order. + /// + /// Suffixes satisfy a similar contract in that every match of the + /// corresponding `Hir` must end with one of the literals in the sequence + /// returned. However, there is no guarantee that the literals are in + /// preference order. + /// + /// Remember that a sequence can be infinite. For example, unless the + /// limits are configured to be impractically large, attempting to extract + /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite + /// sequence. Generally speaking, if the sequence returned is infinite, + /// then it is presumed to be unwise to do prefix (or suffix) optimizations + /// for the pattern. + pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor { + self.kind = kind; + self + } + + /// Configure a limit on the length of the sequence that is permitted for + /// a character class. If a character class exceeds this limit, then the + /// sequence returned for it is infinite. + /// + /// This prevents classes like `[A-Z]` or `\pL` from getting turned into + /// huge and likely unproductive sequences of literals. + /// + /// # Example + /// + /// This example shows how this limit can be lowered to decrease the tolerance + /// for character classes being turned into literal sequences. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse}; + /// + /// let hir = parse(r"[0-9]")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + /// ]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_class(4).extract(&hir); + /// let expected = Seq::infinite(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_class(&mut self, limit: usize) -> &mut Extractor { + self.limit_class = limit; + self + } + + /// Configure a limit on the total number of repetitions that is permitted + /// before literal extraction is stopped. + /// + /// This is useful for limiting things like `(abcde){50}`, or more + /// insidiously, `(?:){1000000000}`. This limit prevents any one single + /// repetition from adding too much to a literal sequence. + /// + /// With this limit set, repetitions that exceed it will be stopped and any + /// literals extracted up to that point will be made inexact. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){8}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_repeat(4).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabc"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor { + self.limit_repeat = limit; + self + } + + /// Configure a limit on the maximum length of any literal in a sequence. + /// + /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While + /// each repetition or literal in that regex is small, when all the + /// repetitions are applied, one ends up with a literal of length `5^4 = + /// 625`. + /// + /// With this limit set, literals that exceed it will be made inexact and + /// thus prevented from growing. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){2}{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_literal_len(14).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabcab"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor { + self.limit_literal_len = limit; + self + } + + /// Configure a limit on the total number of literals that will be + /// returned. + /// + /// This is useful as a practical measure for avoiding the creation of + /// large sequences of literals. While the extractor will automatically + /// handle local creations of large sequences (for example, `[A-Z]` yields + /// an infinite sequence by default), large sequences can be created + /// through non-local means as well. + /// + /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9` + /// despite each of the repetitions being small on their own. This limit + /// thus represents a "catch all" for avoiding locally small sequences from + /// combining into large sequences. + /// + /// # Example + /// + /// This example shows how reducing the limit will change the literal + /// sequence returned. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"[ab]{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "aaaa", "aaab", "aaba", "aabb", + /// "abaa", "abab", "abba", "abbb", + /// "baaa", "baab", "baba", "babb", + /// "bbaa", "bbab", "bbba", "bbbb", + /// ]); + /// assert_eq!(expected, got); + /// + /// // The default limit is not too big, but big enough to extract all + /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16, + /// // then we'll get a truncated set. Notice that it returns a sequence of + /// // length 4 even though our limit was 10. This is because the sequence + /// // is difficult to increase without blowing the limit. Notice also + /// // that every literal in the sequence is now inexact because they were + /// // stripped of some suffix. + /// let got = Extractor::new().limit_total(10).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("aa"), + /// Literal::inexact("ab"), + /// Literal::inexact("ba"), + /// Literal::inexact("bb"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_total(&mut self, limit: usize) -> &mut Extractor { + self.limit_total = limit; + self + } + + /// Extract a sequence from the given concatenation. Sequences from each of + /// the child HIR expressions are combined via cross product. + /// + /// This short circuits once the cross product turns into a sequence + /// containing only inexact literals. + fn extract_concat<'a, I: Iterator>(&self, it: I) -> Seq { + let mut seq = Seq::singleton(self::Literal::exact(vec![])); + for hir in it { + // If every element in the sequence is inexact, then a cross + // product will always be a no-op. Thus, there is nothing else we + // can add to it and can quit early. Note that this also includes + // infinite sequences. + if seq.is_inexact() { + break; + } + // Note that 'cross' also dispatches based on whether we're + // extracting prefixes or suffixes. + seq = self.cross(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence from the given alternation. + /// + /// This short circuits once the union turns into an infinite sequence. + fn extract_alternation<'a, I: Iterator>( + &self, + it: I, + ) -> Seq { + let mut seq = Seq::empty(); + for hir in it { + // Once our 'seq' is infinite, every subsequent union + // operation on it will itself always result in an + // infinite sequence. Thus, it can never change and we can + // short-circuit. + if !seq.is_finite() { + break; + } + seq = self.union(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence of literals from the given repetition. We do our + /// best, Some examples: + /// + /// 'a*' => [inexact(a), exact("")] + /// 'a*?' => [exact(""), inexact(a)] + /// 'a+' => [inexact(a)] + /// 'a{3}' => [exact(aaa)] + /// 'a{3,5} => [inexact(aaa)] + /// + /// The key here really is making sure we get the 'inexact' vs 'exact' + /// attributes correct on each of the literals we add. For example, the + /// fact that 'a*' gives us an inexact 'a' and an exact empty string means + /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] + /// literals being extracted, which might actually be a better prefilter + /// than just 'a'. + fn extract_repetition(&self, rep: &hir::Repetition) -> Seq { + let mut subseq = self.extract(&rep.sub); + match *rep { + hir::Repetition { min: 0, max, greedy, .. } => { + // When 'max=1', we can retain exactness, since 'a?' is + // equivalent to 'a|'. Similarly below, 'a??' is equivalent to + // '|a'. + if max != Some(1) { + subseq.make_inexact(); + } + let mut empty = Seq::singleton(Literal::exact(vec![])); + if !greedy { + mem::swap(&mut subseq, &mut empty); + } + self.union(subseq, &mut empty) + } + hir::Repetition { min, max: Some(max), .. } if min == max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + if usize::try_from(min).is_err() || min > limit { + seq.make_inexact(); + } + seq + } + hir::Repetition { min, .. } => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + seq.make_inexact(); + seq + } + } + } + + /// Convert the given Unicode class into a sequence of literals if the + /// class is small enough. If the class is too big, return an infinite + /// sequence. + fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq { + if self.class_over_limit_unicode(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for ch in r.start()..=r.end() { + seq.push(Literal::from(ch)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Convert the given byte class into a sequence of literals if the class + /// is small enough. If the class is too big, return an infinite sequence. + fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq { + if self.class_over_limit_bytes(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for b in r.start()..=r.end() { + seq.push(Literal::from(b)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Returns true if the given Unicode class exceeds the configured limits + /// on this extractor. + fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Returns true if the given byte class exceeds the configured limits on + /// this extractor. + fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Compute the cross product of the two sequences if the result would be + /// within configured limits. Otherwise, make `seq2` infinite and cross the + /// infinite sequence with `seq1`. + fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + if let ExtractKind::Suffix = self.kind { + seq1.cross_reverse(seq2); + } else { + seq1.cross_forward(seq2); + } + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + self.enforce_literal_len(&mut seq1); + seq1 + } + + /// Union the two sequences if the result would be within configured + /// limits. Otherwise, make `seq2` infinite and union the infinite sequence + /// with `seq1`. + fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) + { + // We try to trim our literal sequences to see if we can make + // room for more literals. The idea is that we'd rather trim down + // literals already in our sequence if it means we can add a few + // more and retain a finite sequence. Otherwise, we'll union with + // an infinite sequence and that infects everything and effectively + // stops literal extraction in its tracks. + // + // We do we keep 4 bytes here? Well, it's a bit of an abstraction + // leakage. Downstream, the literals may wind up getting fed to + // the Teddy algorithm, which supports searching literals up to + // length 4. So that's why we pick that number here. Arguably this + // should be a tuneable parameter, but it seems a little tricky to + // describe. And I'm still unsure if this is the right way to go + // about culling literal sequences. + match self.kind { + ExtractKind::Prefix => { + seq1.keep_first_bytes(4); + seq2.keep_first_bytes(4); + } + ExtractKind::Suffix => { + seq1.keep_last_bytes(4); + seq2.keep_last_bytes(4); + } + } + seq1.dedup(); + seq2.dedup(); + if seq1 + .max_union_len(seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + } + seq1.union(seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + seq1 + } + + /// Applies the literal length limit to the given sequence. If none of the + /// literals in the sequence exceed the limit, then this is a no-op. + fn enforce_literal_len(&self, seq: &mut Seq) { + let len = self.limit_literal_len; + match self.kind { + ExtractKind::Prefix => seq.keep_first_bytes(len), + ExtractKind::Suffix => seq.keep_last_bytes(len), + } + } +} + +impl Default for Extractor { + fn default() -> Extractor { + Extractor::new() + } +} + +/// The kind of literals to extract from an [`Hir`] expression. +/// +/// The default extraction kind is `Prefix`. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum ExtractKind { + /// Extracts only prefix literals from a regex. + Prefix, + /// Extracts only suffix literals from a regex. + /// + /// Note that the sequence returned by suffix literals currently may + /// not correctly represent leftmost-first or "preference" order match + /// semantics. + Suffix, +} + +impl ExtractKind { + /// Returns true if this kind is the `Prefix` variant. + pub fn is_prefix(&self) -> bool { + matches!(*self, ExtractKind::Prefix) + } + + /// Returns true if this kind is the `Suffix` variant. + pub fn is_suffix(&self) -> bool { + matches!(*self, ExtractKind::Suffix) + } +} + +impl Default for ExtractKind { + fn default() -> ExtractKind { + ExtractKind::Prefix + } +} + +/// A sequence of literals. +/// +/// A `Seq` is very much like a set in that it represents a union of its +/// members. That is, it corresponds to a set of literals where at least one +/// must match in order for a particular [`Hir`] expression to match. (Whether +/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix +/// of it depends on how the `Seq` was extracted from the `Hir`.) +/// +/// It is also unlike a set in that multiple identical literals may appear, +/// and that the order of the literals in the `Seq` matters. For example, if +/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then +/// `samwise` can never match and the sequence is equivalent to `[sam]`. +/// +/// # States of a sequence +/// +/// A `Seq` has a few different logical states to consider: +/// +/// * The sequence can represent "any" literal. When this happens, the set does +/// not have a finite size. The purpose of this state is to inhibit callers +/// from making assumptions about what literals are required in order to match +/// a particular [`Hir`] expression. Generally speaking, when a set is in this +/// state, literal optimizations are inhibited. A good example of a regex that +/// will cause this sort of set to appear is `[A-Za-z]`. The character class +/// is just too big (and also too narrow) to be usefully expanded into 52 +/// different literals. (Note that the decision for when a seq should become +/// infinite is determined by the caller. A seq itself has no hard-coded +/// limits.) +/// * The sequence can be empty, in which case, it is an affirmative statement +/// that there are no literals that can match the corresponding `Hir`. +/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`. +/// * The sequence can be non-empty, in which case, at least one of the +/// literals must match in order for the corresponding `Hir` to match. +/// +/// # Example +/// +/// This example shows how literal sequences can be simplified by stripping +/// suffixes and minimizing while maintaining preference order. +/// +/// ``` +/// use regex_syntax::hir::literal::{Literal, Seq}; +/// +/// let mut seq = Seq::new(&[ +/// "farm", +/// "appliance", +/// "faraway", +/// "apple", +/// "fare", +/// "gap", +/// "applicant", +/// "applaud", +/// ]); +/// seq.keep_first_bytes(3); +/// seq.minimize_by_preference(); +/// // Notice that 'far' comes before 'app', which matches the order in the +/// // original sequence. This guarantees that leftmost-first semantics are +/// // not altered by simplifying the set. +/// let expected = Seq::from_iter([ +/// Literal::inexact("far"), +/// Literal::inexact("app"), +/// Literal::exact("gap"), +/// ]); +/// assert_eq!(expected, seq); +/// ``` +#[derive(Clone, Eq, PartialEq)] +pub struct Seq { + /// The members of this seq. + /// + /// When `None`, the seq represents all possible literals. That is, it + /// prevents one from making assumptions about specific literals in the + /// seq, and forces one to treat it as if any literal might be in the seq. + /// + /// Note that `Some(vec![])` is valid and corresponds to the empty seq of + /// literals, i.e., a regex that can never match. For example, `[a&&b]`. + /// It is distinct from `Some(vec![""])`, which corresponds to the seq + /// containing an empty string, which matches at every position. + literals: Option>, +} + +impl Seq { + /// Returns an empty sequence. + /// + /// An empty sequence matches zero literals, and thus corresponds to a + /// regex that itself can never match. + #[inline] + pub fn empty() -> Seq { + Seq { literals: Some(vec![]) } + } + + /// Returns a sequence of literals without a finite size and may contain + /// any literal. + /// + /// A sequence without finite size does not reveal anything about the + /// characteristics of the literals in its set. There are no fixed prefixes + /// or suffixes, nor are lower or upper bounds on the length of the literals + /// in the set known. + /// + /// This is useful to represent constructs in a regex that are "too big" + /// to useful represent as a sequence of literals. For example, `[A-Za-z]`. + /// When sequences get too big, they lose their discriminating nature and + /// are more likely to produce false positives, which in turn makes them + /// less likely to speed up searches. + /// + /// More pragmatically, for many regexes, enumerating all possible literals + /// is itself not possible or might otherwise use too many resources. So + /// constraining the size of sets during extraction is a practical trade + /// off to make. + #[inline] + pub fn infinite() -> Seq { + Seq { literals: None } + } + + /// Returns a sequence containing a single literal. + #[inline] + pub fn singleton(lit: Literal) -> Seq { + Seq { literals: Some(vec![lit]) } + } + + /// Returns a sequence of exact literals from the given byte strings. + #[inline] + pub fn new(it: I) -> Seq + where + I: IntoIterator, + B: AsRef<[u8]>, + { + it.into_iter().map(|b| Literal::exact(b.as_ref())).collect() + } + + /// If this is a finite sequence, return its members as a slice of + /// literals. + /// + /// The slice returned may be empty, in which case, there are no literals + /// that can match this sequence. + #[inline] + pub fn literals(&self) -> Option<&[Literal]> { + self.literals.as_deref() + } + + /// Push a literal to the end of this sequence. + /// + /// If this sequence is not finite, then this is a no-op. + /// + /// Similarly, if the most recently added item of this sequence is + /// equivalent to the literal given, then it is not added. This reflects + /// a `Seq`'s "set like" behavior, and represents a practical trade off. + /// Namely, there is never any need to have two adjacent and equivalent + /// literals in the same sequence, _and_ it is easy to detect in some + /// cases. + #[inline] + pub fn push(&mut self, lit: Literal) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + if lits.last().map_or(false, |m| m == &lit) { + return; + } + lits.push(lit); + } + + /// Make all of the literals in this sequence inexact. + /// + /// This is a no-op if this sequence is not finite. + #[inline] + pub fn make_inexact(&mut self) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + for lit in lits.iter_mut() { + lit.make_inexact(); + } + } + + /// Converts this sequence to an infinite sequence. + /// + /// This is a no-op if the sequence is already infinite. + #[inline] + pub fn make_infinite(&mut self) { + self.literals = None; + } + + /// Modify this sequence to contain the cross product between it and the + /// sequence given. + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("fooquux"), + /// Literal::exact("foobaz"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite prefixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior of this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_forward(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + let newcap = lits1.len().saturating_mul(lits2.len()); + for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) { + if !selflit.is_exact() { + lits1.push(selflit); + continue; + } + for otherlit in lits2.iter() { + let mut newlit = Literal::exact(Vec::with_capacity( + selflit.len() + otherlit.len(), + )); + newlit.extend(&selflit); + newlit.extend(&otherlit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + lits2.drain(..); + self.dedup(); + } + + /// Modify this sequence to contain the cross product between it and + /// the sequence given, where the sequences are treated as suffixes + /// instead of prefixes. Namely, the sequence `other` is *prepended* + /// to `self` (as opposed to `other` being *appended* to `self` in + /// [`Seq::cross_forward`]). + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("quuxfoo"), + /// Literal::inexact("bar"), + /// Literal::exact("bazfoo"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite suffixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior when this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_reverse(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + // We basically proceed as we do in 'cross_forward' at this point, + // except that the outer loop is now 'other' and the inner loop is now + // 'self'. That's because 'self' corresponds to suffixes and 'other' + // corresponds to the sequence we want to *prepend* to the suffixes. + let newcap = lits1.len().saturating_mul(lits2.len()); + let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); + for (i, otherlit) in lits2.drain(..).enumerate() { + for selflit in selflits.iter() { + if !selflit.is_exact() { + // If the suffix isn't exact, then we can't prepend + // anything to it. However, we still want to keep it. But + // we only want to keep one of them, to avoid duplication. + // (The duplication is okay from a correctness perspective, + // but wasteful.) + if i == 0 { + lits1.push(selflit.clone()); + } + continue; + } + let mut newlit = Literal::exact(Vec::with_capacity( + otherlit.len() + selflit.len(), + )); + newlit.extend(&otherlit); + newlit.extend(&selflit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + self.dedup(); + } + + /// A helper function the corresponds to the subtle preamble for both + /// `cross_forward` and `cross_reverse`. In effect, it handles the cases + /// of infinite sequences for both `self` and `other`, as well as ensuring + /// that literals from `other` are drained even if they aren't used. + fn cross_preamble<'a>( + &'a mut self, + other: &'a mut Seq, + ) -> Option<(&'a mut Vec, &'a mut Vec)> { + let lits2 = match other.literals { + None => { + // If our current seq contains the empty string and the seq + // we're adding matches any literal, then it follows that the + // current seq must now also match any literal. + // + // Otherwise, we just have to make sure everything in this + // sequence is inexact. + if self.min_literal_len() == Some(0) { + *self = Seq::infinite(); + } else { + self.make_inexact(); + } + return None; + } + Some(ref mut lits) => lits, + }; + let lits1 = match self.literals { + None => { + // If we aren't going to make it to the end of this routine + // where lits2 is drained, then we need to do it now. + lits2.drain(..); + return None; + } + Some(ref mut lits) => lits, + }; + Some((lits1, lits2)) + } + + /// Unions the `other` sequence into this one. + /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. This permits + /// the caller to reuse the `other` sequence in another context. + /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// // Adjacent literals are deduped, but non-adjacent literals may not be. + /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::infinite(); + /// // Infinite sequences have no finite length. + /// assert_eq!(None, seq1.len()); + /// + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // seq1 is still infinite and seq2 has been drained. + /// assert_eq!(None, seq1.len()); + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union(&mut self, other: &mut Seq) { + let lits2 = match other.literals { + None => { + // Unioning with an infinite sequence always results in an + // infinite sequence. + self.make_infinite(); + return; + } + Some(ref mut lits) => lits.drain(..), + }; + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + lits1.extend(lits2); + self.dedup(); + } + + /// Unions the `other` sequence into this one by splice the `other` + /// sequence at the position of the first zero-length literal. + /// + /// This is useful for preserving preference order semantics when combining + /// two literal sequences. For example, in the regex `(a||f)+foo`, the + /// correct preference order prefix sequence is `[a, foo, f]`. + /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. This permits + /// the caller to reuse the `other` sequence in another context. Note that + /// the literals are drained even if no union is performed as well, i.e., + /// when this sequence does not contain a zero-length literal. + /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["a", "", "f", ""]); + /// let mut seq2 = Seq::new(&["foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// // 'foo' gets spliced into seq1 where the first empty string occurs. + /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // seq1 has no zero length literals, so no splicing happens. + /// assert_eq!(Seq::new(&["foo", "bar"]), seq1); + /// // Even though no splicing happens, seq2 is still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union_into_empty(&mut self, other: &mut Seq) { + let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + let first_empty = match lits1.iter().position(|m| m.is_empty()) { + None => return, + Some(i) => i, + }; + let lits2 = match lits2 { + None => { + // Note that we are only here if we've found an empty literal, + // which implies that an infinite sequence infects this seq and + // also turns it into an infinite sequence. + self.literals = None; + return; + } + Some(lits) => lits, + }; + // Clearing out the empties needs to come before the splice because + // the splice might add more empties that we don't want to get rid + // of. Since we're splicing into the position of the first empty, the + // 'first_empty' position computed above is still correct. + lits1.retain(|m| !m.is_empty()); + lits1.splice(first_empty..first_empty, lits2); + self.dedup(); + } + + /// Deduplicate adjacent equivalent literals in this sequence. + /// + /// If adjacent literals are equivalent strings but one is exact and the + /// other inexact, the inexact literal is kept and the exact one is + /// removed. + /// + /// Deduping an infinite sequence is a no-op. + /// + /// # Example + /// + /// This example shows how literals that are duplicate byte strings but + /// are not equivalent with respect to exactness are resolved. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("foo"), + /// ]); + /// seq.dedup(); + /// + /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq); + /// ``` + #[inline] + pub fn dedup(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.dedup_by(|lit1, lit2| { + if lit1.as_bytes() != lit2.as_bytes() { + return false; + } + if lit1.is_exact() != lit2.is_exact() { + lit1.make_inexact(); + lit2.make_inexact(); + } + true + }); + } + } + + /// Sorts this sequence of literals lexicographically. + /// + /// Note that if, before sorting, if a literal that is a prefix of another + /// literal appears after it, then after sorting, the sequence will not + /// represent the same preference order match semantics. For example, + /// sorting the sequence `[samwise, sam]` yields the sequence `[sam, + /// samwise]`. Under preference order semantics, the latter sequence will + /// never match `samwise` where as the first sequence can. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["foo", "quux", "bar"]); + /// seq.sort(); + /// + /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq); + /// ``` + #[inline] + pub fn sort(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.sort(); + } + } + + /// Reverses all of the literals in this sequence. + /// + /// The order of the sequence itself is preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["oof", "rab"]); + /// seq.reverse_literals(); + /// assert_eq!(Seq::new(&["foo", "bar"]), seq); + /// ``` + #[inline] + pub fn reverse_literals(&mut self) { + if let Some(ref mut lits) = self.literals { + for lit in lits.iter_mut() { + lit.reverse(); + } + } + } + + /// Shrinks this seq to its minimal size while respecting the preference + /// order of its literals. + /// + /// While this routine will remove duplicate literals from this seq, it + /// will also remove literals that can never match in a leftmost-first or + /// "preference order" search. Similar to [`Seq::dedup`], if a literal is + /// deduped, then the one that remains is made inexact. + /// + /// This is a no-op on seqs that are empty or not finite. + /// + /// # Example + /// + /// This example shows the difference between `{sam, samwise}` and + /// `{samwise, sam}`. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // If 'sam' comes before 'samwise' and a preference order search is + /// // executed, then 'samwise' can never match. + /// let mut seq = Seq::new(&["sam", "samwise"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); + /// + /// // But if they are reversed, then it's possible for 'samwise' to match + /// // since it is given higher preference. + /// let mut seq = Seq::new(&["samwise", "sam"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::new(&["samwise", "sam"]), seq); + /// ``` + /// + /// This example shows that if an empty string is in this seq, then + /// anything that comes after it can never match. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // An empty string is a prefix of all strings, so it automatically + /// // inhibits any subsequent strings from matching. + /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// let expected = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact("bar"), + /// Literal::inexact(""), + /// ]); + /// assert_eq!(expected, seq); + /// + /// // And of course, if it's at the beginning, then it makes it impossible + /// // for anything else to match. + /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); + /// ``` + #[inline] + pub fn minimize_by_preference(&mut self) { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, false); + } + } + + /// Trims all literals in this seq such that only the first `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. + /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_first_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("fo"), + /// Literal::inexact("qu"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_first_bytes(len); + } + } + } + + /// Trims all literals in this seq such that only the last `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. + /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_last_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("oo"), + /// Literal::inexact("ux"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_last_bytes(len); + } + } + } + + /// Returns true if this sequence is finite. + /// + /// When false, this sequence is infinite and must be treated as if it + /// contains every possible literal. + #[inline] + pub fn is_finite(&self) -> bool { + self.literals.is_some() + } + + /// Returns true if and only if this sequence is finite and empty. + /// + /// An empty sequence never matches anything. It can only be produced by + /// literal extraction when the corresponding regex itself cannot match. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == Some(0) + } + + /// Returns the number of literals in this sequence if the sequence is + /// finite. If the sequence is infinite, then `None` is returned. + #[inline] + pub fn len(&self) -> Option { + self.literals.as_ref().map(|lits| lits.len()) + } + + /// Returns true if and only if all literals in this sequence are exact. + /// + /// This returns false if the sequence is infinite. + #[inline] + pub fn is_exact(&self) -> bool { + self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact())) + } + + /// Returns true if and only if all literals in this sequence are inexact. + /// + /// This returns true if the sequence is infinite. + #[inline] + pub fn is_inexact(&self) -> bool { + self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact())) + } + + /// Return the maximum length of the sequence that would result from + /// unioning `self` with `other`. If either set is infinite, then this + /// returns `None`. + #[inline] + pub fn max_union_len(&self, other: &Seq) -> Option { + let len1 = self.len()?; + let len2 = other.len()?; + Some(len1.saturating_add(len2)) + } + + /// Return the maximum length of the sequence that would result from the + /// cross product of `self` with `other`. If either set is infinite, then + /// this returns `None`. + #[inline] + pub fn max_cross_len(&self, other: &Seq) -> Option { + let len1 = self.len()?; + let len2 = other.len()?; + Some(len1.saturating_mul(len2)) + } + + /// Returns the length of the shortest literal in this sequence. + /// + /// If the sequence is infinite or empty, then this returns `None`. + #[inline] + pub fn min_literal_len(&self) -> Option { + self.literals.as_ref()?.iter().map(|x| x.len()).min() + } + + /// Returns the length of the longest literal in this sequence. + /// + /// If the sequence is infinite or empty, then this returns `None`. + #[inline] + pub fn max_literal_len(&self) -> Option { + self.literals.as_ref()?.iter().map(|x| x.len()).max() + } + + /// Returns the longest common prefix from this seq. + /// + /// If the seq matches any literal or other contains no literals, then + /// there is no meaningful prefix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common prefix. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["foo", "foobar", "fo"]); + /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// ``` + #[inline] + pub fn longest_common_prefix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common prefix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .zip(base[..len].iter()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[..len]) + } + + /// Returns the longest common suffix from this seq. + /// + /// If the seq matches any literal or other contains no literals, then + /// there is no meaningful suffix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common suffix. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["oof", "raboof", "of"]); + /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// ``` + #[inline] + pub fn longest_common_suffix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common suffix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .rev() + .zip(base[base.len() - len..].iter().rev()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[base.len() - len..]) + } + + /// Optimizes this seq while treating its literals as prefixes and + /// respecting the preference order of its literals. + /// + /// The specific way "optimization" works is meant to be an implementation + /// detail, as it essentially represents a set of heuristics. The goal + /// that optimization tries to accomplish is to make the literals in this + /// set reflect inputs that will result in a more effective prefilter. + /// Principally by reducing the false positive rate of candidates found by + /// the literals in this sequence. That is, when a match of a literal is + /// found, we would like it to be a strong predictor of the overall match + /// of the regex. If it isn't, then much time will be spent starting and + /// stopping the prefilter search and attempting to confirm the match only + /// to have it fail. + /// + /// Some of those heuristics might be: + /// + /// * Identifying a common prefix from a larger sequence of literals, and + /// shrinking the sequence down to that single common prefix. + /// * Rejecting the sequence entirely if it is believed to result in very + /// high false positive rate. When this happens, the sequence is made + /// infinite. + /// * Shrinking the sequence to a smaller number of literals representing + /// prefixes, but not shrinking it so much as to make literals too short. + /// (A sequence with very short literals, of 1 or 2 bytes, will typically + /// result in a higher false positive rate.) + /// + /// Optimization should only be run once extraction is complete. Namely, + /// optimization may make assumptions that do not compose with other + /// operations in the middle of extraction. For example, optimization will + /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation + /// is only valid if no other extraction will occur. If other extraction + /// may occur, then the correct transformation would be to `[I(sam)]`. + /// + /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but + /// for suffixes. + /// + /// # Example + /// + /// This shows how optimization might transform a sequence. Note that + /// the specific behavior is not a documented guarantee. The heuristics + /// used are an implementation detail and may change over time in semver + /// compatible releases. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert_eq!(Seq::from_iter([ + /// Literal::exact("samantha"), + /// // Kept exact even though 'samwise' got pruned + /// // because optimization assumes literal extraction + /// // has finished. + /// Literal::exact("sam"), + /// Literal::exact("frodo"), + /// ]), seq); + /// ``` + /// + /// # Example: optimization may make the sequence infinite + /// + /// If the heuristics deem that the sequence could cause a very high false + /// positive rate, then it may make the sequence infinite, effectively + /// disabling its use as a prefilter. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// // An empty string matches at every position, + /// // thus rendering the prefilter completely + /// // ineffective. + /// "", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(!seq.is_finite()); + /// ``` + /// + /// Do note that just because there is a `" "` in the sequence, that + /// doesn't mean the sequence will always be made infinite after it is + /// optimized. Namely, if the sequence is considered exact (any match + /// corresponds to an overall match of the original regex), then any match + /// is an overall match, and so the false positive rate is always `0`. + /// + /// To demonstrate this, we remove `samwise` from our sequence. This + /// results in no optimization happening and all literals remain exact. + /// Thus the entire sequence is exact, and it is kept as-is, even though + /// one is an ASCII space: + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// " ", + /// "sam", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(seq.is_finite()); + /// ``` + #[inline] + pub fn optimize_for_prefix_by_preference(&mut self) { + self.optimize_by_preference(true); + } + + /// Optimizes this seq while treating its literals as suffixes and + /// respecting the preference order of its literals. + /// + /// Optimization should only be run once extraction is complete. + /// + /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but + /// for prefixes. See its documentation for more explanation. + #[inline] + pub fn optimize_for_suffix_by_preference(&mut self) { + self.optimize_by_preference(false); + } + + fn optimize_by_preference(&mut self, prefix: bool) { + let origlen = match self.len() { + None => return, + Some(len) => len, + }; + // Just give up now if our sequence contains an empty string. + if self.min_literal_len().map_or(false, |len| len == 0) { + // We squash the sequence so that nobody else gets any bright + // ideas to try and use it. An empty string implies a match at + // every position. A prefilter cannot help you here. + self.make_infinite(); + return; + } + // Make sure we start with the smallest sequence possible. We use a + // special version of preference minimization that retains exactness. + // This is legal because optimization is only expected to occur once + // extraction is complete. + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + + // Look for a common prefix (or suffix). If we found one of those and + // it's long enough, then it's a good bet that it will be our fastest + // possible prefilter since single-substring search is so fast. + let fix = if prefix { + self.longest_common_prefix() + } else { + self.longest_common_suffix() + }; + if let Some(fix) = fix { + // As a special case, if we have a common prefix and the leading + // byte of that prefix is one that we think probably occurs rarely, + // then strip everything down to just that single byte. This should + // promote the use of memchr. + // + // ... we only do this though if our sequence has more than one + // literal. Otherwise, we'd rather just stick with a single literal + // scan. That is, using memchr is probably better than looking + // for 2 or more literals, but probably not as good as a straight + // memmem search. + // + // ... and also only do this when the prefix is short and probably + // not too discriminatory anyway. If it's longer, then it's + // probably quite discriminatory and thus is likely to have a low + // false positive rate. + if prefix + && origlen > 1 + && fix.len() >= 1 + && fix.len() <= 3 + && rank(fix[0]) < 200 + { + self.keep_first_bytes(1); + self.dedup(); + return; + } + // We only strip down to the common prefix/suffix if we think + // the existing set of literals isn't great, or if the common + // prefix/suffix is expected to be particularly discriminatory. + let isfast = + self.is_exact() && self.len().map_or(false, |len| len <= 16); + let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast); + if usefix { + // If we keep exactly the number of bytes equal to the length + // of the prefix (or suffix), then by the definition of a + // prefix, every literal in the sequence will be equivalent. + // Thus, 'dedup' will leave us with one literal. + // + // We do it this way to avoid an alloc, but also to make sure + // the exactness of literals is kept (or not). + if prefix { + self.keep_first_bytes(fix.len()); + } else { + self.keep_last_bytes(fix.len()); + } + self.dedup(); + assert_eq!(Some(1), self.len()); + // We still fall through here. In particular, we want our + // longest common prefix to be subject to the poison check. + } + } + // If we have an exact sequence, we *probably* just want to keep it + // as-is. But there are some cases where we don't. So we save a copy of + // the exact sequence now, and then try to do some more optimizations + // below. If those don't work out, we go back to this exact sequence. + // + // The specific motivation for this is that we sometimes wind up with + // an exact sequence with a hefty number of literals. Say, 100. If we + // stuck with that, it would be too big for Teddy and would result in + // using Aho-Corasick. Which is fine... but the lazy DFA is plenty + // suitable in such cases. The real issue is that we will wind up not + // using a fast prefilter at all. So in cases like this, even though + // we have an exact sequence, it would be better to try and shrink the + // sequence (which we do below) and use it as a prefilter that can + // produce false positive matches. + // + // But if the shrinking below results in a sequence that "sucks," then + // we don't want to use that because we already have an exact sequence + // in hand. + let exact: Option = + if self.is_exact() { Some(self.clone()) } else { None }; + // Now we attempt to shorten the sequence. The idea here is that we + // don't want to look for too many literals, but we want to shorten + // our sequence enough to improve our odds of using better algorithms + // downstream (such as Teddy). + // + // The pair of numbers in this list corresponds to the maximal prefix + // (in bytes) to keep for all literals and the length of the sequence + // at which to do it. + // + // So for example, the pair (3, 500) would mean, "if we have more than + // 500 literals in our sequence, then truncate all of our literals + // such that they are at most 3 bytes in length and the minimize the + // sequence." + const ATTEMPTS: [(usize, usize); 5] = + [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)]; + for (keep, limit) in ATTEMPTS { + let len = match self.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + if prefix { + self.keep_first_bytes(keep); + } else { + self.keep_last_bytes(keep); + } + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + } + // Check for a poison literal. A poison literal is one that is short + // and is believed to have a very high match count. These poisons + // generally lead to a prefilter with a very high false positive rate, + // and thus overall worse performance. + // + // We do this last because we could have gone from a non-poisonous + // sequence to a poisonous one. Perhaps we should add some code to + // prevent such transitions in the first place, but then again, we + // likely only made the transition in the first place if the sequence + // was itself huge. And huge sequences are themselves poisonous. So... + if let Some(lits) = self.literals() { + if lits.iter().any(|lit| lit.is_poisonous()) { + self.make_infinite(); + } + } + // OK, if we had an exact sequence before attempting more optimizations + // above and our post-optimized sequence sucks for some reason or + // another, then we go back to the exact sequence. + if let Some(exact) = exact { + // If optimizing resulted in dropping our literals, then certainly + // backup and use the exact sequence that we had. + if !self.is_finite() { + *self = exact; + return; + } + // If our optimized sequence contains a short literal, then it's + // *probably* not so great. So throw it away and revert to the + // exact sequence. + if self.min_literal_len().map_or(true, |len| len <= 2) { + *self = exact; + return; + } + // Finally, if our optimized sequence is "big" (i.e., can't use + // Teddy), then also don't use it and rely on the exact sequence. + if self.len().map_or(true, |len| len > 64) { + *self = exact; + return; + } + } + } +} + +impl core::fmt::Debug for Seq { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Seq")?; + if let Some(lits) = self.literals() { + f.debug_list().entries(lits.iter()).finish() + } else { + write!(f, "[∞]") + } + } +} + +impl FromIterator for Seq { + fn from_iter>(it: T) -> Seq { + let mut seq = Seq::empty(); + for literal in it { + seq.push(literal); + } + seq + } +} + +/// A single literal extracted from an [`Hir`] expression. +/// +/// A literal is composed of two things: +/// +/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided. +/// In particular, even if the regex a literal is extracted from is UTF-8, the +/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`] +/// limit resulted in trimming a literal in a way that splits a codepoint.) +/// * Whether the literal is "exact" or not. An "exact" literal means that it +/// has not been trimmed, and may continue to be extended. If a literal is +/// "exact" after visiting the entire `Hir` expression, then this implies that +/// the literal leads to a match state. (Although it doesn't necessarily imply +/// all occurrences of the literal correspond to a match of the regex, since +/// literal extraction ignores look-around assertions.) +#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)] +pub struct Literal { + bytes: Vec, + exact: bool, +} + +impl Literal { + /// Returns a new exact literal containing the bytes given. + #[inline] + pub fn exact>>(bytes: B) -> Literal { + Literal { bytes: bytes.into(), exact: true } + } + + /// Returns a new inexact literal containing the bytes given. + #[inline] + pub fn inexact>>(bytes: B) -> Literal { + Literal { bytes: bytes.into(), exact: false } + } + + /// Returns the bytes in this literal. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Yields ownership of the bytes inside this literal. + /// + /// Note that this throws away whether the literal is "exact" or not. + #[inline] + pub fn into_bytes(self) -> Vec { + self.bytes + } + + /// Returns the length of this literal in bytes. + #[inline] + pub fn len(&self) -> usize { + self.as_bytes().len() + } + + /// Returns true if and only if this literal has zero bytes. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if this literal is exact. + #[inline] + pub fn is_exact(&self) -> bool { + self.exact + } + + /// Marks this literal as inexact. + /// + /// Inexact literals can never be extended. For example, + /// [`Seq::cross_forward`] will not extend inexact literals. + #[inline] + pub fn make_inexact(&mut self) { + self.exact = false; + } + + /// Reverse the bytes in this literal. + #[inline] + pub fn reverse(&mut self) { + self.bytes.reverse(); + } + + /// Extend this literal with the literal given. + /// + /// If this literal is inexact, then this is a no-op. + #[inline] + pub fn extend(&mut self, lit: &Literal) { + if !self.is_exact() { + return; + } + self.bytes.extend_from_slice(&lit.bytes); + } + + /// Trims this literal such that only the first `len` bytes remain. If + /// this literal has fewer than `len` bytes, then it remains unchanged. + /// Otherwise, the literal is marked as inexact. + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if len >= self.len() { + return; + } + self.make_inexact(); + self.bytes.truncate(len); + } + + /// Trims this literal such that only the last `len` bytes remain. If this + /// literal has fewer than `len` bytes, then it remains unchanged. + /// Otherwise, the literal is marked as inexact. + #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if len >= self.len() { + return; + } + self.make_inexact(); + self.bytes.drain(..self.len() - len); + } + + /// Returns true if it is believe that this literal is likely to match very + /// frequently, and is thus not a good candidate for a prefilter. + fn is_poisonous(&self) -> bool { + self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250) + } +} + +impl From for Literal { + fn from(byte: u8) -> Literal { + Literal::exact(vec![byte]) + } +} + +impl From for Literal { + fn from(ch: char) -> Literal { + use alloc::string::ToString; + Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string()) + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let tag = if self.exact { "E" } else { "I" }; + f.debug_tuple(tag) + .field(&crate::debug::Bytes(self.as_bytes())) + .finish() + } +} + +/// A "preference" trie that rejects literals that will never match when +/// executing a leftmost first or "preference" search. +/// +/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be +/// rejected because 'samwise' can never match since 'sam' will always take +/// priority. However, if 'samwise' is inserted first, then inserting 'sam' +/// after it is accepted. In this case, either 'samwise' or 'sam' can match in +/// a "preference" search. +/// +/// Note that we only use this trie as a "set." That is, given a sequence of +/// literals, we insert each one in order. An `insert` will reject a literal +/// if a prefix of that literal already exists in the trie. Thus, to rebuild +/// the "minimal" sequence, we simply only keep literals that were successfully +/// inserted. (Since we don't need traversal, one wonders whether we can make +/// some simplifications here, but I haven't given it a ton of thought and I've +/// never seen this show up on a profile. Because of the heuristic limits +/// imposed on literal extractions, the size of the inputs here is usually +/// very small.) +#[derive(Debug)] +struct PreferenceTrie { + /// The states in this trie. The index of a state in this vector is its ID. + states: Vec, + /// This vec indicates which states are match states. It always has + /// the same length as `states` and is indexed by the same state ID. + /// A state with identifier `sid` is a match state if and only if + /// `matches[sid].is_some()`. The option contains the index of the literal + /// corresponding to the match. The index is offset by 1 so that it fits in + /// a NonZeroUsize. + matches: Vec>, + /// The index to allocate to the next literal added to this trie. Starts at + /// 1 and increments by 1 for every literal successfully added to the trie. + next_literal_index: usize, +} + +/// A single state in a trie. Uses a sparse representation for its transitions. +#[derive(Debug, Default)] +struct State { + /// Sparse representation of the transitions out of this state. Transitions + /// are sorted by byte. There is at most one such transition for any + /// particular byte. + trans: Vec<(u8, usize)>, +} + +impl PreferenceTrie { + /// Minimizes the given sequence of literals while preserving preference + /// order semantics. + /// + /// When `keep_exact` is true, the exactness of every literal retained is + /// kept. This is useful when dealing with a fully extracted `Seq` that + /// only contains exact literals. In that case, we can keep all retained + /// literals as exact because we know we'll never need to match anything + /// after them and because any removed literals are guaranteed to never + /// match. + fn minimize(literals: &mut Vec, keep_exact: bool) { + let mut trie = PreferenceTrie { + states: vec![], + matches: vec![], + next_literal_index: 1, + }; + let mut make_inexact = vec![]; + literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i.checked_sub(1).unwrap()); + } + false + } + }); + for i in make_inexact { + literals[i].make_inexact(); + } + } + + /// Returns `Ok` if the given byte string is accepted into this trie and + /// `Err` otherwise. The index for the success case corresponds to the + /// index of the literal added. The index for the error case corresponds to + /// the index of the literal already in the trie that prevented the given + /// byte string from being added. (Which implies it is a prefix of the one + /// given.) + /// + /// In short, the byte string given is accepted into the trie if and only + /// if it is possible for it to match when executing a preference order + /// search. + fn insert(&mut self, bytes: &[u8]) -> Result { + let mut prev = self.root(); + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); + } + for &b in bytes.iter() { + match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) { + Ok(i) => { + prev = self.states[prev].trans[i].1; + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); + } + } + Err(i) => { + let next = self.create_state(); + self.states[prev].trans.insert(i, (b, next)); + prev = next; + } + } + } + let idx = self.next_literal_index; + self.next_literal_index += 1; + self.matches[prev] = NonZeroUsize::new(idx); + Ok(idx) + } + + /// Returns the root state ID, and if it doesn't exist, creates it. + fn root(&mut self) -> usize { + if !self.states.is_empty() { + 0 + } else { + self.create_state() + } + } + + /// Creates a new empty state and returns its ID. + fn create_state(&mut self) -> usize { + let id = self.states.len(); + self.states.push(State::default()); + self.matches.push(None); + id + } +} + +/// Returns the "rank" of the given byte. +/// +/// The minimum rank value is `0` and the maximum rank value is `255`. +/// +/// The rank of a byte is derived from a heuristic background distribution of +/// relative frequencies of bytes. The heuristic says that lower the rank of a +/// byte, the less likely that byte is to appear in any arbitrary haystack. +pub fn rank(byte: u8) -> u8 { + crate::rank::BYTE_FREQUENCIES[usize::from(byte)] +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(pattern: &str) -> Hir { + crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() + } + + fn prefixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + #[allow(non_snake_case)] + fn E(x: &str) -> Literal { + Literal::exact(x.as_bytes()) + } + + #[allow(non_snake_case)] + fn I(x: &str) -> Literal { + Literal::inexact(x.as_bytes()) + } + + fn seq>(it: I) -> Seq { + Seq::from_iter(it) + } + + fn infinite() -> (Seq, Seq) { + (Seq::infinite(), Seq::infinite()) + } + + fn inexact(it1: I1, it2: I2) -> (Seq, Seq) + where + I1: IntoIterator, + I2: IntoIterator, + { + (Seq::from_iter(it1), Seq::from_iter(it2)) + } + + fn exact, I: IntoIterator>(it: I) -> (Seq, Seq) { + let s1 = Seq::new(it); + let s2 = s1.clone(); + (s1, s2) + } + + fn opt, I: IntoIterator>(it: I) -> (Seq, Seq) { + let (mut p, mut s) = exact(it); + p.optimize_for_prefix_by_preference(); + s.optimize_for_suffix_by_preference(); + (p, s) + } + + #[test] + fn literal() { + assert_eq!(exact(["a"]), e("a")); + assert_eq!(exact(["aaaaa"]), e("aaaaa")); + assert_eq!(exact(["A", "a"]), e("(?i-u)a")); + assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); + assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); + + assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)")); + + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["☃"]), e("☃")); + assert_eq!(exact(["☃"]), e("(?i)☃")); + assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); + + assert_eq!(exact(["Δ"]), e("Δ")); + assert_eq!(exact(["δ"]), e("δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); + + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); + } + + let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; + assert_eq!(exact([letters]), e(letters)); + } + + #[test] + fn class() { + assert_eq!(exact(["a", "b", "c"]), e("[abc]")); + assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); + assert_eq!(exact(["δ", "ε"]), e("[εδ]")); + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); + } + } + + #[test] + fn look() { + assert_eq!(exact(["ab"]), e(r"a\Ab")); + assert_eq!(exact(["ab"]), e(r"a\zb")); + assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); + assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); + assert_eq!(exact(["ab"]), e(r"a\bb")); + assert_eq!(exact(["ab"]), e(r"a\Bb")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); + + assert_eq!(exact(["ab"]), e(r"^ab")); + assert_eq!(exact(["ab"]), e(r"$ab")); + assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); + assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); + assert_eq!(exact(["ab"]), e(r"\bab")); + assert_eq!(exact(["ab"]), e(r"\Bab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); + + assert_eq!(exact(["ab"]), e(r"ab^")); + assert_eq!(exact(["ab"]), e(r"ab$")); + assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); + assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); + assert_eq!(exact(["ab"]), e(r"ab\b")); + assert_eq!(exact(["ab"]), e(r"ab\B")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); + + let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")])); + assert_eq!(expected, e(r"^aZ*b")); + } + + #[test] + fn repetition() { + assert_eq!(exact(["a", ""]), e(r"a?")); + assert_eq!(exact(["", "a"]), e(r"a??")); + assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*")); + assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"a+")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+")); + + assert_eq!(exact(["ab"]), e(r"aZ{0}b")); + assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); + assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); + assert_eq!( + inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]), + e(r"aZ*b") + ); + assert_eq!( + inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]), + e(r"aZ*?b") + ); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b")); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b")); + + assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); + assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b")); + + assert_eq!(exact(["abc", ""]), e(r"(abc)?")); + assert_eq!(exact(["", "abc"]), e(r"(abc)??")); + + assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b")); + assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b")); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+")); + + // FIXME: The suffixes for this don't look quite right to me. I think + // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I + // think is that suffixes are computed by iterating over concatenations + // in reverse, and then [bc, ac, c] ordering is indeed correct from + // that perspective. We also test a few more equivalent regexes, and + // we get the same result, so it is consistent at least I suppose. + // + // The reason why this isn't an issue is that it only messes up + // preference order, and currently, suffixes are never used in a + // context where preference order matters. For prefixes it matters + // because we sometimes want to use prefilters without confirmation + // when all of the literals are exact (and there's no look-around). But + // we never do that for suffixes. Any time we use suffixes, we always + // include a confirmation step. If that ever changes, then it's likely + // this bug will need to be fixed, but last time I looked, it appears + // hard to do so. + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"a*b*c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+)?(b+)?c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+|)(b+|)c") + ); + // A few more similarish but not identical regexes. These may have a + // similar problem as above. + assert_eq!( + inexact( + [I("a"), I("b"), I("c"), E("")], + [I("c"), I("b"), I("a"), E("")] + ), + e(r"a*b*c*") + ); + assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+")); + assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c")); + assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*")); + assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*")); + assert_eq!( + inexact([I("ab"), E("ac")], [I("bc"), E("ac")]), + e(r"ab*c") + ); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c")); + + assert_eq!( + inexact([I("z"), E("azb")], [I("zazb"), E("azb")]), + e(r"z*azb") + ); + + let expected = + exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); + assert_eq!(expected, e(r"[ab]{3}")); + let expected = inexact( + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + ); + assert_eq!(expected, e(r"[ab]{3,4}")); + } + + #[test] + fn concat() { + let empty: [&str; 0] = []; + + assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); + assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); + assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); + assert_eq!(exact(empty), e(r"abc[a&&b]xyz")); + assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); + } + + #[test] + fn alternation() { + assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); + assert_eq!( + inexact( + [E("abc"), I("mZ"), E("mo"), E("xyz")], + [E("abc"), I("Zo"), E("mo"), E("xyz")] + ), + e(r"abc|mZ*o|xyz") + ); + assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); + assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); + + assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa")); + assert_eq!( + inexact( + [I("aaa"), E(""), I("aaaaa"), E("aa")], + [I("aaa"), E(""), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*") + ); + assert_eq!( + inexact( + [E(""), I("aaa"), E("aa"), I("aaaaa")], + [E(""), I("aaa"), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*?") + ); + + assert_eq!( + inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]), + e(r"a|b*") + ); + assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+")); + + assert_eq!( + inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]), + e(r"a*b|c") + ); + + assert_eq!( + inexact( + [E("a"), E("b"), I("c"), E("")], + [E("a"), E("b"), I("c"), E("")] + ), + e(r"a|(?:b|c*)") + ); + + assert_eq!( + inexact( + [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")], + [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")], + ), + e(r"(a|b)*c|(a|ab)*c") + ); + + assert_eq!( + exact(["abef", "abgh", "cdef", "cdgh"]), + e(r"(ab|cd)(ef|gh)") + ); + assert_eq!( + exact([ + "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", + "cdghij", "cdghkl", + ]), + e(r"(ab|cd)(ef|gh)(ij|kl)") + ); + + assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); + } + + #[test] + fn impossible() { + let empty: [&str; 0] = []; + + assert_eq!(exact(empty), e(r"[a&&b]")); + assert_eq!(exact(empty), e(r"a[a&&b]")); + assert_eq!(exact(empty), e(r"[a&&b]b")); + assert_eq!(exact(empty), e(r"a[a&&b]b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); + assert_eq!(exact([""]), e(r"[a&&b]*")); + assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); + } + + // This tests patterns that contain something that defeats literal + // detection, usually because it would blow some limit on the total number + // of literals that can be returned. + // + // The main idea is that when literal extraction sees something that + // it knows will blow a limit, it replaces it with a marker that says + // "any literal will match here." While not necessarily true, the + // over-estimation is just fine for the purposes of literal extraction, + // because the imprecision doesn't matter: too big is too big. + // + // This is one of the trickier parts of literal extraction, since we need + // to make sure all of our literal extraction operations correctly compose + // with the markers. + #[test] + fn anything() { + assert_eq!(infinite(), e(r".")); + assert_eq!(infinite(), e(r"(?s).")); + assert_eq!(infinite(), e(r"[A-Za-z]")); + assert_eq!(infinite(), e(r"[A-Z]")); + assert_eq!(exact([""]), e(r"[A-Z]{0}")); + assert_eq!(infinite(), e(r"[A-Z]?")); + assert_eq!(infinite(), e(r"[A-Z]*")); + assert_eq!(infinite(), e(r"[A-Z]+")); + assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]")); + assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2")); + assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123")); + assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+")); + assert_eq!(infinite(), e(r"1|[A-Z]|3")); + assert_eq!( + (seq([E("1"), I("2"), E("3")]), Seq::infinite()), + e(r"1|2[A-Z]|3"), + ); + assert_eq!( + (Seq::infinite(), seq([E("1"), I("2"), E("3")])), + e(r"1|[A-Z]2|3"), + ); + assert_eq!( + (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])), + e(r"1|2[A-Z]3|4"), + ); + assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2")); + assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z")); + } + + // Like the 'anything' test, but it uses smaller limits in order to test + // the logic for effectively aborting literal extraction when the seqs get + // too big. + #[test] + fn anything_small_limits() { + fn prefixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Prefix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Suffix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + assert_eq!( + ( + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]), + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]) + ), + e(r"[ab]{3}{3}") + ); + + assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz")); + } + + #[test] + fn empty() { + assert_eq!(exact([""]), e(r"")); + assert_eq!(exact([""]), e(r"^")); + assert_eq!(exact([""]), e(r"$")); + assert_eq!(exact([""]), e(r"(?m:^)")); + assert_eq!(exact([""]), e(r"(?m:$)")); + assert_eq!(exact([""]), e(r"\b")); + assert_eq!(exact([""]), e(r"\B")); + assert_eq!(exact([""]), e(r"(?-u:\b)")); + assert_eq!(exact([""]), e(r"(?-u:\B)")); + } + + #[test] + fn odds_and_ends() { + assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a")); + assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a.")); + assert_eq!(infinite(), e(r"a|.")); + assert_eq!(infinite(), e(r".|a")); + + let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]"; + let expected = inexact( + ["Mo'am", "Moam", "Mu'am", "Muam"].map(I), + [ + "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi", + "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy", + "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi", + "zzafy", "zafi", "zafy", + ] + .map(I), + ); + assert_eq!(expected, e(pat)); + + assert_eq!( + (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()), + e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"), + ); + assert_eq!( + inexact([I("foo")], [I("quux")]), + e(r"foo[A-Z]+bar[A-Z]+quux") + ); + assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+")); + assert_eq!( + exact(["Sherlock Holmes"]), + e(r"(?m)^Sherlock Holmes|Sherlock Holmes$") + ); + + assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])")); + } + + // This tests a specific regex along with some heuristic steps to reduce + // the sequences extracted. This is meant to roughly correspond to the + // types of heuristics used to shrink literal sets in practice. (Shrinking + // is done because you want to balance "spend too much work looking for + // too many literals" and "spend too much work processing false positive + // matches from short literals.") + #[test] + #[cfg(feature = "unicode-case")] + fn holmes() { + let expected = inexact( + ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I), + [ + "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS", + "mes", + ] + .map(I), + ); + let (mut prefixes, mut suffixes) = e(r"(?i)Holmes"); + prefixes.keep_first_bytes(3); + suffixes.keep_last_bytes(3); + prefixes.minimize_by_preference(); + suffixes.minimize_by_preference(); + assert_eq!(expected, (prefixes, suffixes)); + } + + // This tests that we get some kind of literals extracted for a beefier + // alternation with case insensitive mode enabled. At one point during + // development, this returned nothing, and motivated some special case + // code in Extractor::union to try and trim down the literal sequences + // if the union would blow the limits set. + #[test] + #[cfg(feature = "unicode-case")] + fn holmes_alt() { + let mut pre = + prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"); + assert!(pre.len().unwrap() > 0); + pre.optimize_for_prefix_by_preference(); + assert!(pre.len().unwrap() > 0); + } + + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // We test this here to ensure literal extraction completes in reasonable + // time and isn't materially impacted by these sorts of pathological + // repeats. + #[test] + fn crazy_repeats() { + assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}")); + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){64}{64}{64}{64}{64}{64}") + ); + assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}")); + assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}")); + + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + let repa = "a".repeat(100); + assert_eq!( + inexact([I(&repa)], [I(&repa)]), + e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + } + + #[test] + fn huge() { + let pat = r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? + ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? + )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#; + // TODO: This is a good candidate of a seq of literals that could be + // shrunk quite a bit and still be very productive with respect to + // literal optimizations. + let (prefixes, suffixes) = e(pat); + assert!(!suffixes.is_finite()); + assert_eq!(Some(243), prefixes.len()); + } + + #[test] + fn optimize() { + // This gets a common prefix that isn't too short. + let (p, s) = + opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); + assert_eq!(seq([I("foobar")]), p); + assert_eq!(seq([I("foobar")]), s); + + // This also finds a common prefix, but since it's only one byte, it + // prefers the multiple literals. + let (p, s) = opt(["abba", "akka", "abccba"]); + assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); + + let (p, s) = opt(["sam", "samwise"]); + assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); + + // The empty string is poisonous, so our seq becomes infinite, even + // though all literals are exact. + let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); + assert!(!p.is_finite()); + assert!(!s.is_finite()); + + // A space is also poisonous, so our seq becomes infinite. But this + // only gets triggered when we don't have a completely exact sequence. + // When the sequence is exact, spaces are okay, since we presume that + // any prefilter will match a space more quickly than the regex engine. + // (When the sequence is exact, there's a chance of the prefilter being + // used without needing the regex engine at all.) + let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); + p.optimize_for_prefix_by_preference(); + assert!(!p.is_finite()); + } +} diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs new file mode 100644 index 0000000..ce38ead --- /dev/null +++ b/vendor/regex-syntax/src/hir/mod.rs @@ -0,0 +1,3861 @@ +/*! +Defines a high-level intermediate (HIR) representation for regular expressions. + +The HIR is represented by the [`Hir`] type, and it principally constructed via +[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users +may use the smart constructors defined on `Hir` to build their own by hand. The +smart constructors simultaneously simplify and "optimize" the HIR, and are also +the same routines used by translation. + +Most regex engines only have an HIR like this, and usually construct it +directly from the concrete syntax. This crate however first parses the +concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, +as mentioned above. It's done this way to facilitate better error reporting, +and to have a structured representation of a regex that faithfully represents +its concrete syntax. Namely, while an `Hir` value can be converted back to an +equivalent regex pattern string, it is unlikely to look like the original due +to its simplified structure. +*/ + +use core::{char, cmp}; + +use alloc::{ + boxed::Box, + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::Span, + hir::interval::{Interval, IntervalSet, IntervalSetIter}, + unicode, +}; + +pub use crate::{ + hir::visitor::{visit, Visitor}, + unicode::CaseFoldError, +}; + +mod interval; +pub mod literal; +pub mod print; +pub mod translate; +mod visitor; + +/// An error that can occur while translating an `Ast` to a `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the translator's Ast was parsed from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error, derived from the Ast given to the translator. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } +} + +/// The type of an error that occurred while building an `Hir`. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// This error occurs when a Unicode feature is used when Unicode + /// support is disabled. For example `(?-u:\pL)` would trigger this error. + UnicodeNotAllowed, + /// This error occurs when translating a pattern that could match a byte + /// sequence that isn't UTF-8 and `utf8` was enabled. + InvalidUtf8, + /// This error occurs when one uses a non-ASCII byte for a line terminator, + /// but where Unicode mode is enabled and UTF-8 mode is disabled. + InvalidLineTerminator, + /// This occurs when an unrecognized Unicode property name could not + /// be found. + UnicodePropertyNotFound, + /// This occurs when an unrecognized Unicode property value could not + /// be found. + UnicodePropertyValueNotFound, + /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or + /// `\d`) could not be found. This can occur when the `unicode-perl` + /// crate feature is not enabled. + UnicodePerlClassNotFound, + /// This occurs when the Unicode simple case mapping tables are not + /// available, and the regular expression required Unicode aware case + /// insensitivity. + UnicodeCaseUnavailable, +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::ErrorKind::*; + + let msg = match *self { + UnicodeNotAllowed => "Unicode not allowed here", + InvalidUtf8 => "pattern can match invalid UTF-8", + InvalidLineTerminator => "invalid line terminator, must be ASCII", + UnicodePropertyNotFound => "Unicode property not found", + UnicodePropertyValueNotFound => "Unicode property value not found", + UnicodePerlClassNotFound => { + "Unicode-aware Perl class not found \ + (make sure the unicode-perl feature is enabled)" + } + UnicodeCaseUnavailable => { + "Unicode-aware case insensitivity matching is not available \ + (make sure the unicode-case feature is enabled)" + } + }; + f.write_str(msg) + } +} + +/// A high-level intermediate representation (HIR) for a regular expression. +/// +/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. +/// An `HirKind` indicates what kind of regular expression it is (a literal, +/// a repetition, a look-around assertion, etc.), where as a `Properties` +/// describes various facts about the regular expression. For example, whether +/// it matches UTF-8 or if it matches the empty string. +/// +/// The HIR of a regular expression represents an intermediate step between +/// its abstract syntax (a structured description of the concrete syntax) and +/// an actual regex matcher. The purpose of HIR is to make regular expressions +/// easier to analyze. In particular, the AST is much more complex than the +/// HIR. For example, while an AST supports arbitrarily nested character +/// classes, the HIR will flatten all nested classes into a single set. The HIR +/// will also "compile away" every flag present in the concrete syntax. For +/// example, users of HIR expressions never need to worry about case folding; +/// it is handled automatically by the translator (e.g., by translating +/// `(?i:A)` to `[aA]`). +/// +/// The specific type of an HIR expression can be accessed via its `kind` +/// or `into_kind` methods. This extra level of indirection exists for two +/// reasons: +/// +/// 1. Construction of an HIR expression *must* use the constructor methods on +/// this `Hir` type instead of building the `HirKind` values directly. This +/// permits construction to enforce invariants like "concatenations always +/// consist of two or more sub-expressions." +/// 2. Every HIR expression contains attributes that are defined inductively, +/// and can be computed cheaply during the construction process. For example, +/// one such attribute is whether the expression must match at the beginning of +/// the haystack. +/// +/// In particular, if you have an `HirKind` value, then there is intentionally +/// no way to build an `Hir` value from it. You instead need to do case +/// analysis on the `HirKind` value and build the `Hir` value using its smart +/// constructors. +/// +/// # UTF-8 +/// +/// If the HIR was produced by a translator with +/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled, +/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty +/// matches. +/// +/// For empty matches, those can occur at any position. It is the +/// responsibility of the regex engine to determine whether empty matches are +/// permitted between the code units of a single codepoint. +/// +/// # Stack space +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. +/// +/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular +/// expression pattern string, and uses constant stack space and heap space +/// proportional to the size of the `Hir`. The regex it prints is guaranteed to +/// be _semantically_ equivalent to the original concrete syntax, but it may +/// look very different. (And potentially not practically readable by a human.) +/// +/// An `Hir`'s `fmt::Debug` implementation currently does not use constant +/// stack space. The implementation will also suppress some details (such as +/// the `Properties` inlined into every `Hir` value to make it less noisy). +#[derive(Clone, Eq, PartialEq)] +pub struct Hir { + /// The underlying HIR kind. + kind: HirKind, + /// Analysis info about this HIR, computed during construction. + props: Properties, +} + +/// Methods for accessing the underlying `HirKind` and `Properties`. +impl Hir { + /// Returns a reference to the underlying HIR kind. + pub fn kind(&self) -> &HirKind { + &self.kind + } + + /// Consumes ownership of this HIR expression and returns its underlying + /// `HirKind`. + pub fn into_kind(mut self) -> HirKind { + core::mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns the properties computed for this `Hir`. + pub fn properties(&self) -> &Properties { + &self.props + } + + /// Splits this HIR into its constituent parts. + /// + /// This is useful because `let Hir { kind, props } = hir;` does not work + /// because of `Hir`'s custom `Drop` implementation. + fn into_parts(mut self) -> (HirKind, Properties) { + ( + core::mem::replace(&mut self.kind, HirKind::Empty), + core::mem::replace(&mut self.props, Properties::empty()), + ) + } +} + +/// Smart constructors for HIR values. +/// +/// These constructors are called "smart" because they do inductive work or +/// simplifications. For example, calling `Hir::repetition` with a repetition +/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind +/// since it is equivalent to an empty regex. Another example is calling +/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll +/// just get back the original `expr` since it's precisely equivalent. +/// +/// Smart constructors enable maintaining invariants about the HIR data type +/// while also simulanteously keeping the representation as simple as possible. +impl Hir { + /// Returns an empty HIR expression. + /// + /// An empty HIR expression always matches, including the empty string. + #[inline] + pub fn empty() -> Hir { + let props = Properties::empty(); + Hir { kind: HirKind::Empty, props } + } + + /// Returns an HIR expression that can never match anything. That is, + /// the size of the set of strings in the language described by the HIR + /// returned is `0`. + /// + /// This is distinct from [`Hir::empty`] in that the empty string matches + /// the HIR returned by `Hir::empty`. That is, the set of strings in the + /// language describe described by `Hir::empty` is non-empty. + /// + /// Note that currently, the HIR returned uses an empty character class to + /// indicate that nothing can match. An equivalent expression that cannot + /// match is an empty alternation, but all such "fail" expressions are + /// normalized (via smart constructors) to empty character classes. This is + /// because empty character classes can be spelled in the concrete syntax + /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but + /// empty alternations cannot. + #[inline] + pub fn fail() -> Hir { + let class = Class::Bytes(ClassBytes::empty()); + let props = Properties::class(&class); + // We can't just call Hir::class here because it defers to Hir::fail + // in order to canonicalize the Hir value used to represent "cannot + // match." + Hir { kind: HirKind::Class(class), props } + } + + /// Creates a literal HIR expression. + /// + /// This accepts anything that can be converted into a `Box<[u8]>`. + /// + /// Note that there is no mechanism for storing a `char` or a `Box` + /// in an HIR. Everything is "just bytes." Whether a `Literal` (or + /// any HIR node) matches valid UTF-8 exclusively can be queried via + /// [`Properties::is_utf8`]. + /// + /// # Example + /// + /// This example shows that concatenations of `Literal` HIR values will + /// automatically get flattened and combined together. So for example, even + /// if you concat multiple `Literal` values that are themselves not valid + /// UTF-8, they might add up to valid UTF-8. This also demonstrates just + /// how "smart" Hir's smart constructors are. + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let literals = vec![ + /// Hir::literal([0xE2]), + /// Hir::literal([0x98]), + /// Hir::literal([0x83]), + /// ]; + /// // Each literal, on its own, is invalid UTF-8. + /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); + /// + /// let concat = Hir::concat(literals); + /// // But the concatenation is valid UTF-8! + /// assert!(concat.properties().is_utf8()); + /// + /// // And also notice that the literals have been concatenated into a + /// // single `Literal`, to the point where there is no explicit `Concat`! + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, concat.kind()); + /// ``` + #[inline] + pub fn literal>>(lit: B) -> Hir { + let bytes = lit.into(); + if bytes.is_empty() { + return Hir::empty(); + } + + let lit = Literal(bytes); + let props = Properties::literal(&lit); + Hir { kind: HirKind::Literal(lit), props } + } + + /// Creates a class HIR expression. The class may either be defined over + /// ranges of Unicode codepoints or ranges of raw byte values. + /// + /// Note that an empty class is permitted. An empty class is equivalent to + /// `Hir::fail()`. + #[inline] + pub fn class(class: Class) -> Hir { + if class.is_empty() { + return Hir::fail(); + } else if let Some(bytes) = class.literal() { + return Hir::literal(bytes); + } + let props = Properties::class(&class); + Hir { kind: HirKind::Class(class), props } + } + + /// Creates a look-around assertion HIR expression. + #[inline] + pub fn look(look: Look) -> Hir { + let props = Properties::look(look); + Hir { kind: HirKind::Look(look), props } + } + + /// Creates a repetition HIR expression. + #[inline] + pub fn repetition(mut rep: Repetition) -> Hir { + // If the sub-expression of a repetition can only match the empty + // string, then we force its maximum to be at most 1. + if rep.sub.properties().maximum_len() == Some(0) { + rep.min = cmp::min(rep.min, 1); + rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); + } + // The regex 'a{0}' is always equivalent to the empty regex. This is + // true even when 'a' is an expression that never matches anything + // (like '\P{any}'). + // + // Additionally, the regex 'a{1}' is always equivalent to 'a'. + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.sub; + } + let props = Properties::repetition(&rep); + Hir { kind: HirKind::Repetition(rep), props } + } + + /// Creates a capture HIR expression. + /// + /// Note that there is no explicit HIR value for a non-capturing group. + /// Since a non-capturing group only exists to override precedence in the + /// concrete syntax and since an HIR already does its own grouping based on + /// what is parsed, there is no need to explicitly represent non-capturing + /// groups in the HIR. + #[inline] + pub fn capture(capture: Capture) -> Hir { + let props = Properties::capture(&capture); + Hir { kind: HirKind::Capture(capture), props } + } + + /// Returns the concatenation of the given expressions. + /// + /// This attempts to flatten and simplify the concatenation as appropriate. + /// + /// # Example + /// + /// This shows a simple example of basic flattening of both concatenations + /// and literals. + /// + /// ``` + /// use regex_syntax::hir::Hir; + /// + /// let hir = Hir::concat(vec![ + /// Hir::concat(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal([b'x']), + /// Hir::literal([b'y']), + /// Hir::literal([b'z']), + /// ]), + /// ]); + /// let expected = Hir::literal("abcxyz".as_bytes()); + /// assert_eq!(expected, hir); + /// ``` + pub fn concat(subs: Vec) -> Hir { + // We rebuild the concatenation by simplifying it. Would be nice to do + // it in place, but that seems a little tricky? + let mut new = vec![]; + // This gobbles up any adjacent literals in a concatenation and smushes + // them together. Basically, when we see a literal, we add its bytes + // to 'prior_lit', and whenever we see anything else, we first take + // any bytes in 'prior_lit' and add it to the 'new' concatenation. + let mut prior_lit: Option> = None; + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + // We also flatten concats that are direct children of another + // concat. We only need to do this one level deep since + // Hir::concat is the only way to build concatenations, and so + // flattening happens inductively. + HirKind::Concat(subs2) => { + for sub2 in subs2 { + let (kind2, props2) = sub2.into_parts(); + match kind2 { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + kind2 => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind: kind2, props: props2 }); + } + } + } + } + // We can just skip empty HIRs. + HirKind::Empty => {} + kind => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind, props }); + } + } + } + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + if new.is_empty() { + return Hir::empty(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + let props = Properties::concat(&new); + Hir { kind: HirKind::Concat(new), props } + } + + /// Returns the alternation of the given expressions. + /// + /// This flattens and simplifies the alternation as appropriate. This may + /// include factoring out common prefixes or even rewriting the alternation + /// as a character class. + /// + /// Note that an empty alternation is equivalent to `Hir::fail()`. (It + /// is not possible for one to write an empty alternation, or even an + /// alternation with a single sub-expression, in the concrete syntax of a + /// regex.) + /// + /// # Example + /// + /// This is a simple example showing how an alternation might get + /// simplified. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// Hir::literal([b'd']), + /// Hir::literal([b'e']), + /// Hir::literal([b'f']), + /// ]); + /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'f'), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + /// + /// And another example showing how common prefixes might get factored + /// out. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// let expected = Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::alternation(vec![ + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// assert_eq!(expected, hir); + /// ``` + /// + /// Note that these sorts of simplifications are not guaranteed. + pub fn alternation(subs: Vec) -> Hir { + // We rebuild the alternation by simplifying it. We proceed similarly + // as the concatenation case. But in this case, there's no literal + // simplification happening. We're just flattening alternations. + let mut new = Vec::with_capacity(subs.len()); + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Alternation(subs2) => { + new.extend(subs2); + } + kind => { + new.push(Hir { kind, props }); + } + } + } + if new.is_empty() { + return Hir::fail(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + // Now that it's completely flattened, look for the special case of + // 'char1|char2|...|charN' and collapse that into a class. Note that + // we look for 'char' first and then bytes. The issue here is that if + // we find both non-ASCII codepoints and non-ASCII singleton bytes, + // then it isn't actually possible to smush them into a single class. + // (Because classes are either "all codepoints" or "all bytes." You + // can have a class that both matches non-ASCII but valid UTF-8 and + // invalid UTF-8.) So we look for all chars and then all bytes, and + // don't handle anything else. + if let Some(singletons) = singleton_chars(&new) { + let it = singletons + .into_iter() + .map(|ch| ClassUnicodeRange { start: ch, end: ch }); + return Hir::class(Class::Unicode(ClassUnicode::new(it))); + } + if let Some(singletons) = singleton_bytes(&new) { + let it = singletons + .into_iter() + .map(|b| ClassBytesRange { start: b, end: b }); + return Hir::class(Class::Bytes(ClassBytes::new(it))); + } + // Similar to singleton chars, we can also look for alternations of + // classes. Those can be smushed into a single class. + if let Some(cls) = class_chars(&new) { + return Hir::class(cls); + } + if let Some(cls) = class_bytes(&new) { + return Hir::class(cls); + } + // Factor out a common prefix if we can, which might potentially + // simplify the expression and unlock other optimizations downstream. + // It also might generally make NFA matching and DFA construction + // faster by reducing the scope of branching in the regex. + new = match lift_common_prefix(new) { + Ok(hir) => return hir, + Err(unchanged) => unchanged, + }; + let props = Properties::alternation(&new); + Hir { kind: HirKind::Alternation(new), props } + } + + /// Returns an HIR expression for `.`. + /// + /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. + /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. + /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. + /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. + /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. + /// + /// # Example + /// + /// Note that this is a convenience routine for constructing the correct + /// character class based on the value of `Dot`. There is no explicit "dot" + /// HIR value. It is just an abbreviation for a common character class. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::dot(Dot::AnyByte); + /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ + /// ClassBytesRange::new(0x00, 0xFF), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + #[inline] + pub fn dot(dot: Dot) -> Hir { + match dot { + Dot::AnyChar => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByte => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyCharExcept(ch) => { + let mut cls = + ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); + cls.negate(); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyCharExceptLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyCharExceptCRLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); + cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExcept(byte) => { + let mut cls = + ClassBytes::new([ClassBytesRange::new(byte, byte)]); + cls.negate(); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyByteExceptLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyByteExceptCRLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); + cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + } + } +} + +/// The underlying kind of an arbitrary [`Hir`] expression. +/// +/// An `HirKind` is principally useful for doing case analysis on the type +/// of a regular expression. If you're looking to build new `Hir` values, +/// then you _must_ use the smart constructors defined on `Hir`, like +/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does +/// not expose any way of building an `Hir` directly from an `HirKind`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A literalstring that matches exactly these bytes. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. + /// + /// A class may be empty. In which case, it matches nothing. + Class(Class), + /// A look-around assertion. A look-around match always has zero length. + Look(Look), + /// A repetition operation applied to a sub-expression. + Repetition(Repetition), + /// A capturing group, which contains a sub-expression. + Capture(Capture), + /// A concatenation of expressions. + /// + /// A concatenation matches only if each of its sub-expressions match one + /// after the other. + /// + /// Concatenations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Concat(Vec), + /// An alternation of expressions. + /// + /// An alternation matches only if at least one of its sub-expressions + /// match. If multiple sub-expressions match, then the leftmost is + /// preferred. + /// + /// Alternations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Alternation(Vec), +} + +impl HirKind { + /// Returns a slice of this kind's sub-expressions, if any. + pub fn subs(&self) -> &[Hir] { + use core::slice::from_ref; + + match *self { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, + } + } +} + +impl core::fmt::Debug for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + +/// Print a display representation of this Hir. +/// +/// The result of this is a valid regular expression pattern string. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Hir`. +impl core::fmt::Display for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::hir::print::Printer::new().print(self, f) + } +} + +/// The high-level intermediate representation of a literal. +/// +/// A literal corresponds to `0` or more bytes that should be matched +/// literally. The smart constructors defined on `Hir` will automatically +/// concatenate adjacent literals into one literal, and will even automatically +/// replace empty literals with `Hir::empty()`. +/// +/// Note that despite a literal being represented by a sequence of bytes, its +/// `Debug` implementation will attempt to print it as a normal string. (That +/// is, not a sequence of decimal numbers.) +#[derive(Clone, Eq, PartialEq)] +pub struct Literal(pub Box<[u8]>); + +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + crate::debug::Bytes(&self.0).fmt(f) + } +} + +/// The high-level intermediate representation of a character class. +/// +/// A character class corresponds to a set of characters. A character is either +/// defined by a Unicode scalar value or a byte. +/// +/// A character class, regardless of its character type, is represented by a +/// sequence of non-overlapping non-adjacent ranges of characters. +/// +/// There are no guarantees about which class variant is used. Generally +/// speaking, the Unicode variat is used whenever a class needs to contain +/// non-ASCII Unicode scalar values. But the Unicode variant can be used even +/// when Unicode mode is disabled. For example, at the time of writing, the +/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class +/// `[a\u00A0]` due to optimizations. +/// +/// Note that `Bytes` variant may be produced even when it exclusively matches +/// valid UTF-8. This is because a `Bytes` variant represents an intention by +/// the author of the regular expression to disable Unicode mode, which in turn +/// impacts the semantics of case insensitive matching. For example, `(?i)k` +/// and `(?i-u)k` will not match the same set of strings. +#[derive(Clone, Eq, PartialEq)] +pub enum Class { + /// A set of characters represented by Unicode scalar values. + Unicode(ClassUnicode), + /// A set of characters represented by arbitrary bytes (one byte per + /// character). + Bytes(ClassBytes), +} + +impl Class { + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled and the underlying class is Unicode oriented. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn case_fold_simple(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.case_fold_simple(), + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + } + + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled and the underlying class is + /// Unicode oriented. + pub fn try_case_fold_simple( + &mut self, + ) -> core::result::Result<(), CaseFoldError> { + match *self { + Class::Unicode(ref mut x) => x.try_case_fold_simple()?, + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + Ok(()) + } + + /// Negate this character class in place. + /// + /// After completion, this character class will contain precisely the + /// characters that weren't previously in the class. + pub fn negate(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.negate(), + Class::Bytes(ref mut x) => x.negate(), + } + } + + /// Returns true if and only if this character class will only ever match + /// valid UTF-8. + /// + /// A character class can match invalid UTF-8 only when the following + /// conditions are met: + /// + /// 1. The translator was configured to permit generating an expression + /// that can match invalid UTF-8. (By default, this is disabled.) + /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete + /// syntax or in the parser builder. By default, Unicode mode is + /// enabled. + pub fn is_utf8(&self) -> bool { + match *self { + Class::Unicode(_) => true, + Class::Bytes(ref x) => x.is_ascii(), + } + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// minimum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a min length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that can match the empty string but match more is still 0. + /// let hir = parse(r"a*")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that matches nothing has no minimum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().minimum_len()); + /// // Character classes usually have a minimum length of 1. + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(1), hir.properties().minimum_len()); + /// // But sometimes Unicode classes might be bigger! + /// let hir = parse(r"\p{Cyrillic}")?; + /// assert_eq!(Some(2), hir.properties().minimum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn minimum_len(&self) -> Option { + match *self { + Class::Unicode(ref x) => x.minimum_len(), + Class::Bytes(ref x) => x.minimum_len(), + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// maximum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a max length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // A regex that matches nothing has no maximum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // Bounded repeats work as you expect. + /// let hir = parse(r"x{2,10}")?; + /// assert_eq!(Some(10), hir.properties().maximum_len()); + /// // An unbounded repeat means there is no maximum. + /// let hir = parse(r"x{2,}")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // With Unicode enabled, \w can match up to 4 bytes! + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(4), hir.properties().maximum_len()); + /// // Without Unicode enabled, \w matches at most 1 byte. + /// let hir = parse(r"(?-u)\w")?; + /// assert_eq!(Some(1), hir.properties().maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn maximum_len(&self) -> Option { + match *self { + Class::Unicode(ref x) => x.maximum_len(), + Class::Bytes(ref x) => x.maximum_len(), + } + } + + /// Returns true if and only if this character class is empty. That is, + /// it has no elements. + /// + /// An empty character can never match anything, including an empty string. + pub fn is_empty(&self) -> bool { + match *self { + Class::Unicode(ref x) => x.ranges().is_empty(), + Class::Bytes(ref x) => x.ranges().is_empty(), + } + } + + /// If this class consists of exactly one element (whether a codepoint or a + /// byte), then return it as a literal byte string. + /// + /// If this class is empty or contains more than one element, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + match *self { + Class::Unicode(ref x) => x.literal(), + Class::Bytes(ref x) => x.literal(), + } + } +} + +impl core::fmt::Debug for Class { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::debug::Byte; + + let mut fmter = f.debug_set(); + match *self { + Class::Unicode(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(r.start..=r.end)); + } + } + Class::Bytes(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(Byte(r.start)..=Byte(r.end))); + } + } + } + fmter.finish() + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. + pub fn new(ranges: I) -> ClassUnicode + where + I: IntoIterator, + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassUnicodeRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassUnicodeIter<'_> { + ClassUnicodeIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassUnicodeRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn case_fold_simple(&mut self) { + self.set + .case_fold_simple() + .expect("unicode-case feature must be enabled"); + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled. + pub fn try_case_fold_simple( + &mut self, + ) -> core::result::Result<(), CaseFoldError> { + self.set.case_fold_simple() + } + + /// Negate this character class. + /// + /// For all `c` where `c` is a Unicode scalar value, if `c` was in this + /// set, then it will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this character class with the given character class, in place. + pub fn union(&mut self, other: &ClassUnicode) { + self.set.union(&other.set); + } + + /// Intersect this character class with the given character class, in + /// place. + pub fn intersect(&mut self, other: &ClassUnicode) { + self.set.intersect(&other.set); + } + + /// Subtract the given character class from this character class, in place. + pub fn difference(&mut self, other: &ClassUnicode) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given character classes, in + /// place. + /// + /// This computes the symmetric difference of two character classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassUnicode) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII codepoint. + pub fn is_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option { + let first = self.ranges().get(0)?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(first.start.len_utf8()) + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option { + let last = self.ranges().last()?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(last.end.len_utf8()) + } + + /// If this class consists of exactly one codepoint, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one codepoint, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent byte class. + pub fn to_byte_class(&self) -> Option { + if !self.is_ascii() { + return None; + } + Some(ClassBytes::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our codepoint range is ASCII, the + // 'u8::try_from' calls below are guaranteed to be correct. + ClassBytesRange { + start: u8::try_from(r.start).unwrap(), + end: u8::try_from(r.end).unwrap(), + } + }))) + } +} + +/// An iterator over all ranges in a Unicode character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); + +impl<'a> Iterator for ClassUnicodeIter<'a> { + type Item = &'a ClassUnicodeRange; + + fn next(&mut self) -> Option<&'a ClassUnicodeRange> { + self.0.next() + } +} + +/// A single range of characters represented by Unicode scalar values. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassUnicodeRange { + start: char, + end: char, +} + +impl core::fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let start = if !self.start.is_whitespace() && !self.start.is_control() + { + self.start.to_string() + } else { + format!("0x{:X}", u32::from(self.start)) + }; + let end = if !self.end.is_whitespace() && !self.end.is_control() { + self.end.to_string() + } else { + format!("0x{:X}", u32::from(self.end)) + }; + f.debug_struct("ClassUnicodeRange") + .field("start", &start) + .field("end", &end) + .finish() + } +} + +impl Interval for ClassUnicodeRange { + type Bound = char; + + #[inline] + fn lower(&self) -> char { + self.start + } + #[inline] + fn upper(&self) -> char { + self.end + } + #[inline] + fn set_lower(&mut self, bound: char) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: char) { + self.end = bound; + } + + /// Apply simple case folding to this Unicode scalar value range. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { + return Ok(()); + } + let (start, end) = (u32::from(self.start), u32::from(self.end)); + for cp in (start..=end).filter_map(char::from_u32) { + for &cp_folded in folder.mapping(cp) { + ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); + } + } + Ok(()) + } +} + +impl ClassUnicodeRange { + /// Create a new Unicode scalar value range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: char, end: char) -> ClassUnicodeRange { + ClassUnicodeRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> char { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> char { + self.end + } + + /// Returns the number of codepoints in this range. + pub fn len(&self) -> usize { + let diff = 1 + u32::from(self.end) - u32::from(self.start); + // This is likely to panic in 16-bit targets since a usize can only fit + // 2^16. It's not clear what to do here, other than to return an error + // when building a Unicode class that contains a range whose length + // overflows usize. (Which, to be honest, is probably quite common on + // 16-bit targets. For example, this would imply that '.' and '\p{any}' + // would be impossible to build.) + usize::try_from(diff).expect("char class len fits in usize") + } +} + +/// A set of characters represented by arbitrary bytes. +/// +/// Each byte corresponds to one character. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBytes { + set: IntervalSet, +} + +impl ClassBytes { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. + pub fn new(ranges: I) -> ClassBytes + where + I: IntoIterator, + { + ClassBytes { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. + pub fn empty() -> ClassBytes { + ClassBytes::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassBytesRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassBytesIter<'_> { + ClassBytesIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassBytesRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple().expect("ASCII case folding never fails"); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is a any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// If this class consists of exactly one byte, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one byte, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(vec![rs[0].start]) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent Unicode class. + pub fn to_unicode_class(&self) -> Option { + if !self.is_ascii() { + return None; + } + Some(ClassUnicode::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our byte range is ASCII, the + // 'char::from' calls below are correct and will not erroneously + // convert a raw byte value into its corresponding codepoint. + ClassUnicodeRange { + start: char::from(r.start), + end: char::from(r.end), + } + }))) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassBytesRange; + + fn next(&mut self) -> Option<&'a ClassBytesRange> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassBytesRange { + start: u8, + end: u8, +} + +impl Interval for ClassBytesRange { + type Bound = u8; + + #[inline] + fn lower(&self) -> u8 { + self.start + } + #[inline] + fn upper(&self) -> u8 { + self.end + } + #[inline] + fn set_lower(&mut self, bound: u8) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: u8) { + self.end = bound; + } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'a'); + let upper = cmp::min(self.end, b'z'); + ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); + } + if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'A'); + let upper = cmp::min(self.end, b'Z'); + ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); + } + Ok(()) + } +} + +impl ClassBytesRange { + /// Create a new byte range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: u8, end: u8) -> ClassBytesRange { + ClassBytesRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> u8 { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> u8 { + self.end + } + + /// Returns the number of bytes in this range. + pub fn len(&self) -> usize { + usize::from(self.end.checked_sub(self.start).unwrap()) + .checked_add(1) + .unwrap() + } +} + +impl core::fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("ClassBytesRange") + .field("start", &crate::debug::Byte(self.start)) + .field("end", &crate::debug::Byte(self.end)) + .finish() + } +} + +/// The high-level intermediate representation for a look-around assertion. +/// +/// An assertion match is always zero-length. Also called an "empty match." +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordAscii = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, +} + +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u32 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u32 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. + #[inline] + pub const fn from_repr(repr: u32) -> Option { + match repr { + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', + } + } +} + +/// The high-level intermediate representation for a capturing group. +/// +/// A capturing group always has an index and a child expression. It may +/// also have a name associated with it (e.g., `(?P\w)`), but it's not +/// necessary. +/// +/// Note that there is no explicit representation of a non-capturing group +/// in a `Hir`. Instead, non-capturing grouping is handled automatically by +/// the recursive structure of the `Hir` itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Capture { + /// The capture index of the capture. + pub index: u32, + /// The name of the capture, if it exists. + pub name: Option>, + /// The expression inside the capturing group, which may be empty. + pub sub: Box, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The minimum range of the repetition. + /// + /// Note that special cases like `?`, `+` and `*` all get translated into + /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. + pub min: u32, + /// The maximum range of the repetition. + /// + /// Note that when `max` is `None`, `min` acts as a lower bound but where + /// there is no upper bound. For something like `x{5}` where the min and + /// max are equivalent, `min` will be set to `5` and `max` will be set to + /// `Some(5)`. + pub max: Option, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub sub: Box, +} + +impl Repetition { + /// Returns a new repetition with the same `min`, `max` and `greedy` + /// values, but with its sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> Repetition { + Repetition { + min: self.min, + max: self.max, + greedy: self.greedy, + sub: Box::new(sub), + } + } +} + +/// A type describing the different flavors of `.`. +/// +/// This type is meant to be used with [`Hir::dot`], which is a convenience +/// routine for building HIR values derived from the `.` regex. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Dot { + /// Matches the UTF-8 encoding of any Unicode scalar value. + /// + /// This is equivalent to `(?su:.)` and also `\p{any}`. + AnyChar, + /// Matches any byte value. + /// + /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. + AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for the + /// `char` given. + /// + /// This is equivalent to using `(?u-s:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, ti must be ASCII.) + /// + /// (This and `AnyCharExceptLF` both exist because of legacy reasons. + /// `AnyCharExceptLF` will be dropped in the next breaking change release.) + AnyCharExcept(char), + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. + /// + /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. + AnyCharExceptLF, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` + /// and `\n`. + /// + /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. + AnyCharExceptCRLF, + /// Matches any byte value except for the `u8` given. + /// + /// This is equivalent to using `(?-us:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, ti must be ASCII.) + /// + /// (This and `AnyByteExceptLF` both exist because of legacy reasons. + /// `AnyByteExceptLF` will be dropped in the next breaking change release.) + AnyByteExcept(u8), + /// Matches any byte value except for `\n`. + /// + /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. + AnyByteExceptLF, + /// Matches any byte value except for `\r` and `\n`. + /// + /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. + AnyByteExceptCRLF, +} + +/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack +/// space but heap space proportional to the depth of the total `Hir`. +impl Drop for Hir { + fn drop(&mut self) { + use core::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} + +/// A type that collects various properties of an HIR value. +/// +/// Properties are always scalar values and represent meta data that is +/// computed inductively on an HIR value. Properties are defined for all +/// HIR values. +/// +/// All methods on a `Properties` value take constant time and are meant to +/// be cheap to call. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Properties(Box); + +/// The property definition. It is split out so that we can box it, and +/// there by make `Properties` use less stack size. This is kind-of important +/// because every HIR value has a `Properties` attached to it. +/// +/// This does have the unfortunate consequence that creating any HIR value +/// always leads to at least one alloc for properties, but this is generally +/// true anyway (for pretty much all HirKinds except for look-arounds). +#[derive(Clone, Debug, Eq, PartialEq)] +struct PropertiesI { + minimum_len: Option, + maximum_len: Option, + look_set: LookSet, + look_set_prefix: LookSet, + look_set_suffix: LookSet, + look_set_prefix_any: LookSet, + look_set_suffix_any: LookSet, + utf8: bool, + explicit_captures_len: usize, + static_explicit_captures_len: Option, + literal: bool, + alternation_literal: bool, +} + +impl Properties { + /// Returns the length (in bytes) of the smallest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when the HIR can match an + /// empty string. + /// + /// `None` is returned when there is no minimum length. This occurs in + /// precisely the cases where the HIR matches nothing. i.e., The language + /// the regex matches is empty. An example of such a regex is `\P{any}`. + #[inline] + pub fn minimum_len(&self) -> Option { + self.0.minimum_len + } + + /// Returns the length (in bytes) of the longest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when nothing longer than + /// the empty string is in the language described by this HIR. + /// + /// `None` is returned when there is no longest matching string. This + /// occurs when the HIR matches nothing or when there is no upper bound on + /// the length of matching strings. Example of such regexes are `\P{any}` + /// (matches nothing) and `a+` (has no upper bound). + #[inline] + pub fn maximum_len(&self) -> Option { + self.0.maximum_len + } + + /// Returns a set of all look-around assertions that appear at least once + /// in this HIR value. + #[inline] + pub fn look_set(&self) -> LookSet { + self.0.look_set + } + + /// Returns a set of all look-around assertions that appear as a prefix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed before matching any bytes in a haystack. + /// + /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true + /// if and only if the HIR is fully anchored at the start. + #[inline] + pub fn look_set_prefix(&self) -> LookSet { + self.0.look_set_prefix + } + + /// Returns a set of all look-around assertions that appear as a _possible_ + /// prefix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion before consuming any input. + #[inline] + pub fn look_set_prefix_any(&self) -> LookSet { + self.0.look_set_prefix_any + } + + /// Returns a set of all look-around assertions that appear as a suffix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed in order to be considered a match after + /// all other consuming HIR expressions. + /// + /// For example, `hir.look_set_suffix().contains(Look::End)` returns true + /// if and only if the HIR is fully anchored at the end. + #[inline] + pub fn look_set_suffix(&self) -> LookSet { + self.0.look_set_suffix + } + + /// Returns a set of all look-around assertions that appear as a _possible_ + /// suffix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion at the end of a match without consuming any input. + #[inline] + pub fn look_set_suffix_any(&self) -> LookSet { + self.0.look_set_suffix_any + } + + /// Return true if and only if the corresponding HIR will always match + /// valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression to + /// match invalid UTF-8, including by matching between the code units of + /// a single UTF-8 encoded codepoint. + /// + /// Note that this returns true even when the corresponding HIR can match + /// the empty string. Since an empty string can technically appear between + /// UTF-8 code units, it is possible for a match to be reported that splits + /// a codepoint which could in turn be considered matching invalid UTF-8. + /// However, it is generally assumed that such empty matches are handled + /// specially by the search routine if it is absolutely required that + /// matches not split a codepoint. + /// + /// # Example + /// + /// This code example shows the UTF-8 property of a variety of patterns. + /// + /// ``` + /// use regex_syntax::{ParserBuilder, parse}; + /// + /// // Examples of 'is_utf8() == true'. + /// assert!(parse(r"a")?.properties().is_utf8()); + /// assert!(parse(r"[^a]")?.properties().is_utf8()); + /// assert!(parse(r".")?.properties().is_utf8()); + /// assert!(parse(r"\W")?.properties().is_utf8()); + /// assert!(parse(r"\b")?.properties().is_utf8()); + /// assert!(parse(r"\B")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\b")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\B")?.properties().is_utf8()); + /// // Unicode mode is enabled by default, and in + /// // that mode, all \x hex escapes are treated as + /// // codepoints. So this actually matches the UTF-8 + /// // encoding of U+00FF. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// + /// // Now we show examples of 'is_utf8() == false'. + /// // The only way to do this is to force the parser + /// // to permit invalid UTF-8, otherwise all of these + /// // would fail to parse! + /// let parse = |pattern| { + /// ParserBuilder::new().utf8(false).build().parse(pattern) + /// }; + /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u).")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8()); + /// // Conversely to the equivalent example above, + /// // when Unicode mode is disabled, \x hex escapes + /// // are treated as their raw byte values. + /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8()); + /// // Note that just because we disabled UTF-8 in the + /// // parser doesn't mean we still can't use Unicode. + /// // It is enabled by default, so \xFF is still + /// // equivalent to matching the UTF-8 encoding of + /// // U+00FF by default. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// // Even though we use raw bytes that individually + /// // are not valid UTF-8, when combined together, the + /// // overall expression *does* match valid UTF-8! + /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn is_utf8(&self) -> bool { + self.0.utf8 + } + + /// Returns the total number of explicit capturing groups in the + /// corresponding HIR. + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match that is typically included by regex + /// engines. + /// + /// # Example + /// + /// This method will return `0` for `a` and `1` for `(a)`: + /// + /// ``` + /// use regex_syntax::parse; + /// + /// assert_eq!(0, parse("a")?.properties().explicit_captures_len()); + /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn explicit_captures_len(&self) -> usize { + self.0.explicit_captures_len + } + + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| { + /// h.properties().static_explicit_captures_len() + /// }) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_explicit_captures_len(&self) -> Option { + self.0.static_explicit_captures_len + } + + /// Return true if and only if this HIR is a simple literal. This is + /// only true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. + /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and + /// the empty string are not (even though they contain sub-expressions that + /// are literals). + #[inline] + pub fn is_literal(&self) -> bool { + self.0.literal + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not + /// (even though that contain sub-expressions that are literals). + #[inline] + pub fn is_alternation_literal(&self) -> bool { + self.0.alternation_literal + } + + /// Returns the total amount of heap memory usage, in bytes, used by this + /// `Properties` value. + #[inline] + pub fn memory_usage(&self) -> usize { + core::mem::size_of::() + } + + /// Returns a new set of properties that corresponds to the union of the + /// iterator of properties given. + /// + /// This is useful when one has multiple `Hir` expressions and wants + /// to combine them into a single alternation without constructing the + /// corresponding `Hir`. This routine provides a way of combining the + /// properties of each `Hir` expression into one set of properties + /// representing the union of those expressions. + /// + /// # Example: union with HIRs that never match + /// + /// This example shows that unioning properties together with one that + /// represents a regex that never matches will "poison" certain attributes, + /// like the minimum and maximum lengths. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"[a&&b]")?; + /// assert_eq!(None, hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(None, unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// The maximum length can also be "poisoned" by a pattern that has no + /// upper bound on the length of a match. The minimum length remains + /// unaffected: + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"a+")?; + /// assert_eq!(Some(1), hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(Some(1), unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn union(props: I) -> Properties + where + I: IntoIterator, + P: core::borrow::Borrow, + { + let mut it = props.into_iter().peekable(); + // While empty alternations aren't possible, we still behave as if they + // are. When we have an empty alternate, then clearly the look-around + // prefix and suffix is empty. Otherwise, it is the intersection of all + // prefixes and suffixes (respectively) of the branches. + let fix = if it.peek().is_none() { + LookSet::empty() + } else { + LookSet::full() + }; + // And also, an empty alternate means we have 0 static capture groups, + // but we otherwise start with the number corresponding to the first + // alternate. If any subsequent alternate has a different number of + // static capture groups, then we overall have a variation and not a + // static number of groups. + let static_explicit_captures_len = + it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); + // The base case is an empty alternation, which matches nothing. + // Note though that empty alternations aren't possible, because the + // Hir::alternation smart constructor rewrites those as empty character + // classes. + let mut props = PropertiesI { + minimum_len: None, + maximum_len: None, + look_set: LookSet::empty(), + look_set_prefix: fix, + look_set_suffix: fix, + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len, + literal: false, + alternation_literal: true, + }; + let (mut min_poisoned, mut max_poisoned) = (false, false); + // Handle properties that need to visit every child hir. + for prop in it { + let p = prop.borrow(); + props.look_set.set_union(p.look_set()); + props.look_set_prefix.set_intersect(p.look_set_prefix()); + props.look_set_suffix.set_intersect(p.look_set_suffix()); + props.look_set_prefix_any.set_union(p.look_set_prefix_any()); + props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + if props.static_explicit_captures_len + != p.static_explicit_captures_len() + { + props.static_explicit_captures_len = None; + } + props.alternation_literal = + props.alternation_literal && p.is_literal(); + if !min_poisoned { + if let Some(xmin) = p.minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } else { + props.minimum_len = None; + min_poisoned = true; + } + } + if !max_poisoned { + if let Some(xmax) = p.maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } else { + props.maximum_len = None; + max_poisoned = true; + } + } + } + Properties(Box::new(props)) + } +} + +impl Properties { + /// Create a new set of HIR properties for an empty regex. + fn empty() -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + // It is debatable whether an empty regex always matches at valid + // UTF-8 boundaries. Strictly speaking, at a byte oriented view, + // it is clearly false. There are, for example, many empty strings + // between the bytes encoding a '☃'. + // + // However, when Unicode mode is enabled, the fundamental atom + // of matching is really a codepoint. And in that scenario, an + // empty regex is defined to only match at valid UTF-8 boundaries + // and to never split a codepoint. It just so happens that this + // enforcement is somewhat tricky to do for regexes that match + // the empty string inside regex engines themselves. It usually + // requires some layer above the regex engine to filter out such + // matches. + // + // In any case, 'true' is really the only coherent option. If it + // were false, for example, then 'a*' would also need to be false + // since it too can match the empty string. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a literal regex. + fn literal(lit: &Literal) -> Properties { + let inner = PropertiesI { + minimum_len: Some(lit.0.len()), + maximum_len: Some(lit.0.len()), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: core::str::from_utf8(&lit.0).is_ok(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a character class. + fn class(class: &Class) -> Properties { + let inner = PropertiesI { + minimum_len: class.minimum_len(), + maximum_len: class.maximum_len(), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: class.is_utf8(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a look-around assertion. + fn look(look: Look) -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::singleton(look), + look_set_prefix: LookSet::singleton(look), + look_set_suffix: LookSet::singleton(look), + look_set_prefix_any: LookSet::singleton(look), + look_set_suffix_any: LookSet::singleton(look), + // This requires a little explanation. Basically, we don't consider + // matching an empty string to be equivalent to matching invalid + // UTF-8, even though technically matching every empty string will + // split the UTF-8 encoding of a single codepoint when treating a + // UTF-8 encoded string as a sequence of bytes. Our defense here is + // that in such a case, a codepoint should logically be treated as + // the fundamental atom for matching, and thus the only valid match + // points are between codepoints and not bytes. + // + // More practically, this is true here because it's also true + // for 'Hir::empty()', otherwise something like 'a*' would be + // considered to match invalid UTF-8. That in turn makes this + // property borderline useless. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a repetition. + fn repetition(rep: &Repetition) -> Properties { + let p = rep.sub.properties(); + let minimum_len = p.minimum_len().map(|child_min| { + let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); + child_min.saturating_mul(rep_min) + }); + let maximum_len = rep.max.and_then(|rep_max| { + let rep_max = usize::try_from(rep_max).ok()?; + let child_max = p.maximum_len()?; + child_max.checked_mul(rep_max) + }); + + let mut inner = PropertiesI { + minimum_len, + maximum_len, + look_set: p.look_set(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: p.look_set_prefix_any(), + look_set_suffix_any: p.look_set_suffix_any(), + utf8: p.is_utf8(), + explicit_captures_len: p.explicit_captures_len(), + static_explicit_captures_len: p.static_explicit_captures_len(), + literal: false, + alternation_literal: false, + }; + // If the repetition operator can match the empty string, then its + // lookset prefix and suffixes themselves remain empty since they are + // no longer required to match. + if rep.min > 0 { + inner.look_set_prefix = p.look_set_prefix(); + inner.look_set_suffix = p.look_set_suffix(); + } + // If the static captures len of the sub-expression is not known or + // is greater than zero, then it automatically propagates to the + // repetition, regardless of the repetition. Otherwise, it might + // change, but only when the repetition can match 0 times. + if rep.min == 0 + && inner.static_explicit_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match the empty + // string, then it's impossible to know how many captures will be + // in the resulting match. + if rep.max == Some(0) { + inner.static_explicit_captures_len = Some(0); + } else { + inner.static_explicit_captures_len = None; + } + } + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a capture. + fn capture(capture: &Capture) -> Properties { + let p = capture.sub.properties(); + Properties(Box::new(PropertiesI { + explicit_captures_len: p.explicit_captures_len().saturating_add(1), + static_explicit_captures_len: p + .static_explicit_captures_len() + .map(|len| len.saturating_add(1)), + literal: false, + alternation_literal: false, + ..*p.0.clone() + })) + } + + /// Create a new set of HIR properties for a concatenation. + fn concat(concat: &[Hir]) -> Properties { + // The base case is an empty concatenation, which matches the empty + // string. Note though that empty concatenations aren't possible, + // because the Hir::concat smart constructor rewrites those as + // Hir::empty. + let mut props = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. + for x in concat.iter() { + let p = x.properties(); + props.look_set.set_union(p.look_set()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + props.static_explicit_captures_len = p + .static_explicit_captures_len() + .and_then(|len1| { + Some((len1, props.static_explicit_captures_len?)) + }) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); + props.literal = props.literal && p.is_literal(); + props.alternation_literal = + props.alternation_literal && p.is_alternation_literal(); + if let Some(minimum_len) = props.minimum_len { + match p.minimum_len() { + None => props.minimum_len = None, + Some(len) => { + // We use saturating arithmetic here because the + // minimum is just a lower bound. We can't go any + // higher than what our number types permit. + props.minimum_len = + Some(minimum_len.saturating_add(len)); + } + } + } + if let Some(maximum_len) = props.maximum_len { + match p.maximum_len() { + None => props.maximum_len = None, + Some(len) => { + props.maximum_len = maximum_len.checked_add(len) + } + } + } + } + // Handle the prefix properties, which only requires visiting + // child exprs until one matches more than the empty string. + let mut it = concat.iter(); + while let Some(x) = it.next() { + props.look_set_prefix.set_union(x.properties().look_set_prefix()); + props + .look_set_prefix_any + .set_union(x.properties().look_set_prefix_any()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + // Same thing for the suffix properties, but in reverse. + let mut it = concat.iter().rev(); + while let Some(x) = it.next() { + props.look_set_suffix.set_union(x.properties().look_set_suffix()); + props + .look_set_suffix_any + .set_union(x.properties().look_set_suffix_any()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + Properties(Box::new(props)) + } + + /// Create a new set of HIR properties for a concatenation. + fn alternation(alts: &[Hir]) -> Properties { + Properties::union(alts.iter().map(|hir| hir.properties())) + } +} + +/// A set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, an [`Hir`] provides properties that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + /// The underlying representation this set is exposed to make it possible + /// to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies bit `i` where `i = + /// Look::as_repr()`. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u32, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + #[inline] + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. + #[inline] + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + #[inline] + pub fn singleton(look: Look) -> LookSet { + LookSet::empty().insert(look) + } + + /// Returns the total number of look-around assertions in this set. + #[inline] + pub fn len(self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. + #[inline] + pub fn is_empty(self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. + #[inline] + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 + } + + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." + #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. + #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) + } + + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } + + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } + + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + + /// Returns a new set that is the result of subtracting the given set from + /// this set. + #[inline] + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } + } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. + #[inline] + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); + } + + /// Returns a new set that is the union of this and the one given. + #[inline] + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } + } + + /// Updates this set in place with the result of unioning it with the one + /// given. + #[inline] + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); + } + + /// Returns a new set that is the intersection of this and the one given. + #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } + } + + /// Updates this set in place with the result of intersecting it with the + /// one given. + #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); + } + + /// Return a `LookSet` from the slice given as a native endian 32-bit + /// integer. + /// + /// # Panics + /// + /// This panics if `slice.len() < 4`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); + LookSet { bits } + } + + /// Write a `LookSet` as a native endian 32-bit integer to the beginning + /// of the slice given. + /// + /// # Panics + /// + /// This panics if `slice.len() < 4`. + #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); + } + for look in self.iter() { + write!(f, "{}", look.as_char())?; + } + Ok(()) + } +} + +/// An iterator over all look-around assertions in a [`LookSet`]. +/// +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option { + if self.set.is_empty() { + return None; + } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; + self.set = self.set.remove(look); + Some(look) + } +} + +/// Given a sequence of HIR values where each value corresponds to a Unicode +/// class (or an all-ASCII byte class), return a single Unicode class +/// corresponding to the union of the classes found. +fn class_chars(hirs: &[Hir]) -> Option { + let mut cls = ClassUnicode::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(cls2); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(&cls2.to_unicode_class()?); + } + _ => return None, + }; + } + Some(Class::Unicode(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a byte class +/// (or an all-ASCII Unicode class), return a single byte class corresponding +/// to the union of the classes found. +fn class_bytes(hirs: &[Hir]) -> Option { + let mut cls = ClassBytes::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(&cls2.to_byte_class()?); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(cls2); + } + _ => return None, + }; + } + Some(Class::Bytes(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single `char`, return that sequence of `char`s. Otherwise return +/// None. No deduplication is done. +fn singleton_chars(hirs: &[Hir]) -> Option> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + let ch = match crate::debug::utf8_decode(literal) { + None => return None, + Some(Err(_)) => return None, + Some(Ok(ch)) => ch, + }; + if literal.len() != ch.len_utf8() { + return None; + } + singletons.push(ch); + } + Some(singletons) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single byte, return that sequence of bytes. Otherwise return +/// None. No deduplication is done. +fn singleton_bytes(hirs: &[Hir]) -> Option> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + if literal.len() != 1 { + return None; + } + singletons.push(literal[0]); + } + Some(singletons) +} + +/// Looks for a common prefix in the list of alternation branches given. If one +/// is found, then an equivalent but (hopefully) simplified Hir is returned. +/// Otherwise, the original given list of branches is returned unmodified. +/// +/// This is not quite as good as it could be. Right now, it requires that +/// all branches are 'Concat' expressions. It also doesn't do well with +/// literals. For example, given 'foofoo|foobar', it will not refactor it to +/// 'foo(?:foo|bar)' because literals are flattened into their own special +/// concatenation. (One wonders if perhaps 'Literal' should be a single atom +/// instead of a string of bytes because of this. Otherwise, handling the +/// current representation in this routine will be pretty gnarly. Sigh.) +fn lift_common_prefix(hirs: Vec) -> Result> { + if hirs.len() <= 1 { + return Err(hirs); + } + let mut prefix = match hirs[0].kind() { + HirKind::Concat(ref xs) => &**xs, + _ => return Err(hirs), + }; + if prefix.is_empty() { + return Err(hirs); + } + for h in hirs.iter().skip(1) { + let concat = match h.kind() { + HirKind::Concat(ref xs) => xs, + _ => return Err(hirs), + }; + let common_len = prefix + .iter() + .zip(concat.iter()) + .take_while(|(x, y)| x == y) + .count(); + prefix = &prefix[..common_len]; + if prefix.is_empty() { + return Err(hirs); + } + } + let len = prefix.len(); + assert_ne!(0, len); + let mut prefix_concat = vec![]; + let mut suffix_alts = vec![]; + for h in hirs { + let mut concat = match h.into_kind() { + HirKind::Concat(xs) => xs, + // We required all sub-expressions to be + // concats above, so we're only here if we + // have a concat. + _ => unreachable!(), + }; + suffix_alts.push(Hir::concat(concat.split_off(len))); + if prefix_concat.is_empty() { + prefix_concat = concat; + } + } + let mut concat = prefix_concat; + concat.push(Hir::alternation(suffix_alts)); + Ok(Hir::concat(concat)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn uclass(ranges: &[(char, char)]) -> ClassUnicode { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| ClassUnicodeRange::new(s, e)) + .collect(); + ClassUnicode::new(ranges) + } + + fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { + let ranges: Vec = + ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); + ClassBytes::new(ranges) + } + + fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + #[cfg(feature = "unicode-case")] + fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn usymdifference( + cls1: &ClassUnicode, + cls2: &ClassUnicode, + ) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn unegate(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn bcasefold(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn bnegate(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + #[test] + fn class_range_canonical_unicode() { + let range = ClassUnicodeRange::new('\u{00FF}', '\0'); + assert_eq!('\0', range.start()); + assert_eq!('\u{00FF}', range.end()); + } + + #[test] + fn class_range_canonical_bytes() { + let range = ClassBytesRange::new(b'\xFF', b'\0'); + assert_eq!(b'\0', range.start()); + assert_eq!(b'\xFF', range.end()); + } + + #[test] + fn class_canonicalize_unicode() { + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('a', 'c')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('w', 'y')]); + let expected = vec![('w', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[ + ('c', 'f'), + ('a', 'g'), + ('d', 'j'), + ('a', 'c'), + ('m', 'p'), + ('l', 's'), + ]); + let expected = vec![('a', 'j'), ('l', 's')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('u', 'w')]); + let expected = vec![('u', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + let expected = vec![('\x00', '\u{10FFFF}')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = vec![('a', 'b')]; + assert_eq!(expected, uranges(&cls)); + } + + #[test] + fn class_canonicalize_bytes() { + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); + let expected = vec![(b'w', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[ + (b'c', b'f'), + (b'a', b'g'), + (b'd', b'j'), + (b'a', b'c'), + (b'm', b'p'), + (b'l', b's'), + ]); + let expected = vec![(b'a', b'j'), (b'l', b's')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); + let expected = vec![(b'u', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); + let expected = vec![(b'\x00', b'\xFF')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = vec![(b'a', b'b')]; + assert_eq!(expected, branges(&cls)); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn class_case_fold_unicode() { + let cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + let expected = uclass(&[ + ('A', 'J'), + ('L', 'S'), + ('a', 'j'), + ('l', 's'), + ('\u{17F}', '\u{17F}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'Z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('a', 'z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('_', '_')]); + let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('=', '=')]); + let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('\x00', '\x10')]); + assert_eq!(cls, ucasefold(&cls)); + + let cls = uclass(&[('k', 'k')]); + let expected = + uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('@', '@')]); + assert_eq!(cls, ucasefold(&cls)); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + assert!(cls.try_case_fold_simple().is_err()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled_panics() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + cls.case_fold_simple(); + } + + #[test] + fn class_case_fold_bytes() { + let cls = bclass(&[ + (b'C', b'F'), + (b'A', b'G'), + (b'D', b'J'), + (b'A', b'C'), + (b'M', b'P'), + (b'L', b'S'), + (b'c', b'f'), + ]); + let expected = + bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'Z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls, bcasefold(&cls)); + + let cls = bclass(&[(b'k', b'k')]); + let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls, bcasefold(&cls)); + } + + #[test] + fn class_negate_unicode() { + let cls = uclass(&[('a', 'a')]); + let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = uclass(&[ + ('\x00', '\x60'), + ('\x64', '\x77'), + ('\x7B', '\u{10FFFF}'), + ]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', 'a')]); + let expected = uclass(&[('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\x60')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}')]); + let expected = uclass(&[]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[]); + let expected = uclass(&[('\x00', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = + uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); + let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FF}')]); + let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FE}')]); + let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{D7FF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{E000}')]); + assert_eq!(expected, unegate(&cls)); + } + + #[test] + fn class_negate_bytes() { + let cls = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = bclass(&[ + (b'\x00', b'\x60'), + (b'\x64', b'\x77'), + (b'\x7B', b'\xFF'), + ]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'a')]); + let expected = bclass(&[(b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'\xFF')]); + let expected = bclass(&[(b'\x00', b'\x60')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF')]); + let expected = bclass(&[]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[]); + let expected = bclass(&[(b'\x00', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); + let expected = bclass(&[(b'\xFE', b'\xFE')]); + assert_eq!(expected, bnegate(&cls)); + } + + #[test] + fn class_union_unicode() { + let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[('a', 'z'), ('A', 'C')]); + assert_eq!(expected, uunion(&cls1, &cls2)); + } + + #[test] + fn class_union_bytes() { + let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); + assert_eq!(expected, bunion(&cls1, &cls2)); + } + + #[test] + fn class_intersect_unicode() { + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('b', 'b')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('b', 'c')]); + let expected = uclass(&[('b', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('c', 'd')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('b', 'c')]); + let cls2 = uclass(&[('a', 'd')]); + let expected = uclass(&[('b', 'c')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); + let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('h', 'h')]); + let expected = uclass(&[('h', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + let expected = uclass(&[('b', 'f')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + } + + #[test] + fn class_intersect_bytes() { + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'b', b'b')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'b', b'c')]); + let expected = bclass(&[(b'b', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'c', b'd')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'b', b'c')]); + let cls2 = bclass(&[(b'a', b'd')]); + let expected = bclass(&[(b'b', b'c')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); + let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'h', b'h')]); + let expected = bclass(&[(b'h', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); + let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); + let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); + let expected = bclass(&[(b'b', b'f')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + } + + #[test] + fn class_difference_unicode() { + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('b', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('z', 'z')]); + let expected = uclass(&[('a', 'y')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('m', 'm')]); + let expected = uclass(&[('a', 'l'), ('n', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('d', 'v')]); + let expected = uclass(&[('a', 'c')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('x', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('x', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + } + + #[test] + fn class_difference_bytes() { + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'b', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'z', b'z')]); + let expected = bclass(&[(b'a', b'y')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'm', b'm')]); + let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'd', b'v')]); + let expected = bclass(&[(b'a', b'c')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'x', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'x', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_unicode() { + let cls1 = uclass(&[('a', 'm')]); + let cls2 = uclass(&[('g', 't')]); + let expected = uclass(&[('a', 'f'), ('n', 't')]); + assert_eq!(expected, usymdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_bytes() { + let cls1 = bclass(&[(b'a', b'm')]); + let cls2 = bclass(&[(b'g', b't')]); + let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); + assert_eq!(expected, bsymdifference(&cls1, &cls2)); + } + + // We use a thread with an explicit stack size to test that our destructor + // for Hir can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let mut expr = Hir::empty(); + for _ in 0..100 { + expr = Hir::capture(Capture { + index: 1, + name: None, + sub: Box::new(expr), + }); + expr = Hir::repetition(Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(expr), + }); + + expr = Hir { + kind: HirKind::Concat(vec![expr]), + props: Properties::empty(), + }; + expr = Hir { + kind: HirKind::Alternation(vec![expr]), + props: Properties::empty(), + }; + } + assert!(!matches!(*expr.kind(), HirKind::Empty)); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + // + // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' + // for context on the specific stack size chosen here. + thread::Builder::new() + .stack_size(16 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(18, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + fn look_set_debug() { + let res = format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); + } +} diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs new file mode 100644 index 0000000..dfa6d40 --- /dev/null +++ b/vendor/regex-syntax/src/hir/print.rs @@ -0,0 +1,608 @@ +/*! +This module provides a regular expression printer for `Hir`. +*/ + +use core::fmt; + +use crate::{ + hir::{ + self, + visitor::{self, Visitor}, + Hir, HirKind, + }, + is_meta_character, +}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression's high-level intermediate +/// representation. +/// +/// A printer converts a high-level intermediate representation (HIR) to a +/// regular expression pattern string. This particular printer uses constant +/// stack space and heap space proportional to the size of the HIR. +/// +/// Since this printer is only using the HIR, the pattern it prints will likely +/// not resemble the original pattern at all. For example, a pattern like +/// `\pL` will have its entire class written out. +/// +/// The purpose of this printer is to provide a means to mutate an HIR and then +/// build a regular expression from the result of that mutation. (A regex +/// library could provide a constructor from this HIR explicitly, but that +/// creates an unnecessary public coupling between the regex library and this +/// specific HIR representation.) +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. + pub fn print(&mut self, hir: &Hir, wtr: W) -> fmt::Result { + visitor::visit(hir, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer { + wtr: W, +} + +impl Visitor for Writer { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + HirKind::Empty => { + // Technically an empty sub-expression could be "printed" by + // just ignoring it, but in practice, you could have a + // repetition operator attached to an empty expression, and you + // really need something in the concrete syntax to make that + // work as you'd expect. + self.wtr.write_str(r"(?:)")?; + } + // Repetition operators are strictly suffix oriented. + HirKind::Repetition(_) => {} + HirKind::Literal(hir::Literal(ref bytes)) => { + // See the comment on the 'Concat' and 'Alternation' case below + // for why we put parens here. Literals are, conceptually, + // a special case of concatenation where each element is a + // character. The HIR flattens this into a Box<[u8]>, but we + // still need to treat it like a concatenation for correct + // printing. As a special case, we don't write parens if there + // is only one character. One character means there is no + // concat so we don't need parens. Adding parens would still be + // correct, but we drop them here because it tends to create + // rather noisy regexes even in simple cases. + let result = core::str::from_utf8(bytes); + let len = result.map_or(bytes.len(), |s| s.chars().count()); + if len > 1 { + self.wtr.write_str(r"(?:")?; + } + match result { + Ok(string) => { + for c in string.chars() { + self.write_literal_char(c)?; + } + } + Err(_) => { + for &b in bytes.iter() { + self.write_literal_byte(b)?; + } + } + } + if len > 1 { + self.wtr.write_str(r")")?; + } + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } + self.wtr.write_str("[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_char(range.start())?; + } else if u32::from(range.start()) + 1 + == u32::from(range.end()) + { + self.write_literal_char(range.start())?; + self.write_literal_char(range.end())?; + } else { + self.write_literal_char(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_char(range.end())?; + } + } + self.wtr.write_str("]")?; + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } + self.wtr.write_str("(?-u:[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_class_byte(range.start())?; + } else if range.start() + 1 == range.end() { + self.write_literal_class_byte(range.start())?; + self.write_literal_class_byte(range.end())?; + } else { + self.write_literal_class_byte(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_class_byte(range.end())?; + } + } + self.wtr.write_str("])")?; + } + HirKind::Look(ref look) => match *look { + hir::Look::Start => { + self.wtr.write_str(r"\A")?; + } + hir::Look::End => { + self.wtr.write_str(r"\z")?; + } + hir::Look::StartLF => { + self.wtr.write_str("(?m:^)")?; + } + hir::Look::EndLF => { + self.wtr.write_str("(?m:$)")?; + } + hir::Look::StartCRLF => { + self.wtr.write_str("(?mR:^)")?; + } + hir::Look::EndCRLF => { + self.wtr.write_str("(?mR:$)")?; + } + hir::Look::WordAscii => { + self.wtr.write_str(r"(?-u:\b)")?; + } + hir::Look::WordAsciiNegate => { + self.wtr.write_str(r"(?-u:\B)")?; + } + hir::Look::WordUnicode => { + self.wtr.write_str(r"\b")?; + } + hir::Look::WordUnicodeNegate => { + self.wtr.write_str(r"\B")?; + } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } + }, + HirKind::Capture(hir::Capture { ref name, .. }) => { + self.wtr.write_str("(")?; + if let Some(ref name) = *name { + write!(self.wtr, "?P<{}>", name)?; + } + } + // Why do this? Wrapping concats and alts in non-capturing groups + // is not *always* necessary, but is sometimes necessary. For + // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' + // and not 'ab|c'. The former is clearly the intended meaning, but + // the latter is actually 'alt(concat(a, b), c)'. + // + // It would be possible to only group these things in cases where + // it's strictly necessary, but it requires knowing the parent + // expression. And since this technique is simpler and always + // correct, we take this route. More to the point, it is a non-goal + // of an HIR printer to show a nice easy-to-read regex. Indeed, + // its construction forbids it from doing so. Therefore, inserting + // extra groups where they aren't necessary is perfectly okay. + HirKind::Concat(_) | HirKind::Alternation(_) => { + self.wtr.write_str(r"(?:")?; + } + } + Ok(()) + } + + fn visit_post(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + // Handled during visit_pre + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Repetition(ref x) => { + match (x.min, x.max) { + (0, Some(1)) => { + self.wtr.write_str("?")?; + } + (0, None) => { + self.wtr.write_str("*")?; + } + (1, None) => { + self.wtr.write_str("+")?; + } + (1, Some(1)) => { + // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. + return Ok(()); + } + (m, None) => { + write!(self.wtr, "{{{},}}", m)?; + } + (m, Some(n)) if m == n => { + write!(self.wtr, "{{{}}}", m)?; + // a{m} and a{m}? are always exactly equivalent. + return Ok(()); + } + (m, Some(n)) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } + } + if !x.greedy { + self.wtr.write_str("?")?; + } + } + HirKind::Capture(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => { + self.wtr.write_str(r")")?; + } + } + Ok(()) + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } +} + +impl Writer { + fn write_literal_char(&mut self, c: char) -> fmt::Result { + if is_meta_character(c) { + self.wtr.write_str("\\")?; + } + self.wtr.write_char(c) + } + + fn write_literal_byte(&mut self, b: u8) -> fmt::Result { + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) + } else { + write!(self.wtr, "(?-u:\\x{:02X})", b) + } + } + + fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) + } else { + write!(self.wtr, "\\x{:02X}", b) + } + } +} + +#[cfg(test)] +mod tests { + use alloc::{ + boxed::Box, + string::{String, ToString}, + }; + + use crate::ParserBuilder; + + use super::*; + + fn roundtrip(given: &str, expected: &str) { + roundtrip_with(|b| b, given, expected); + } + + fn roundtrip_bytes(given: &str, expected: &str) { + roundtrip_with(|b| b.utf8(false), given, expected); + } + + fn roundtrip_with(mut f: F, given: &str, expected: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let hir = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&hir, &mut dst).unwrap(); + + // Check that the result is actually valid. + builder.build().parse(&dst).unwrap(); + + assert_eq!(expected, dst); + } + + #[test] + fn print_literal() { + roundtrip("a", "a"); + roundtrip(r"\xff", "\u{FF}"); + roundtrip_bytes(r"\xff", "\u{FF}"); + roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)"); + roundtrip("☃", "☃"); + } + + #[test] + fn print_class() { + roundtrip(r"[a]", r"a"); + roundtrip(r"[ab]", r"[ab]"); + roundtrip(r"[a-z]", r"[a-z]"); + roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); + roundtrip(r"[-]", r"\-"); + roundtrip(r"[☃-⛄]", r"[☃-⛄]"); + + roundtrip(r"(?-u)[a]", r"a"); + roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); + roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); + roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); + + // The following test that the printer escapes meta characters + // in character classes. + roundtrip(r"[\[]", r"\["); + roundtrip(r"[Z-_]", r"[Z-_]"); + roundtrip(r"[Z-_--Z]", r"[\[-_]"); + + // The following test that the printer escapes meta characters + // in byte oriented character classes. + roundtrip_bytes(r"(?-u)[\[]", r"\["); + roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); + roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + + // This tests that an empty character class is correctly roundtripped. + #[cfg(feature = "unicode-gencat")] + roundtrip(r"\P{any}", r"[a&&b]"); + roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); + } + + #[test] + fn print_anchor() { + roundtrip(r"^", r"\A"); + roundtrip(r"$", r"\z"); + roundtrip(r"(?m)^", r"(?m:^)"); + roundtrip(r"(?m)$", r"(?m:$)"); + } + + #[test] + fn print_word_boundary() { + roundtrip(r"\b", r"\b"); + roundtrip(r"\B", r"\B"); + roundtrip(r"(?-u)\b", r"(?-u:\b)"); + roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)"); + } + + #[test] + fn print_repetition() { + roundtrip("a?", "a?"); + roundtrip("a??", "a??"); + roundtrip("(?U)a?", "a??"); + + roundtrip("a*", "a*"); + roundtrip("a*?", "a*?"); + roundtrip("(?U)a*", "a*?"); + + roundtrip("a+", "a+"); + roundtrip("a+?", "a+?"); + roundtrip("(?U)a+", "a+?"); + + roundtrip("a{1}", "a"); + roundtrip("a{2}", "a{2}"); + roundtrip("a{1,}", "a+"); + roundtrip("a{1,5}", "a{1,5}"); + roundtrip("a{1}?", "a"); + roundtrip("a{2}?", "a{2}"); + roundtrip("a{1,}?", "a+?"); + roundtrip("a{1,5}?", "a{1,5}?"); + roundtrip("(?U)a{1}", "a"); + roundtrip("(?U)a{2}", "a{2}"); + roundtrip("(?U)a{1,}", "a+?"); + roundtrip("(?U)a{1,5}", "a{1,5}?"); + + // Test that various zero-length repetitions always translate to an + // empty regex. This is more a property of HIR's smart constructors + // than the printer though. + roundtrip("a{0}", "(?:)"); + roundtrip("(?:ab){0}", "(?:)"); + #[cfg(feature = "unicode-gencat")] + { + roundtrip(r"\p{any}{0}", "(?:)"); + roundtrip(r"\P{any}{0}", "(?:)"); + } + } + + #[test] + fn print_group() { + roundtrip("()", "((?:))"); + roundtrip("(?P)", "(?P(?:))"); + roundtrip("(?:)", "(?:)"); + + roundtrip("(a)", "(a)"); + roundtrip("(?Pa)", "(?Pa)"); + roundtrip("(?:a)", "a"); + + roundtrip("((((a))))", "((((a))))"); + } + + #[test] + fn print_alternation() { + roundtrip("|", "(?:(?:)|(?:))"); + roundtrip("||", "(?:(?:)|(?:)|(?:))"); + + roundtrip("a|b", "[ab]"); + roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); + roundtrip("a|b|c", "[a-c]"); + roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); + roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); + } + + // This is a regression test that stresses a peculiarity of how the HIR + // is both constructed and printed. Namely, it is legal for a repetition + // to directly contain a concatenation. This particular construct isn't + // really possible to build from the concrete syntax directly, since you'd + // be forced to put the concatenation into (at least) a non-capturing + // group. Concurrently, the printer doesn't consider this case and just + // kind of naively prints the child expression and tacks on the repetition + // operator. + // + // As a result, if you attached '+' to a 'concat(a, b)', the printer gives + // you 'ab+', but clearly it really should be '(?:ab)+'. + // + // This bug isn't easy to surface because most ways of building an HIR + // come directly from the concrete syntax, and as mentioned above, it just + // isn't possible to build this kind of HIR from the concrete syntax. + // Nevertheless, this is definitely a bug. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("x".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::literal("ab".as_bytes())), + }), + Hir::literal("y".as_bytes()), + ]); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A\A\z\z)", expr.to_string()); + } + + // Just like regression_repetition_concat, but with the repetition using + // an alternation as a child expression instead. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_alternation() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::literal("cd".as_bytes()), + Hir::literal("ef".as_bytes()), + ])), + }), + Hir::literal("gh".as_bytes()), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string()); + } + + // This regression test is very similar in flavor to + // regression_repetition_concat in that the root of the issue lies in a + // peculiarity of how the HIR is represented and how the printer writes it + // out. Like the other regression, this one is also rooted in the fact that + // you can't produce the peculiar HIR from the concrete syntax. Namely, you + // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally + // be in (at least) a non-capturing group. Why? Because the '|' has very + // low precedence (lower that concatenation), and so something like 'ab|c' + // is actually 'alt(ab, c)'. + // + // See: https://github.com/rust-lang/regex/issues/516 + #[test] + fn regression_alternation_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::alternation(alloc::vec![ + Hir::literal("mn".as_bytes()), + Hir::literal("xy".as_bytes()), + ]), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ]), + ]); + assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); + } +} diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs new file mode 100644 index 0000000..313a1e9 --- /dev/null +++ b/vendor/regex-syntax/src/hir/translate.rs @@ -0,0 +1,3724 @@ +/*! +Defines a translator that converts an `Ast` to an `Hir`. +*/ + +use core::cell::{Cell, RefCell}; + +use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; + +use crate::{ + ast::{self, Ast, Span, Visitor}, + either::Either, + hir::{self, Error, ErrorKind, Hir, HirKind}, + unicode::{self, ClassQuery}, +}; + +type Result = core::result::Result; + +/// A builder for constructing an AST->HIR translator. +#[derive(Clone, Debug)] +pub struct TranslatorBuilder { + utf8: bool, + line_terminator: u8, + flags: Flags, +} + +impl Default for TranslatorBuilder { + fn default() -> TranslatorBuilder { + TranslatorBuilder::new() + } +} + +impl TranslatorBuilder { + /// Create a new translator builder with a default c onfiguration. + pub fn new() -> TranslatorBuilder { + TranslatorBuilder { + utf8: true, + line_terminator: b'\n', + flags: Flags::default(), + } + } + + /// Build a translator using the current configuration. + pub fn build(&self) -> Translator { + Translator { + stack: RefCell::new(vec![]), + flags: Cell::new(self.flags), + utf8: self.utf8, + line_terminator: self.line_terminator, + } + } + + /// When disabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). + /// + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.utf8 = yes; + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { + self.line_terminator = byte; + self + } + + /// Enable or disable the case insensitive flag (`i`) by default. + pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.case_insensitive = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the multi-line matching flag (`m`) by default. + pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.multi_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "dot matches any character" flag (`s`) by + /// default. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the CRLF mode flag (`R`) by default. + pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.crlf = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "swap greed" flag (`U`) by default. + pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.swap_greed = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.unicode = if yes { None } else { Some(false) }; + self + } +} + +/// A translator maps abstract syntax to a high level intermediate +/// representation. +/// +/// A translator may be benefit from reuse. That is, a translator can translate +/// many abstract syntax trees. +/// +/// A `Translator` can be configured in more detail via a +/// [`TranslatorBuilder`]. +#[derive(Clone, Debug)] +pub struct Translator { + /// Our call stack, but on the heap. + stack: RefCell>, + /// The current flag settings. + flags: Cell, + /// Whether we're allowed to produce HIR that can match arbitrary bytes. + utf8: bool, + /// The line terminator to use for `.`. + line_terminator: u8, +} + +impl Translator { + /// Create a new translator using the default configuration. + pub fn new() -> Translator { + TranslatorBuilder::new().build() + } + + /// Translate the given abstract syntax tree (AST) into a high level + /// intermediate representation (HIR). + /// + /// If there was a problem doing the translation, then an HIR-specific + /// error is returned. + /// + /// The original pattern string used to produce the `Ast` *must* also be + /// provided. The translator does not use the pattern string during any + /// correct translation, but is used for error reporting. + pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result { + ast::visit(ast, TranslatorI::new(self, pattern)) + } +} + +/// An HirFrame is a single stack frame, represented explicitly, which is +/// created for each item in the Ast that we traverse. +/// +/// Note that technically, this type doesn't represent our entire stack +/// frame. In particular, the Ast visitor represents any state associated with +/// traversing the Ast itself. +#[derive(Clone, Debug)] +enum HirFrame { + /// An arbitrary HIR expression. These get pushed whenever we hit a base + /// case in the Ast. They get popped after an inductive (i.e., recursive) + /// step is complete. + Expr(Hir), + /// A literal that is being constructed, character by character, from the + /// AST. We need this because the AST gives each individual character its + /// own node. So as we see characters, we peek at the top-most HirFrame. + /// If it's a literal, then we add to it. Otherwise, we push a new literal. + /// When it comes time to pop it, we convert it to an Hir via Hir::literal. + Literal(Vec), + /// A Unicode character class. This frame is mutated as we descend into + /// the Ast of a character class (which is itself its own mini recursive + /// structure). + ClassUnicode(hir::ClassUnicode), + /// A byte-oriented character class. This frame is mutated as we descend + /// into the Ast of a character class (which is itself its own mini + /// recursive structure). + /// + /// Byte character classes are created when Unicode mode (`u`) is disabled. + /// If `utf8` is enabled (the default), then a byte character is only + /// permitted to match ASCII text. + ClassBytes(hir::ClassBytes), + /// This is pushed whenever a repetition is observed. After visiting every + /// sub-expression in the repetition, the translator's stack is expected to + /// have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across repetition operators. + Repetition, + /// This is pushed on to the stack upon first seeing any kind of capture, + /// indicated by parentheses (including non-capturing groups). It is popped + /// upon leaving a group. + Group { + /// The old active flags when this group was opened. + /// + /// If this group sets flags, then the new active flags are set to the + /// result of merging the old flags with the flags introduced by this + /// group. If the group doesn't set any flags, then this is simply + /// equivalent to whatever flags were set when the group was opened. + /// + /// When this group is popped, the active flags should be restored to + /// the flags set here. + /// + /// The "active" flags correspond to whatever flags are set in the + /// Translator. + old_flags: Flags, + }, + /// This is pushed whenever a concatenation is observed. After visiting + /// every sub-expression in the concatenation, the translator's stack is + /// popped until it sees a Concat frame. + Concat, + /// This is pushed whenever an alternation is observed. After visiting + /// every sub-expression in the alternation, the translator's stack is + /// popped until it sees an Alternation frame. + Alternation, + /// This is pushed immediately before each sub-expression in an + /// alternation. This separates the branches of an alternation on the + /// stack and prevents literal flattening from reaching across alternation + /// branches. + /// + /// It is popped after each expression in a branch until an 'Alternation' + /// frame is observed when doing a post visit on an alternation. + AlternationBranch, +} + +impl HirFrame { + /// Assert that the current stack frame is an Hir expression and return it. + fn unwrap_expr(self) -> Hir { + match self { + HirFrame::Expr(expr) => expr, + HirFrame::Literal(lit) => Hir::literal(lit), + _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), + } + } + + /// Assert that the current stack frame is a Unicode class expression and + /// return it. + fn unwrap_class_unicode(self) -> hir::ClassUnicode { + match self { + HirFrame::ClassUnicode(cls) => cls, + _ => panic!( + "tried to unwrap Unicode class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a byte class expression and + /// return it. + fn unwrap_class_bytes(self) -> hir::ClassBytes { + match self { + HirFrame::ClassBytes(cls) => cls, + _ => panic!( + "tried to unwrap byte class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a repetition sentinel. If it + /// isn't, then panic. + fn unwrap_repetition(self) { + match self { + HirFrame::Repetition => {} + _ => { + panic!( + "tried to unwrap repetition from HirFrame, got: {:?}", + self + ) + } + } + } + + /// Assert that the current stack frame is a group indicator and return + /// its corresponding flags (the flags that were active at the time the + /// group was entered). + fn unwrap_group(self) -> Flags { + match self { + HirFrame::Group { old_flags } => old_flags, + _ => { + panic!("tried to unwrap group from HirFrame, got: {:?}", self) + } + } + } + + /// Assert that the current stack frame is an alternation pipe sentinel. If + /// it isn't, then panic. + fn unwrap_alternation_pipe(self) { + match self { + HirFrame::AlternationBranch => {} + _ => { + panic!( + "tried to unwrap alt pipe from HirFrame, got: {:?}", + self + ) + } + } + } +} + +impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { + type Output = Hir; + type Err = Error; + + fn finish(self) -> Result { + // ... otherwise, we should have exactly one HIR on the stack. + assert_eq!(self.trans().stack.borrow().len(), 1); + Ok(self.pop().unwrap().unwrap_expr()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::ClassBracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + Ast::Repetition(_) => self.push(HirFrame::Repetition), + Ast::Group(ref x) => { + let old_flags = x + .flags() + .map(|ast| self.set_flags(ast)) + .unwrap_or_else(|| self.flags()); + self.push(HirFrame::Group { old_flags }); + } + Ast::Concat(_) => { + self.push(HirFrame::Concat); + } + Ast::Alternation(ref x) => { + self.push(HirFrame::Alternation); + if !x.asts.is_empty() { + self.push(HirFrame::AlternationBranch); + } + } + _ => {} + } + Ok(()) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) => { + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Flags(ref x) => { + self.set_flags(&x.flags); + // Flags in the AST are generally considered directives and + // not actual sub-expressions. However, they can be used in + // the concrete syntax like `((?i))`, and we need some kind of + // indication of an expression there, and Empty is the correct + // choice. + // + // There can also be things like `(?i)+`, but we rule those out + // in the parser. In the future, we might allow them for + // consistency sake. + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, + }, + Ast::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); + } + Ast::Assertion(ref x) => { + self.push(HirFrame::Expr(self.hir_assertion(x)?)); + } + Ast::ClassPerl(ref x) => { + if self.flags().unicode() { + let cls = self.hir_perl_unicode_class(x)?; + let hcls = hir::Class::Unicode(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } else { + let cls = self.hir_perl_byte_class(x)?; + let hcls = hir::Class::Bytes(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } + } + Ast::ClassUnicode(ref x) => { + let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); + self.push(HirFrame::Expr(Hir::class(cls))); + } + Ast::ClassBracketed(ref ast) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + let expr = Hir::class(hir::Class::Unicode(cls)); + self.push(HirFrame::Expr(expr)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + let expr = Hir::class(hir::Class::Bytes(cls)); + self.push(HirFrame::Expr(expr)); + } + } + Ast::Repetition(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + self.pop().unwrap().unwrap_repetition(); + self.push(HirFrame::Expr(self.hir_repetition(x, expr))); + } + Ast::Group(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + let old_flags = self.pop().unwrap().unwrap_group(); + self.trans().flags.set(old_flags); + self.push(HirFrame::Expr(self.hir_capture(x, expr))); + } + Ast::Concat(_) => { + let mut exprs = vec![]; + while let Some(expr) = self.pop_concat_expr() { + if !matches!(*expr.kind(), HirKind::Empty) { + exprs.push(expr); + } + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::concat(exprs))); + } + Ast::Alternation(_) => { + let mut exprs = vec![]; + while let Some(expr) = self.pop_alt_expr() { + self.pop().unwrap().unwrap_alternation_pipe(); + exprs.push(expr); + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::alternation(exprs))); + } + } + Ok(()) + } + + fn visit_alternation_in(&mut self) -> Result<()> { + self.push(HirFrame::AlternationBranch); + Ok(()) + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Bracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + // We needn't handle the Union case here since the visitor will + // do it for us. + _ => {} + } + Ok(()) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) => {} + ast::ClassSetItem::Literal(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let byte = self.class_literal_byte(x)?; + cls.push(hir::ClassBytesRange::new(byte, byte)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Range(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let start = self.class_literal_byte(&x.start)?; + let end = self.class_literal_byte(&x.end)?; + cls.push(hir::ClassBytesRange::new(start, end)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Ascii(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_ascii_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_ascii_byte_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Unicode(ref x) => { + let xcls = self.hir_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } + ast::ClassSetItem::Perl(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_perl_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_perl_byte_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Bracketed(ref ast) => { + if self.flags().unicode() { + let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); + cls2.union(&cls1); + self.push(HirFrame::ClassUnicode(cls2)); + } else { + let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); + cls2.union(&cls1); + self.push(HirFrame::ClassBytes(cls2)); + } + } + // This is handled automatically by the visitor. + ast::ClassSetItem::Union(_) => {} + } + Ok(()) + } + + fn visit_class_set_binary_op_pre( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_in( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_post( + &mut self, + op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + use crate::ast::ClassSetBinaryOpKind::*; + + if self.flags().unicode() { + let mut rhs = self.pop().unwrap().unwrap_class_unicode(); + let mut lhs = self.pop().unwrap().unwrap_class_unicode(); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + if self.flags().case_insensitive() { + rhs.try_case_fold_simple().map_err(|_| { + self.error( + op.rhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + lhs.try_case_fold_simple().map_err(|_| { + self.error( + op.lhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut rhs = self.pop().unwrap().unwrap_class_bytes(); + let mut lhs = self.pop().unwrap().unwrap_class_bytes(); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } +} + +/// The internal implementation of a translator. +/// +/// This type is responsible for carrying around the original pattern string, +/// which is not tied to the internal state of a translator. +/// +/// A TranslatorI exists for the time it takes to translate a single Ast. +#[derive(Clone, Debug)] +struct TranslatorI<'t, 'p> { + trans: &'t Translator, + pattern: &'p str, +} + +impl<'t, 'p> TranslatorI<'t, 'p> { + /// Build a new internal translator. + fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { + TranslatorI { trans, pattern } + } + + /// Return a reference to the underlying translator. + fn trans(&self) -> &Translator { + &self.trans + } + + /// Push the given frame on to the call stack. + fn push(&self, frame: HirFrame) { + self.trans().stack.borrow_mut().push(frame); + } + + /// Push the given literal char on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the char + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given char is pushed to the top of the stack. + fn push_char(&self, ch: char) { + let mut buf = [0; 4]; + let bytes = ch.encode_utf8(&mut buf).as_bytes(); + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.extend_from_slice(bytes); + } else { + stack.push(HirFrame::Literal(bytes.to_vec())); + } + } + + /// Push the given literal byte on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the byte + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given byte is pushed to the top of the stack. + fn push_byte(&self, byte: u8) { + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.push(byte); + } else { + stack.push(HirFrame::Literal(vec![byte])); + } + } + + /// Pop the top of the call stack. If the call stack is empty, return None. + fn pop(&self) -> Option { + self.trans().stack.borrow_mut().pop() + } + + /// Pop an HIR expression from the top of the stack for a concatenation. + /// + /// This returns None if the stack is empty or when a concat frame is seen. + /// Otherwise, it panics if it could not find an HIR expression. + fn pop_concat_expr(&self) -> Option { + let frame = self.pop()?; + match frame { + HirFrame::Concat => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or concat, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or concat, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or concat, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or concat, got group") + } + HirFrame::Alternation => { + unreachable!("expected expr or concat, got alt marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or concat, got alt branch marker") + } + } + } + + /// Pop an HIR expression from the top of the stack for an alternation. + /// + /// This returns None if the stack is empty or when an alternation frame is + /// seen. Otherwise, it panics if it could not find an HIR expression. + fn pop_alt_expr(&self) -> Option { + let frame = self.pop()?; + match frame { + HirFrame::Alternation => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or alt, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or alt, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or alt, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or alt, got group") + } + HirFrame::Concat => { + unreachable!("expected expr or alt, got concat marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or alt, got alt branch marker") + } + } + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ErrorKind) -> Error { + Error { kind, pattern: self.pattern.to_string(), span } + } + + /// Return a copy of the active flags. + fn flags(&self) -> Flags { + self.trans().flags.get() + } + + /// Set the flags of this translator from the flags set in the given AST. + /// Then, return the old flags. + fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { + let old_flags = self.flags(); + let mut new_flags = Flags::from_ast(ast_flags); + new_flags.merge(&old_flags); + self.trans().flags.set(new_flags); + old_flags + } + + /// Convert an Ast literal to its scalar representation. + /// + /// When Unicode mode is enabled, then this always succeeds and returns a + /// `char` (Unicode scalar value). + /// + /// When Unicode mode is disabled, then a `char` will still be returned + /// whenever possible. A byte is returned only when invalid UTF-8 is + /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte + /// will result in an error when invalid UTF-8 is not allowed. + fn ast_literal_to_scalar( + &self, + lit: &ast::Literal, + ) -> Result> { + if self.flags().unicode() { + return Ok(Either::Left(lit.c)); + } + let byte = match lit.byte() { + None => return Ok(Either::Left(lit.c)), + Some(byte) => byte, + }; + if byte <= 0x7F { + return Ok(Either::Left(char::try_from(byte).unwrap())); + } + if self.trans().utf8 { + return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); + } + Ok(Either::Right(byte)) + } + + fn case_fold_char(&self, span: Span, c: char) -> Result> { + if !self.flags().case_insensitive() { + return Ok(None); + } + if self.flags().unicode() { + // If case folding won't do anything, then don't bother trying. + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + if !map { + return Ok(None); + } + let mut cls = + hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( + c, c, + )]); + cls.try_case_fold_simple().map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + Ok(Some(Hir::class(hir::Class::Unicode(cls)))) + } else { + if !c.is_ascii() { + return Ok(None); + } + // If case folding won't do anything, then don't bother trying. + match c { + 'A'..='Z' | 'a'..='z' => {} + _ => return Ok(None), + } + let mut cls = + hir::ClassBytes::new(vec![hir::ClassBytesRange::new( + // OK because 'c.len_utf8() == 1' which in turn implies + // that 'c' is ASCII. + u8::try_from(c).unwrap(), + u8::try_from(c).unwrap(), + )]); + cls.case_fold_simple(); + Ok(Some(Hir::class(hir::Class::Bytes(cls)))) + } + } + + fn hir_dot(&self, span: Span) -> Result { + let (utf8, lineterm, flags) = + (self.trans().utf8, self.trans().line_terminator, self.flags()); + if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { + return Err(self.error(span, ErrorKind::InvalidUtf8)); + } + let dot = if flags.dot_matches_new_line() { + if flags.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if flags.unicode() { + if flags.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + if !lineterm.is_ascii() { + return Err( + self.error(span, ErrorKind::InvalidLineTerminator) + ); + } + hir::Dot::AnyCharExcept(char::from(lineterm)) + } + } else { + if flags.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExcept(lineterm) + } + } + }; + Ok(Hir::dot(dot)) + } + + fn hir_assertion(&self, asst: &ast::Assertion) -> Result { + let unicode = self.flags().unicode(); + let multi_line = self.flags().multi_line(); + let crlf = self.flags().crlf(); + Ok(match asst.kind { + ast::AssertionKind::StartLine => Hir::look(if multi_line { + if crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } + } else { + hir::Look::Start + }), + ast::AssertionKind::EndLine => Hir::look(if multi_line { + if crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } + } else { + hir::Look::End + }), + ast::AssertionKind::StartText => Hir::look(hir::Look::Start), + ast::AssertionKind::EndText => Hir::look(hir::Look::End), + ast::AssertionKind::WordBoundary => Hir::look(if unicode { + hir::Look::WordUnicode + } else { + hir::Look::WordAscii + }), + ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { + hir::Look::WordUnicodeNegate + } else { + hir::Look::WordAsciiNegate + }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) + } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), + }) + } + + fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { + let (index, name) = match group.kind { + ast::GroupKind::CaptureIndex(index) => (index, None), + ast::GroupKind::CaptureName { ref name, .. } => { + (name.index, Some(name.name.clone().into_boxed_str())) + } + // The HIR doesn't need to use non-capturing groups, since the way + // in which the data type is defined handles this automatically. + ast::GroupKind::NonCapturing(_) => return expr, + }; + Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) + } + + fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { + let (min, max) = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => (0, Some(1)), + ast::RepetitionKind::ZeroOrMore => (0, None), + ast::RepetitionKind::OneOrMore => (1, None), + ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { + (m, Some(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { + (m, None) + } + ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( + m, + n, + )) => (m, Some(n)), + }; + let greedy = + if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; + Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_unicode_class( + &self, + ast_class: &ast::ClassUnicode, + ) -> Result { + use crate::ast::ClassUnicodeKind::*; + + if !self.flags().unicode() { + return Err( + self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) + ); + } + let query = match ast_class.kind { + OneLetter(name) => ClassQuery::OneLetter(name), + Named(ref name) => ClassQuery::Binary(name), + NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { + property_name: name, + property_value: value, + }, + }; + let mut result = self.convert_unicode_class_error( + &ast_class.span, + unicode::class(query), + ); + if let Ok(ref mut class) = result { + self.unicode_fold_and_negate( + &ast_class.span, + ast_class.negated, + class, + )?; + } + result + } + + fn hir_ascii_unicode_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassUnicode::new( + ascii_class_as_chars(&ast.kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ); + self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_ascii_byte_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassBytes::new( + ascii_class(&ast.kind) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ); + self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_perl_unicode_class( + &self, + ast_class: &ast::ClassPerl, + ) -> Result { + use crate::ast::ClassPerlKind::*; + + assert!(self.flags().unicode()); + let result = match ast_class.kind { + Digit => unicode::perl_digit(), + Space => unicode::perl_space(), + Word => unicode::perl_word(), + }; + let mut class = + self.convert_unicode_class_error(&ast_class.span, result)?; + // We needn't apply case folding here because the Perl Unicode classes + // are already closed under Unicode simple case folding. + if ast_class.negated { + class.negate(); + } + Ok(class) + } + + fn hir_perl_byte_class( + &self, + ast_class: &ast::ClassPerl, + ) -> Result { + use crate::ast::ClassPerlKind::*; + + assert!(!self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), + Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), + Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), + }; + // We needn't apply case folding here because the Perl ASCII classes + // are already closed (under ASCII case folding). + if ast_class.negated { + class.negate(); + } + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if self.trans().utf8 && !class.is_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) + } + + /// Converts the given Unicode specific error to an HIR translation error. + /// + /// The span given should approximate the position at which an error would + /// occur. + fn convert_unicode_class_error( + &self, + span: &Span, + result: core::result::Result, + ) -> Result { + result.map_err(|err| { + let sp = span.clone(); + match err { + unicode::Error::PropertyNotFound => { + self.error(sp, ErrorKind::UnicodePropertyNotFound) + } + unicode::Error::PropertyValueNotFound => { + self.error(sp, ErrorKind::UnicodePropertyValueNotFound) + } + unicode::Error::PerlClassNotFound => { + self.error(sp, ErrorKind::UnicodePerlClassNotFound) + } + } + }) + } + + fn unicode_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassUnicode, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.try_case_fold_simple().map_err(|_| { + self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) + })?; + } + if negated { + class.negate(); + } + Ok(()) + } + + fn bytes_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassBytes, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + if self.trans().utf8 && !class.is_ascii() { + return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); + } + Ok(()) + } + + /// Return a scalar byte value suitable for use as a literal in a byte + /// character class. + fn class_literal_byte(&self, ast: &ast::Literal) -> Result { + match self.ast_literal_to_scalar(ast)? { + Either::Right(byte) => Ok(byte), + Either::Left(ch) => { + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) + } else { + // We can't feasibly support Unicode in + // byte oriented classes. Byte classes don't + // do Unicode case folding. + Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) + } + } + } + } +} + +/// A translator's representation of a regular expression's flags at any given +/// moment in time. +/// +/// Each flag can be in one of three states: absent, present but disabled or +/// present but enabled. +#[derive(Clone, Copy, Debug, Default)] +struct Flags { + case_insensitive: Option, + multi_line: Option, + dot_matches_new_line: Option, + swap_greed: Option, + unicode: Option, + crlf: Option, + // Note that `ignore_whitespace` is omitted here because it is handled + // entirely in the parser. +} + +impl Flags { + fn from_ast(ast: &ast::Flags) -> Flags { + let mut flags = Flags::default(); + let mut enable = true; + for item in &ast.items { + match item.kind { + ast::FlagsItemKind::Negation => { + enable = false; + } + ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { + flags.case_insensitive = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { + flags.multi_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { + flags.dot_matches_new_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { + flags.swap_greed = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { + flags.unicode = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { + flags.crlf = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} + } + } + flags + } + + fn merge(&mut self, previous: &Flags) { + if self.case_insensitive.is_none() { + self.case_insensitive = previous.case_insensitive; + } + if self.multi_line.is_none() { + self.multi_line = previous.multi_line; + } + if self.dot_matches_new_line.is_none() { + self.dot_matches_new_line = previous.dot_matches_new_line; + } + if self.swap_greed.is_none() { + self.swap_greed = previous.swap_greed; + } + if self.unicode.is_none() { + self.unicode = previous.unicode; + } + if self.crlf.is_none() { + self.crlf = previous.crlf; + } + } + + fn case_insensitive(&self) -> bool { + self.case_insensitive.unwrap_or(false) + } + + fn multi_line(&self) -> bool { + self.multi_line.unwrap_or(false) + } + + fn dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line.unwrap_or(false) + } + + fn swap_greed(&self) -> bool { + self.swap_greed.unwrap_or(false) + } + + fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) + } + + fn crlf(&self) -> bool { + self.crlf.unwrap_or(false) + } +} + +fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { + let ranges: Vec<_> = ascii_class(kind) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::ClassBytes::new(ranges) +} + +fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator { + use crate::ast::ClassAsciiKind::*; + + let slice: &'static [(u8, u8)] = match *kind { + Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + Alpha => &[(b'A', b'Z'), (b'a', b'z')], + Ascii => &[(b'\x00', b'\x7F')], + Blank => &[(b'\t', b'\t'), (b' ', b' ')], + Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + Digit => &[(b'0', b'9')], + Graph => &[(b'!', b'~')], + Lower => &[(b'a', b'z')], + Print => &[(b' ', b'~')], + Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], + Space => &[ + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), + ], + Upper => &[(b'A', b'Z')], + Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + }; + slice.iter().copied() +} + +fn ascii_class_as_chars( + kind: &ast::ClassAsciiKind, +) -> impl Iterator { + ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) +} + +#[cfg(test)] +mod tests { + use crate::{ + ast::{self, parse::ParserBuilder, Ast, Position, Span}, + hir::{self, Hir, HirKind, Look, Properties}, + unicode::{self, ClassQuery}, + }; + + use super::*; + + // We create these errors to compare with real hir::Errors in the tests. + // We define equality between TestError and hir::Error to disregard the + // pattern string in hir::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: hir::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &hir::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for hir::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn parse(pattern: &str) -> Ast { + ParserBuilder::new().octal(true).build().parse(pattern).unwrap() + } + + fn t(pattern: &str) -> Hir { + TranslatorBuilder::new() + .utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn t_err(pattern: &str) -> hir::Error { + TranslatorBuilder::new() + .utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap_err() + } + + fn t_bytes(pattern: &str) -> Hir { + TranslatorBuilder::new() + .utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn props(pattern: &str) -> Properties { + t(pattern).properties().clone() + } + + fn props_bytes(pattern: &str) -> Properties { + t_bytes(pattern).properties().clone() + } + + fn hir_lit(s: &str) -> Hir { + hir_blit(s.as_bytes()) + } + + fn hir_blit(s: &[u8]) -> Hir { + Hir::literal(s) + } + + fn hir_capture(index: u32, expr: Hir) -> Hir { + Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) + } + + fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { + Hir::capture(hir::Capture { + index, + name: Some(name.into()), + sub: Box::new(expr), + }) + } + + fn hir_quest(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy, + sub: Box::new(expr), + }) + } + + fn hir_star(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_plus(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_range(greedy: bool, min: u32, max: Option, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub: Box::new(expr), + }) + } + + fn hir_alt(alts: Vec) -> Hir { + Hir::alternation(alts) + } + + fn hir_cat(exprs: Vec) -> Hir { + Hir::concat(exprs) + } + + #[allow(dead_code)] + fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { + Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) + } + + #[allow(dead_code)] + fn hir_uclass_perl_word() -> Hir { + Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) + } + + fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( + ascii_class_as_chars(kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ))) + } + + fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Bytes(hir::ClassBytes::new( + ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ))) + } + + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + Hir::class(uclass(ranges)) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + Hir::class(bclass(ranges)) + } + + fn hir_case_fold(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.case_fold_simple(); + Hir::class(cls) + } + _ => panic!("cannot case fold non-class Hir expr"), + } + } + + fn hir_negate(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.negate(); + Hir::class(cls) + } + _ => panic!("cannot negate non-class Hir expr"), + } + } + + fn uclass(ranges: &[(char, char)]) -> hir::Class { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::Class::Unicode(hir::ClassUnicode::new(ranges)) + } + + fn bclass(ranges: &[(u8, u8)]) -> hir::Class { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::Class::Bytes(hir::ClassBytes::new(ranges)) + } + + #[cfg(feature = "unicode-case")] + fn class_case_fold(mut cls: hir::Class) -> Hir { + cls.case_fold_simple(); + Hir::class(cls) + } + + fn class_negate(mut cls: hir::Class) -> Hir { + cls.negate(); + Hir::class(cls) + } + + #[allow(dead_code)] + fn hir_union(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot union non-class Hir exprs"), + } + } + + #[allow(dead_code)] + fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot difference non-class Hir exprs"), + } + } + + fn hir_look(look: hir::Look) -> Hir { + Hir::look(look) + } + + #[test] + fn empty() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?i)"), Hir::empty()); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?P)"), hir_capture_name(1, "wat", Hir::empty())); + assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); + assert_eq!( + t("()|()"), + hir_alt(vec![ + hir_capture(1, Hir::empty()), + hir_capture(2, Hir::empty()), + ]) + ); + assert_eq!( + t("(|b)"), + hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) + ); + assert_eq!( + t("(a|)"), + hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) + ); + assert_eq!( + t("(a||c)"), + hir_capture( + 1, + hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) + ) + ); + assert_eq!( + t("(||)"), + hir_capture( + 1, + hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) + ) + ); + } + + #[test] + fn literal() { + assert_eq!(t("a"), hir_lit("a")); + assert_eq!(t("(?-u)a"), hir_lit("a")); + assert_eq!(t("☃"), hir_lit("☃")); + assert_eq!(t("abcd"), hir_lit("abcd")); + + assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); + assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t("(?-u)☃"), hir_lit("☃")); + assert_eq!( + t_err(r"(?-u)\xFF"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + } + + #[test] + fn literal_case_insensitive() { + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)ab@c"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_uclass(&[('B', 'B'), ('b', 'b')]), + hir_lit("@"), + hir_uclass(&[('C', 'C'), ('c', 'c')]), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)β"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + + assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?-u)a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("a"), + ]) + ); + assert_eq!( + t("(?i-u)ab@c"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), + hir_lit("@"), + hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), + ]) + ); + + assert_eq!( + t_bytes("(?i-u)a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes("(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes(r"(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t("(?i-u)β"), hir_lit("β"),); + } + + #[test] + fn dot() { + assert_eq!( + t("."), + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) + ); + assert_eq!( + t("(?R)."), + hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!( + t_bytes("(?-u)."), + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) + ); + assert_eq!( + t_bytes("(?R-u)."), + hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\x0C'), + (b'\x0E', b'\xFF'), + ]) + ); + assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + + // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. + assert_eq!( + t_err("(?-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(6, 1, 7) + ), + } + ); + assert_eq!( + t_err("(?R-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err("(?s-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err("(?Rs-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(8, 1, 9) + ), + } + ); + } + + #[test] + fn assertions() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); + assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); + assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); + } + + #[test] + fn group() { + assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); + assert_eq!( + t("(a)(b)"), + hir_cat(vec![ + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), + ]) + ); + assert_eq!( + t("(a)|(b)"), + hir_alt(vec![ + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), + ]) + ); + assert_eq!(t("(?P)"), hir_capture_name(1, "foo", Hir::empty())); + assert_eq!(t("(?Pa)"), hir_capture_name(1, "foo", hir_lit("a"))); + assert_eq!( + t("(?Pa)(?Pb)"), + hir_cat(vec![ + hir_capture_name(1, "foo", hir_lit("a")), + hir_capture_name(2, "bar", hir_lit("b")), + ]) + ); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?:a)"), hir_lit("a")); + assert_eq!( + t("(?:a)(b)"), + hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) + ); + assert_eq!( + t("(a)(?:b)(c)"), + hir_cat(vec![ + hir_capture(1, hir_lit("a")), + hir_lit("b"), + hir_capture(2, hir_lit("c")), + ]) + ); + assert_eq!( + t("(a)(?Pb)(c)"), + hir_cat(vec![ + hir_capture(1, hir_lit("a")), + hir_capture_name(2, "foo", hir_lit("b")), + hir_capture(3, hir_lit("c")), + ]) + ); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); + assert_eq!( + t("(((?x)))"), + hir_capture(1, hir_capture(2, Hir::empty())) + ); + } + + #[test] + fn line_anchors() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + + assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); + assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); + assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); + } + + #[test] + fn flags() { + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i:a)a"), + hir_cat( + vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] + ) + ); + assert_eq!( + t("(?i-u:a)β"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("β"), + ]) + ); + assert_eq!( + t("(?:(?i-u)a)b"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("b"), + ]) + ); + assert_eq!( + t("((?i-u)a)b"), + hir_cat(vec![ + hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("b"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?-i:a)a"), + hir_cat( + vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] + ) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_look(hir::Look::StartLF), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^(?i-m)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_look(hir::Look::StartLF), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_look(hir::Look::Start), + ]) + ); + assert_eq!( + t("(?U)a*a*?(?-U)a*a*?"), + hir_cat(vec![ + hir_star(false, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(false, hir_lit("a")), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?:a(?i)a)a"), + hir_cat(vec![ + hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?:a(?-i)a)a"), + hir_cat(vec![ + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ]), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]) + ); + } + + #[test] + fn escape() { + assert_eq!( + t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), + hir_lit(r"\.+*?()|[]{}^$#") + ); + } + + #[test] + fn repetition() { + assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); + assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); + assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); + assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); + assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); + assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); + + assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); + assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); + + assert_eq!( + t("ab?"), + hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab")))); + assert_eq!( + t("a|b?"), + hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + } + + #[test] + fn cat_alt() { + let a = || hir_look(hir::Look::Start); + let b = || hir_look(hir::Look::End); + let c = || hir_look(hir::Look::WordUnicode); + let d = || hir_look(hir::Look::WordUnicodeNegate); + + assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()]))); + assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); + assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); + assert_eq!( + t(r"^$|$\b|\b\B"), + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) + ); + assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()]))); + assert_eq!( + t(r"(^|$|\b)"), + hir_capture(1, hir_alt(vec![a(), b(), c()])) + ); + assert_eq!( + t(r"(^$|$\b|\b\B)"), + hir_capture( + 1, + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) + ) + ); + assert_eq!( + t(r"(^$|($\b|(\b\B)))"), + hir_capture( + 1, + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_capture( + 2, + hir_alt(vec![ + hir_cat(vec![b(), c()]), + hir_capture(3, hir_cat(vec![c(), d()])), + ]) + ), + ]) + ) + ); + } + + // Tests the HIR transformation of things like '[a-z]|[A-Z]' into + // '[A-Za-z]'. In other words, an alternation of just classes is always + // equivalent to a single class corresponding to the union of the branches + // in that class. (Unless some branches match invalid UTF-8 and others + // match non-ASCII Unicode.) + #[test] + fn cat_class_flattened() { + assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + // Combining all of the letter properties should give us the one giant + // letter property. + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x) + \p{Lowercase_Letter} + |\p{Uppercase_Letter} + |\p{Titlecase_Letter} + |\p{Modifier_Letter} + |\p{Other_Letter} + "), + hir_uclass_query(ClassQuery::Binary("letter")) + ); + // Byte classes that can truly match invalid UTF-8 cannot be combined + // with Unicode classes. + assert_eq!( + t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), + hir_alt(vec![ + hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), + hir_bclass(&[(b'\x90', b'\xFF')]), + hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), + ]) + ); + // Byte classes on their own can be combined, even if some are ASCII + // and others are invalid UTF-8. + assert_eq!( + t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), + hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), + ); + } + + #[test] + fn class_ascii() { + assert_eq!( + t("[[:alnum:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) + ); + assert_eq!( + t("[[:alpha:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) + ); + assert_eq!( + t("[[:ascii:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) + ); + assert_eq!( + t("[[:blank:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Blank) + ); + assert_eq!( + t("[[:cntrl:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) + ); + assert_eq!( + t("[[:digit:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t("[[:graph:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Graph) + ); + assert_eq!( + t("[[:lower:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Lower) + ); + assert_eq!( + t("[[:print:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Print) + ); + assert_eq!( + t("[[:punct:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Punct) + ); + assert_eq!( + t("[[:space:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Space) + ); + assert_eq!( + t("[[:upper:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Upper) + ); + assert_eq!( + t("[[:word:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Word) + ); + assert_eq!( + t("[[:xdigit:]]"), + hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) + ); + + assert_eq!( + t("[[:^lower:]]"), + hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[:lower:]]"), + hir_uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]) + ); + + assert_eq!( + t("(?-u)[[:lower:]]"), + hir_ascii_bclass(&ast::ClassAsciiKind::Lower) + ); + assert_eq!( + t("(?i-u)[[:lower:]]"), + hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) + ); + + assert_eq!( + t_err("(?-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(16, 1, 17) + ), + } + ); + assert_eq!( + t_err("(?i-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(17, 1, 18) + ), + } + ); + } + + #[test] + fn class_ascii_multiple() { + // See: https://github.com/rust-lang/regex/issues/680 + assert_eq!( + t("[[:alnum:][:^ascii:]]"), + hir_union( + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), + hir_uclass(&[('\u{80}', '\u{10FFFF}')]), + ), + ); + assert_eq!( + t_bytes("(?-u)[[:alnum:][:^ascii:]]"), + hir_union( + hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), + hir_bclass(&[(0x80, 0xFF)]), + ), + ); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn class_perl_unicode() { + // Unicode + assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!(t(r"\w"), hir_uclass_perl_word()); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\d"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\s"), + hir_uclass_query(ClassQuery::Binary("space")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); + + // Unicode, negated + assert_eq!( + t(r"\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!( + t(r"\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + + #[test] + fn class_perl_ascii() { + // ASCII only + assert_eq!( + t(r"(?-u)\d"), + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t(r"(?-u)\s"), + hir_ascii_bclass(&ast::ClassAsciiKind::Space) + ); + assert_eq!( + t(r"(?-u)\w"), + hir_ascii_bclass(&ast::ClassAsciiKind::Word) + ); + assert_eq!( + t(r"(?i-u)\d"), + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t(r"(?i-u)\s"), + hir_ascii_bclass(&ast::ClassAsciiKind::Space) + ); + assert_eq!( + t(r"(?i-u)\w"), + hir_ascii_bclass(&ast::ClassAsciiKind::Word) + ); + + // ASCII only, negated + assert_eq!( + t_bytes(r"(?-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t_bytes(r"(?-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + assert_eq!( + t_bytes(r"(?i-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?i-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t_bytes(r"(?i-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + + // ASCII only, negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. + assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn class_perl_word_disabled() { + assert_eq!( + t_err(r"\w"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] + fn class_perl_space_disabled() { + assert_eq!( + t_err(r"\s"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all( + not(feature = "unicode-perl"), + not(feature = "unicode-gencat") + ))] + fn class_perl_digit_disabled() { + assert_eq!( + t_err(r"\d"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_gencat() { + assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{se PaRa ToR}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc:Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc=Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{Other}"), + hir_uclass_query(ClassQuery::Binary("Other")) + ); + assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); + + assert_eq!( + t(r"\PZ"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{gc!=separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + + assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); + assert_eq!( + t(r"\p{assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + assert_eq!( + t(r"\p{gc:any}"), + hir_uclass_query(ClassQuery::Binary("Any")) + ); + assert_eq!( + t(r"\p{gc:assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{gc:ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + + assert_eq!( + t_err(r"(?-u)\pZ"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 9) + ), + } + ); + assert_eq!( + t_err(r"(?-u)\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(18, 1, 19) + ), + } + ); + assert_eq!( + t_err(r"\pE"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(3, 1, 4) + ), + } + ); + assert_eq!( + t_err(r"\p{Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err(r"\p{gc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-gencat"))] + fn class_unicode_gencat_disabled() { + assert_eq!( + t_err(r"\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + + assert_eq!( + t_err(r"\p{Any}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-script")] + fn class_unicode_script() { + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "Greek" + )))) + ); + + assert_eq!( + t_err(r"\p{sc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + assert_eq!( + t_err(r"\p{scx:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-script"))] + fn class_unicode_script_disabled() { + assert_eq!( + t_err(r"\p{Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 10) + ), + } + ); + + assert_eq!( + t_err(r"\p{scx:Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-age")] + fn class_unicode_age() { + assert_eq!( + t_err(r"\p{age:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_any_empty() { + assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); + } + + #[test] + #[cfg(not(feature = "unicode-age"))] + fn class_unicode_age_disabled() { + assert_eq!( + t_err(r"\p{age:3.0}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + fn class_bracketed() { + assert_eq!(t("[a]"), hir_lit("a")); + assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')])); + assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); + assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); + assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); + assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); + assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\p{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\PZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\P{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(all( + feature = "unicode-case", + any(feature = "unicode-perl", feature = "unicode-gencat") + ))] + assert_eq!( + t(r"(?i)[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\P{greek}]"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) + ); + + assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); + assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); + assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[k]"), + hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[β]"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); + + assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')]))); + assert_eq!( + t_bytes("(?-u)[^a]"), + class_negate(bclass(&[(b'a', b'a')])) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!( + t(r"[^\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\pZ]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\p{separator}]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\p{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[\P{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + + // Test some weird cases. + assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); + + assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); + assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); + + assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); + assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); + + assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); + assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); + + assert_eq!( + t_err("(?-u)[^a]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); + } + + #[test] + fn class_bracketed_union() { + assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[a\pZb]"), + hir_union( + hir_uclass(&[('a', 'b')]), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] + assert_eq!( + t(r"[\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[\p{age:3.0}\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("cyrillic")), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ) + ); + + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), + hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ))) + ); + } + + #[test] + fn class_bracketed_nested() { + assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[]))); + + assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); + assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a[^c]]"), + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a-b[^c]]"), + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) + ); + + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[^a-b[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')]) + ); + + assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); + } + + #[test] + fn class_bracketed_intersect() { + assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); + assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + + assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); + assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); + assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&b-c]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[abc]&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_uclass(&[('c', 'x')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[c-da-b&&a-d]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-d&&c-da-b]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + + assert_eq!( + t("(?i-u)[abc&&b-c]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[abc&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[[abc]&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_bclass(&[(b'c', b'x')])) + ); + assert_eq!( + t("(?i-u)[c-da-b&&a-d]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + assert_eq!( + t("(?i-u)[a-d&&c-da-b]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); + // `]` needs to be escaped after `&&` since it's not at start of class. + assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); + assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); + // Test precedence. + assert_eq!( + t(r"[a-w&&[^c-g]z]"), + hir_uclass(&[('a', 'b'), ('h', 'w')]) + ); + } + + #[test] + fn class_bracketed_intersect_negate() { + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^\w&&\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[\w&&\d]]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[^\w&&\d]]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); + + #[cfg(feature = "unicode-perl")] + assert_eq!( + t_bytes(r"(?-u)[^\w&&\d]"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)[^[a-z&&a-c]]"), + hir_negate(hir_bclass(&[(b'a', b'c')])) + ); + assert_eq!( + t_bytes(r"(?-u)[^[\w&&\d]]"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)[^[^\w&&\d]]"), + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) + ); + assert_eq!( + t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + } + + #[test] + fn class_bracketed_difference() { + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pL--[:ascii:]]"), + hir_difference( + hir_uclass_query(ClassQuery::Binary("letter")), + hir_uclass(&[('\0', '\x7F')]) + ) + ); + + assert_eq!( + t(r"(?-u)[[:alpha:]--[:lower:]]"), + hir_bclass(&[(b'A', b'Z')]) + ); + } + + #[test] + fn class_bracketed_symmetric_difference() { + #[cfg(feature = "unicode-script")] + assert_eq!( + t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), + hir_uclass(&[ + ('\u{0342}', '\u{0342}'), + ('\u{0345}', '\u{0345}'), + ('\u{1DC0}', '\u{1DC1}'), + ]) + ); + assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')])); + + assert_eq!( + t(r"(?-u)[a-g~~c-j]"), + hir_bclass(&[(b'a', b'b'), (b'h', b'j')]) + ); + } + + #[test] + fn ignore_whitespace() { + assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); + assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment +{ # comment + 53 # comment +} #comment"), + hir_lit("S") + ); + + assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment + 53 # comment"), + hir_lit("S") + ); + assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x)\p # comment +{ # comment + Separator # comment +} # comment"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + + assert_eq!( + t(r"(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +} # comment"), + hir_range(true, 5, Some(10), hir_lit("a")) + ); + + assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); + } + + #[test] + fn analysis_is_utf8() { + // Positive examples. + assert!(props_bytes(r"a").is_utf8()); + assert!(props_bytes(r"ab").is_utf8()); + assert!(props_bytes(r"(?-u)a").is_utf8()); + assert!(props_bytes(r"(?-u)ab").is_utf8()); + assert!(props_bytes(r"\xFF").is_utf8()); + assert!(props_bytes(r"\xFF\xFF").is_utf8()); + assert!(props_bytes(r"[^a]").is_utf8()); + assert!(props_bytes(r"[^a][^a]").is_utf8()); + assert!(props_bytes(r"\b").is_utf8()); + assert!(props_bytes(r"\B").is_utf8()); + assert!(props_bytes(r"(?-u)\b").is_utf8()); + assert!(props_bytes(r"(?-u)\B").is_utf8()); + + // Negative examples. + assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); + } + + #[test] + fn analysis_captures_len() { + assert_eq!(0, props(r"a").explicit_captures_len()); + assert_eq!(0, props(r"(?:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u)a").explicit_captures_len()); + assert_eq!(1, props(r"(a)").explicit_captures_len()); + assert_eq!(1, props(r"(?Pa)").explicit_captures_len()); + assert_eq!(1, props(r"()").explicit_captures_len()); + assert_eq!(1, props(r"()a").explicit_captures_len()); + assert_eq!(1, props(r"(a)+").explicit_captures_len()); + assert_eq!(2, props(r"(a)(b)").explicit_captures_len()); + assert_eq!(2, props(r"(a)|(b)").explicit_captures_len()); + assert_eq!(2, props(r"((a))").explicit_captures_len()); + assert_eq!(1, props(r"([a&&b])").explicit_captures_len()); + } + + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_explicit_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); + assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?Pfoo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); + } + + #[test] + fn analysis_is_all_assertions() { + // Positive examples. + let p = props(r"\b"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\A"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\z"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$^\z\A\b\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$|^|\z|\A|\b|\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^$|$^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"((\b)+())*^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + // Negative examples. + let p = props(r"^a"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(1)); + } + + #[test] + fn analysis_look_set_prefix_any() { + let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"); + assert!(p.look_set_prefix_any().contains(Look::WordAscii)); + } + + #[test] + fn analysis_is_anchored() { + let is_start = |p| props(p).look_set_prefix().contains(Look::Start); + let is_end = |p| props(p).look_set_suffix().contains(Look::End); + + // Positive examples. + assert!(is_start(r"^")); + assert!(is_end(r"$")); + + assert!(is_start(r"^^")); + assert!(props(r"$$").look_set_suffix().contains(Look::End)); + + assert!(is_start(r"^$")); + assert!(is_end(r"^$")); + + assert!(is_start(r"^foo")); + assert!(is_end(r"foo$")); + + assert!(is_start(r"^foo|^bar")); + assert!(is_end(r"foo$|bar$")); + + assert!(is_start(r"^(foo|bar)")); + assert!(is_end(r"(foo|bar)$")); + + assert!(is_start(r"^+")); + assert!(is_end(r"$+")); + assert!(is_start(r"^++")); + assert!(is_end(r"$++")); + assert!(is_start(r"(^)+")); + assert!(is_end(r"($)+")); + + assert!(is_start(r"$^")); + assert!(is_start(r"$^")); + assert!(is_start(r"$^|^$")); + assert!(is_end(r"$^|^$")); + + assert!(is_start(r"\b^")); + assert!(is_end(r"$\b")); + assert!(is_start(r"^(?m:^)")); + assert!(is_end(r"(?m:$)$")); + assert!(is_start(r"(?m:^)^")); + assert!(is_end(r"$(?m:$)")); + + // Negative examples. + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"(?m:^$)|$^")); + assert!(!is_end(r"(?m:^$)|$^")); + assert!(!is_start(r"$^|(?m:^$)")); + assert!(!is_end(r"$^|(?m:^$)")); + + assert!(!is_start(r"a^")); + assert!(!is_start(r"$a")); + + assert!(!is_end(r"a^")); + assert!(!is_end(r"$a")); + + assert!(!is_start(r"^foo|bar")); + assert!(!is_end(r"foo|bar$")); + + assert!(!is_start(r"^*")); + assert!(!is_end(r"$*")); + assert!(!is_start(r"^*+")); + assert!(!is_end(r"$*+")); + assert!(!is_start(r"^+*")); + assert!(!is_end(r"$+*")); + assert!(!is_start(r"(^)*")); + assert!(!is_end(r"($)*")); + } + + #[test] + fn analysis_is_any_anchored() { + let is_start = |p| props(p).look_set().contains(Look::Start); + let is_end = |p| props(p).look_set().contains(Look::End); + + // Positive examples. + assert!(is_start(r"^")); + assert!(is_end(r"$")); + assert!(is_start(r"\A")); + assert!(is_end(r"\z")); + + // Negative examples. + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"$")); + assert!(!is_end(r"^")); + } + + #[test] + fn analysis_can_empty() { + // Positive examples. + let assert_empty = + |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); + assert_empty(r""); + assert_empty(r"()"); + assert_empty(r"()*"); + assert_empty(r"()+"); + assert_empty(r"()?"); + assert_empty(r"a*"); + assert_empty(r"a?"); + assert_empty(r"a{0}"); + assert_empty(r"a{0,}"); + assert_empty(r"a{0,1}"); + assert_empty(r"a{0,10}"); + #[cfg(feature = "unicode-gencat")] + assert_empty(r"\pL*"); + assert_empty(r"a*|b"); + assert_empty(r"b|a*"); + assert_empty(r"a|"); + assert_empty(r"|a"); + assert_empty(r"a||b"); + assert_empty(r"a*a?(abcd)*"); + assert_empty(r"^"); + assert_empty(r"$"); + assert_empty(r"(?m)^"); + assert_empty(r"(?m)$"); + assert_empty(r"\A"); + assert_empty(r"\z"); + assert_empty(r"\B"); + assert_empty(r"(?-u)\B"); + assert_empty(r"\b"); + assert_empty(r"(?-u)\b"); + + // Negative examples. + let assert_non_empty = + |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); + assert_non_empty(r"a+"); + assert_non_empty(r"a{1}"); + assert_non_empty(r"a{1,}"); + assert_non_empty(r"a{1,2}"); + assert_non_empty(r"a{1,10}"); + assert_non_empty(r"b|a"); + assert_non_empty(r"a*a+(abcd)*"); + #[cfg(feature = "unicode-gencat")] + assert_non_empty(r"\P{any}"); + assert_non_empty(r"[a--a]"); + assert_non_empty(r"[a&&b]"); + } + + #[test] + fn analysis_is_literal() { + // Positive examples. + assert!(props(r"a").is_literal()); + assert!(props(r"ab").is_literal()); + assert!(props(r"abc").is_literal()); + assert!(props(r"(?m)abc").is_literal()); + assert!(props(r"(?:a)").is_literal()); + assert!(props(r"foo(?:a)").is_literal()); + assert!(props(r"(?:a)foo").is_literal()); + assert!(props(r"[a]").is_literal()); + + // Negative examples. + assert!(!props(r"").is_literal()); + assert!(!props(r"^").is_literal()); + assert!(!props(r"a|b").is_literal()); + assert!(!props(r"(a)").is_literal()); + assert!(!props(r"a+").is_literal()); + assert!(!props(r"foo(a)").is_literal()); + assert!(!props(r"(a)foo").is_literal()); + assert!(!props(r"[ab]").is_literal()); + } + + #[test] + fn analysis_is_alternation_literal() { + // Positive examples. + assert!(props(r"a").is_alternation_literal()); + assert!(props(r"ab").is_alternation_literal()); + assert!(props(r"abc").is_alternation_literal()); + assert!(props(r"(?m)abc").is_alternation_literal()); + assert!(props(r"foo|bar").is_alternation_literal()); + assert!(props(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"[a]").is_alternation_literal()); + assert!(props(r"(?:ab)|cd").is_alternation_literal()); + assert!(props(r"ab|(?:cd)").is_alternation_literal()); + + // Negative examples. + assert!(!props(r"").is_alternation_literal()); + assert!(!props(r"^").is_alternation_literal()); + assert!(!props(r"(a)").is_alternation_literal()); + assert!(!props(r"a+").is_alternation_literal()); + assert!(!props(r"foo(a)").is_alternation_literal()); + assert!(!props(r"(a)foo").is_alternation_literal()); + assert!(!props(r"[ab]").is_alternation_literal()); + assert!(!props(r"[ab]|b").is_alternation_literal()); + assert!(!props(r"a|[ab]").is_alternation_literal()); + assert!(!props(r"(a)|b").is_alternation_literal()); + assert!(!props(r"a|(b)").is_alternation_literal()); + assert!(!props(r"a|b").is_alternation_literal()); + assert!(!props(r"a|b|c").is_alternation_literal()); + assert!(!props(r"[a]|b").is_alternation_literal()); + assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"(?:a)|b").is_alternation_literal()); + assert!(!props(r"a|(?:b)").is_alternation_literal()); + assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal()); + } + + // This tests that the smart Hir::repetition constructors does some basic + // simplifications. + #[test] + fn smart_repetition() { + assert_eq!(t(r"a{0}"), Hir::empty()); + assert_eq!(t(r"a{1}"), hir_lit("a")); + assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate)); + } + + // This tests that the smart Hir::concat constructor simplifies the given + // exprs in a way we expect. + #[test] + fn smart_concat() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("abc"), hir_lit("abc")); + assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); + assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); + assert_eq!( + t("foo(?:bar^baz)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + assert_eq!( + t("foo(?:ba(?:r^b)az)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + } + + // This tests that the smart Hir::alternation constructor simplifies the + // given exprs in a way we expect. + #[test] + fn smart_alternation() { + assert_eq!( + t("(?:foo)|(?:bar)"), + hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) + ); + assert_eq!( + t("quux|(?:abc|def|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("quux|(?:abc|(?:def|mno)|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("mno"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("a|b|c|d|e|f|x|y|z"), + hir_uclass(&[('a', 'f'), ('x', 'z')]), + ); + // Tests that we lift common prefixes out of an alternation. + assert_eq!( + t("[A-Z]foo|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z][A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![Hir::empty(), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z]foo|[A-Z]foobar"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), + ]), + ); + } + + #[test] + fn regression_alt_empty_concat() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); + } + + #[test] + fn regression_empty_alt() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); + } + + #[test] + fn regression_singleton_alt() { + use crate::{ + ast::{self, Ast}, + hir::Dot, + }; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::dot(span)], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 + #[test] + fn regression_fuzz_match() { + let pat = "[(\u{6} \0-\u{afdf5}] \0 "; + let ast = ParserBuilder::new() + .octal(false) + .ignore_whitespace(true) + .build() + .parse(pat) + .unwrap(); + let hir = TranslatorBuilder::new() + .utf8(true) + .case_insensitive(false) + .multi_line(false) + .dot_matches_new_line(false) + .swap_greed(true) + .unicode(true) + .build() + .translate(pat, &ast) + .unwrap(); + assert_eq!( + hir, + Hir::concat(vec![ + hir_uclass(&[('\0', '\u{afdf5}')]), + hir_lit("\0"), + ]) + ); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 + #[cfg(feature = "unicode")] + #[test] + fn regression_fuzz_difference1() { + let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; + let _ = t(pat); // shouldn't panic + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 + #[test] + fn regression_fuzz_char_decrement1() { + let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0] Result; + + /// This method is called before beginning traversal of the HIR. + fn start(&mut self) {} + + /// This method is called on an `Hir` before descending into child `Hir` + /// nodes. + fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Hir` after descending all of its child + /// `Hir` nodes. + fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an alternation. + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of a concatenation. + fn visit_concat_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Hir` while calling +/// appropriate methods provided by the [`Visitor`] trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Hir` without using a stack size proportional to the depth +/// of the `Hir`. Namely, this method will instead use constant stack space, +/// but will use heap space proportional to the size of the `Hir`. This may be +/// desirable in cases where the size of `Hir` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit(hir: &Hir, visitor: V) -> Result { + HeapVisitor::new().visit(hir, visitor) +} + +/// HeapVisitor visits every item in an `Hir` recursively using constant stack +/// size and a heap size proportional to the size of the `Hir`. +struct HeapVisitor<'a> { + /// A stack of `Hir` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Hir, Frame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Hir`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a hir::Repetition), + /// A stack frame allocated just before descending into a capture's child + /// node. + Capture(&'a hir::Capture), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![] } + } + + fn visit( + &mut self, + mut hir: &'a Hir, + mut visitor: V, + ) -> Result { + self.stack.clear(); + + visitor.start(); + loop { + visitor.visit_pre(hir)?; + if let Some(x) = self.induct(hir) { + let child = x.child(); + self.stack.push((hir, x)); + hir = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(hir)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_hir, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_hir, frame)) => (post_hir, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + match x { + Frame::Alternation { .. } => { + visitor.visit_alternation_in()?; + } + Frame::Concat { .. } => { + visitor.visit_concat_in()?; + } + _ => {} + } + hir = x.child(); + self.stack.push((post_hir, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this HIR, so we can post visit it now. + visitor.visit_post(post_hir)?; + } + } + } + + /// Build a stack frame for the given HIR if one is needed (which occurs if + /// and only if there are child nodes in the HIR). Otherwise, return None. + fn induct(&mut self, hir: &'a Hir) -> Option> { + match *hir.kind() { + HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), + HirKind::Capture(ref x) => Some(Frame::Capture(x)), + HirKind::Concat(ref x) if x.is_empty() => None, + HirKind::Concat(ref x) => { + Some(Frame::Concat { head: &x[0], tail: &x[1..] }) + } + HirKind::Alternation(ref x) if x.is_empty() => None, + HirKind::Alternation(ref x) => { + Some(Frame::Alternation { head: &x[0], tail: &x[1..] }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Capture(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child HIR node to visit. + fn child(&self) -> &'a Hir { + match *self { + Frame::Repetition(rep) => &rep.sub, + Frame::Capture(capture) => &capture.sub, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} diff --git a/vendor/regex-syntax/src/lib.rs b/vendor/regex-syntax/src/lib.rs new file mode 100644 index 0000000..20f25db --- /dev/null +++ b/vendor/regex-syntax/src/lib.rs @@ -0,0 +1,431 @@ +/*! +This crate provides a robust regular expression parser. + +This crate defines two primary types: + +* [`Ast`](ast::Ast) is the abstract syntax of a regular expression. + An abstract syntax corresponds to a *structured representation* of the + concrete syntax of a regular expression, where the concrete syntax is the + pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it + can be converted back to the original concrete syntax (modulo some details, + like whitespace). To a first approximation, the abstract syntax is complex + and difficult to analyze. +* [`Hir`](hir::Hir) is the high-level intermediate representation + ("HIR" or "high-level IR" for short) of regular expression. It corresponds to + an intermediate state of a regular expression that sits between the abstract + syntax and the low level compiled opcodes that are eventually responsible for + executing a regular expression search. Given some high-level IR, it is not + possible to produce the original concrete syntax (although it is possible to + produce an equivalent concrete syntax, but it will likely scarcely resemble + the original pattern). To a first approximation, the high-level IR is simple + and easy to analyze. + +These two types come with conversion routines: + +* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an +[`Ast`](ast::Ast). +* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a +[`Hir`](hir::Hir). + +As a convenience, the above two conversion routines are combined into one via +the top-level [`Parser`] type. This `Parser` will first convert your pattern to +an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as top-level +[`parse`] free function. + + +# Example + +This example shows how to parse a pattern string into its HIR: + +``` +use regex_syntax::{hir::Hir, parse}; + +let hir = parse("a|b")?; +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), +])); +# Ok::<(), Box>(()) +``` + + +# Concrete syntax supported + +The concrete syntax is documented as part of the public API of the +[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax). + + +# Input safety + +A key feature of this library is that it is safe to use with end user facing +input. This plays a significant role in the internal implementation. In +particular: + +1. Parsers provide a `nest_limit` option that permits callers to control how + deeply nested a regular expression is allowed to be. This makes it possible + to do case analysis over an `Ast` or an `Hir` using recursion without + worrying about stack overflow. +2. Since relying on a particular stack size is brittle, this crate goes to + great lengths to ensure that all interactions with both the `Ast` and the + `Hir` do not use recursion. Namely, they use constant stack space and heap + space proportional to the size of the original pattern string (in bytes). + This includes the type's corresponding destructors. (One exception to this + is literal extraction, but this will eventually get fixed.) + + +# Error reporting + +The `Display` implementations on all `Error` types exposed in this library +provide nice human readable errors that are suitable for showing to end users +in a monospace font. + + +# Literal extraction + +This crate provides limited support for [literal extraction from `Hir` +values](hir::literal). Be warned that literal extraction uses recursion, and +therefore, stack size proportional to the size of the `Hir`. + +The purpose of literal extraction is to speed up searches. That is, if you +know a regular expression must match a prefix or suffix literal, then it is +often quicker to search for instances of that literal, and then confirm or deny +the match using the full regular expression engine. These optimizations are +done automatically in the `regex` crate. + + +# Crate features + +An important feature provided by this crate is its Unicode support. This +includes things like case folding, boolean properties, general categories, +scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`. +However, a downside of this support is that it requires bundling several +Unicode data tables that are substantial in size. + +A fair number of use cases do not require full Unicode support. For this +reason, this crate exposes a number of features to control which Unicode +data is available. + +If a regular expression attempts to use a Unicode feature that is not available +because the corresponding crate feature was disabled, then translating that +regular expression to an `Hir` will return an error. (It is still possible +construct an `Ast` for such a regular expression, since Unicode data is not +used until translation to an `Hir`.) Stated differently, enabling or disabling +any of the features below can only add or subtract from the total set of valid +regular expressions. Enabling or disabling a feature will never modify the +match semantics of a regular expression. + +The following features are available: + +* **std** - + Enables support for the standard library. This feature is enabled by default. + When disabled, only `core` and `alloc` are used. Otherwise, enabling `std` + generally just enables `std::error::Error` trait impls for the various error + types. +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. +* **arbitrary** - + Enabling this feature introduces a public dependency on the + [`arbitrary`](https://crates.io/crates/arbitrary) + crate. Namely, it implements the `Arbitrary` trait from that crate for the + [`Ast`](crate::ast::Ast) type. This feature is disabled by default. +*/ + +#![no_std] +#![forbid(unsafe_code)] +#![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![warn(missing_debug_implementations)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +#[cfg(any(test, feature = "std"))] +extern crate std; + +extern crate alloc; + +pub use crate::{ + error::Error, + parser::{parse, Parser, ParserBuilder}, + unicode::UnicodeWordError, +}; + +use alloc::string::String; + +pub mod ast; +mod debug; +mod either; +mod error; +pub mod hir; +mod parser; +mod rank; +mod unicode; +mod unicode_tables; +pub mod utf8; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + let mut quoted = String::new(); + escape_into(text, &mut quoted); + quoted +} + +/// Escapes all meta characters in `text` and writes the result into `buf`. +/// +/// This will append escape characters into the given buffer. The characters +/// that are appended are safe to use as a literal in a regular expression. +pub fn escape_into(text: &str, buf: &mut String) { + buf.reserve(text.len()); + for c in text.chars() { + if is_meta_character(c) { + buf.push('\\'); + } + buf.push(c); + } +} + +/// Returns true if the given character has significance in a regex. +/// +/// Generally speaking, these are the only characters which _must_ be escaped +/// in order to match their literal meaning. For example, to match a literal +/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For +/// example, `-` is treated as a meta character because of its significance +/// for writing ranges inside of character classes, but the regex `-` will +/// match a literal `-` because `-` has no special meaning outside of character +/// classes. +/// +/// In order to determine whether a character may be escaped at all, the +/// [`is_escapeable_character`] routine should be used. The difference between +/// `is_meta_character` and `is_escapeable_character` is that the latter will +/// return true for some characters that are _not_ meta characters. For +/// example, `%` and `\%` both match a literal `%` in all contexts. In other +/// words, `is_escapeable_character` includes "superfluous" escapes. +/// +/// Note that the set of characters for which this function returns `true` or +/// `false` is fixed and won't change in a semver compatible release. (In this +/// case, "semver compatible release" actually refers to the `regex` crate +/// itself, since reducing or expanding the set of meta characters would be a +/// breaking change for not just `regex-syntax` but also `regex` itself.) +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_meta_character; +/// +/// assert!(is_meta_character('?')); +/// assert!(is_meta_character('-')); +/// assert!(is_meta_character('&')); +/// assert!(is_meta_character('#')); +/// +/// assert!(!is_meta_character('%')); +/// assert!(!is_meta_character('/')); +/// assert!(!is_meta_character('!')); +/// assert!(!is_meta_character('"')); +/// assert!(!is_meta_character('e')); +/// ``` +pub fn is_meta_character(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' + | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, + _ => false, + } +} + +/// Returns true if the given character can be escaped in a regex. +/// +/// This returns true in all cases that `is_meta_character` returns true, but +/// also returns true in some cases where `is_meta_character` returns false. +/// For example, `%` is not a meta character, but it is escapeable. That is, +/// `%` and `\%` both match a literal `%` in all contexts. +/// +/// The purpose of this routine is to provide knowledge about what characters +/// may be escaped. Namely, most regex engines permit "superfluous" escapes +/// where characters without any special significance may be escaped even +/// though there is no actual _need_ to do so. +/// +/// This will return false for some characters. For example, `e` is not +/// escapeable. Therefore, `\e` will either result in a parse error (which is +/// true today), or it could backwards compatibly evolve into a new construct +/// with its own meaning. Indeed, that is the purpose of banning _some_ +/// superfluous escapes: it provides a way to evolve the syntax in a compatible +/// manner. +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_escapeable_character; +/// +/// assert!(is_escapeable_character('?')); +/// assert!(is_escapeable_character('-')); +/// assert!(is_escapeable_character('&')); +/// assert!(is_escapeable_character('#')); +/// assert!(is_escapeable_character('%')); +/// assert!(is_escapeable_character('/')); +/// assert!(is_escapeable_character('!')); +/// assert!(is_escapeable_character('"')); +/// +/// assert!(!is_escapeable_character('e')); +/// ``` +pub fn is_escapeable_character(c: char) -> bool { + // Certainly escapeable if it's a meta character. + if is_meta_character(c) { + return true; + } + // Any character that isn't ASCII is definitely not escapeable. There's + // no real need to allow things like \☃ right? + if !c.is_ascii() { + return false; + } + // Otherwise, we basically say that everything is escapeable unless it's a + // letter or digit. Things like \3 are either octal (when enabled) or an + // error, and we should keep it that way. Otherwise, letters are reserved + // for adding new syntax in a backwards compatible way. + match c { + '0'..='9' | 'A'..='Z' | 'a'..='z' => false, + // While not currently supported, we keep these as not escapeable to + // give us some flexibility with respect to supporting the \< and + // \> word boundary assertions in the future. By rejecting them as + // escapeable, \< and \> will result in a parse error. Thus, we can + // turn them into something else in the future without it being a + // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. + '<' | '>' => false, + _ => true, + } +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Panics +/// +/// If the `unicode-perl` feature is not enabled, then this function +/// panics. For this reason, it is recommended that callers use +/// [`try_is_word_character`] instead. +pub fn is_word_character(c: char) -> bool { + try_is_word_character(c).expect("unicode-perl feature must be enabled") +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Errors +/// +/// If the `unicode-perl` feature is not enabled, then this function always +/// returns an error. +pub fn try_is_word_character( + c: char, +) -> core::result::Result { + unicode::is_word_character(c) +} + +/// Returns true if and only if the given character is an ASCII word character. +/// +/// An ASCII word character is defined by the following character class: +/// `[_0-9a-zA-Z]`. +pub fn is_word_byte(c: u8) -> bool { + match c { + b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use alloc::string::ToString; + + use super::*; + + #[test] + fn escape_meta() { + assert_eq!( + escape(r"\.+*?()|[]{}^$#&-~"), + r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string() + ); + } + + #[test] + fn word_byte() { + assert!(is_word_byte(b'a')); + assert!(!is_word_byte(b'-')); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn word_char() { + assert!(is_word_character('a'), "ASCII"); + assert!(is_word_character('à'), "Latin-1"); + assert!(is_word_character('β'), "Greek"); + assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)"); + assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)"); + assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)"); + assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)"); + assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)"); + assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)"); + assert!(!is_word_character('-')); + assert!(!is_word_character('☃')); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_panic() { + assert!(is_word_character('a')); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_error() { + assert!(try_is_word_character('a').is_err()); + } +} diff --git a/vendor/regex-syntax/src/parser.rs b/vendor/regex-syntax/src/parser.rs new file mode 100644 index 0000000..f482b84 --- /dev/null +++ b/vendor/regex-syntax/src/parser.rs @@ -0,0 +1,254 @@ +use crate::{ast, hir, Error}; + +/// A convenience routine for parsing a regex using default options. +/// +/// This is equivalent to `Parser::new().parse(pattern)`. +/// +/// If you need to set non-default options, then use a [`ParserBuilder`]. +/// +/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically +/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator +/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then +/// you should use a [`ast::parse::Parser`]. +pub fn parse(pattern: &str) -> Result { + Parser::new().parse(pattern) +} + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +/// +/// This type combines the builder options for both the [AST +/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR +/// `TranslatorBuilder`](hir::translate::TranslatorBuilder). +#[derive(Clone, Debug, Default)] +pub struct ParserBuilder { + ast: ast::parse::ParserBuilder, + hir: hir::translate::TranslatorBuilder, +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder::default() + } + + /// Build a parser from this configuration with the given pattern. + pub fn build(&self) -> Parser { + Parser { ast: self.ast.build(), hir: self.hir.build() } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.ast.nest_limit(limit); + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.octal(yes); + self + } + + /// When disabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). + /// + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.utf8(yes); + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.ignore_whitespace(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.case_insensitive(yes); + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.multi_line(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.dot_matches_new_line(yes); + self + } + + /// Enable or disable the CRLF mode flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. + /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.crlf(yes); + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder { + self.hir.line_terminator(byte); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `utf8` is disabled (it's enabled by default), a + /// regular expression will fail to parse if Unicode mode is disabled and a + /// sub-expression could possibly match invalid UTF-8. + pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.unicode(yes); + self + } +} + +/// A convenience parser for regular expressions. +/// +/// This parser takes as input a regular expression pattern string (the +/// "concrete syntax") and returns a high-level intermediate representation +/// (the HIR) suitable for most types of analysis. In particular, this parser +/// hides the intermediate state of producing an AST (the "abstract syntax"). +/// The AST is itself far more complex than the HIR, so this parser serves as a +/// convenience for never having to deal with it at all. +/// +/// If callers have more fine grained use cases that need an AST, then please +/// see the [`ast::parse`] module. +/// +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. +#[derive(Clone, Debug)] +pub struct Parser { + ast: ast::parse::Parser, + hir: hir::translate::Translator, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with `parse` method. The parse method returns + /// a high level intermediate representation of the given regular + /// expression. + /// + /// To set configuration options on the parser, use [`ParserBuilder`]. + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into a high level intermediate + /// representation. + pub fn parse(&mut self, pattern: &str) -> Result { + let ast = self.ast.parse(pattern)?; + let hir = self.hir.translate(pattern, &ast)?; + Ok(hir) + } +} diff --git a/vendor/regex-syntax/src/rank.rs b/vendor/regex-syntax/src/rank.rs new file mode 100644 index 0000000..ccb25a2 --- /dev/null +++ b/vendor/regex-syntax/src/rank.rs @@ -0,0 +1,258 @@ +pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/vendor/regex-syntax/src/unicode.rs b/vendor/regex-syntax/src/unicode.rs new file mode 100644 index 0000000..393a4c0 --- /dev/null +++ b/vendor/regex-syntax/src/unicode.rs @@ -0,0 +1,1039 @@ +use alloc::{ + string::{String, ToString}, + vec::Vec, +}; + +use crate::hir; + +/// An inclusive range of codepoints from a generated file (hence the static +/// lifetime). +type Range = &'static [(char, char)]; + +/// An error that occurs when dealing with Unicode. +/// +/// We don't impl the Error trait here because these always get converted +/// into other public errors. (This error type isn't exported.) +#[derive(Debug)] +pub enum Error { + PropertyNotFound, + PropertyValueNotFound, + // Not used when unicode-perl is enabled. + #[allow(dead_code)] + PerlClassNotFound, +} + +/// An error that occurs when Unicode-aware simple case folding fails. +/// +/// This error can occur when the case mapping tables necessary for Unicode +/// aware case folding are unavailable. This only occurs when the +/// `unicode-case` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct CaseFoldError(()); + +#[cfg(feature = "std")] +impl std::error::Error for CaseFoldError {} + +impl core::fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "Unicode-aware case folding is not available \ + (probably because the unicode-case feature is not enabled)" + ) + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. This only occurs when the +/// `unicode-perl` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct UnicodeWordError(()); + +#[cfg(feature = "std")] +impl std::error::Error for UnicodeWordError {} + +impl core::fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "Unicode-aware \\w class is not available \ + (probably because the unicode-perl feature is not enabled)" + ) + } +} + +/// A state oriented traverser of the simple case folding table. +/// +/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will +/// return an error if the underlying case folding table is unavailable. +/// +/// After construction, it is expected that callers will use +/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly +/// increasing order. For example, calling it on `b` and then on `a` is illegal +/// and will result in a panic. +/// +/// The main idea of this type is that it tries hard to make mapping lookups +/// fast by exploiting the structure of the underlying table, and the ordering +/// assumption enables this. +#[derive(Debug)] +pub struct SimpleCaseFolder { + /// The simple case fold table. It's a sorted association list, where the + /// keys are Unicode scalar values and the values are the corresponding + /// equivalence class (not including the key) of the "simple" case folded + /// Unicode scalar values. + table: &'static [(char, &'static [char])], + /// The last codepoint that was used for a lookup. + last: Option, + /// The index to the entry in `table` corresponding to the smallest key `k` + /// such that `k > k0`, where `k0` is the most recent key lookup. Note that + /// in particular, `k0` may not be in the table! + next: usize, +} + +impl SimpleCaseFolder { + /// Create a new simple case folder, returning an error if the underlying + /// case folding table is unavailable. + pub fn new() -> Result { + #[cfg(not(feature = "unicode-case"))] + { + Err(CaseFoldError(())) + } + #[cfg(feature = "unicode-case")] + { + Ok(SimpleCaseFolder { + table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, + last: None, + next: 0, + }) + } + } + + /// Return the equivalence class of case folded codepoints for the given + /// codepoint. The equivalence class returned never includes the codepoint + /// given. If the given codepoint has no case folded codepoints (i.e., + /// no entry in the underlying case folding table), then this returns an + /// empty slice. + /// + /// # Panics + /// + /// This panics when called with a `c` that is less than or equal to the + /// previous call. In other words, callers need to use this method with + /// strictly increasing values of `c`. + pub fn mapping(&mut self, c: char) -> &'static [char] { + if let Some(last) = self.last { + assert!( + last < c, + "got codepoint U+{:X} which occurs before \ + last codepoint U+{:X}", + u32::from(c), + u32::from(last), + ); + } + self.last = Some(c); + if self.next >= self.table.len() { + return &[]; + } + let (k, v) = self.table[self.next]; + if k == c { + self.next += 1; + return v; + } + match self.get(c) { + Err(i) => { + self.next = i; + &[] + } + Ok(i) => { + // Since we require lookups to proceed + // in order, anything we find should be + // after whatever we thought might be + // next. Otherwise, the caller is either + // going out of order or we would have + // found our next key at 'self.next'. + assert!(i > self.next); + self.next = i + 1; + self.table[i].1 + } + } + } + + /// Returns true if and only if the given range overlaps with any region + /// of the underlying case folding table. That is, when true, there exists + /// at least one codepoint in the inclusive range `[start, end]` that has + /// a non-trivial equivalence class of case folded codepoints. Conversely, + /// when this returns false, all codepoints in the range `[start, end]` + /// correspond to the trivial equivalence class of case folded codepoints, + /// i.e., itself. + /// + /// This is useful to call before iterating over the codepoints in the + /// range and looking up the mapping for each. If you know none of the + /// mappings will return anything, then you might be able to skip doing it + /// altogether. + /// + /// # Panics + /// + /// This panics when `end < start`. + pub fn overlaps(&self, start: char, end: char) -> bool { + use core::cmp::Ordering; + + assert!(start <= end); + self.table + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok() + } + + /// Returns the index at which `c` occurs in the simple case fold table. If + /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < + /// c` and `table[i].0 > c`. + fn get(&self, c: char) -> Result { + self.table.binary_search_by_key(&c, |&(c1, _)| c1) + } +} + +/// A query for finding a character class defined by Unicode. This supports +/// either use of a property name directly, or lookup by property value. The +/// former generally refers to Binary properties (see UTS#44, Table 8), but +/// as a special exception (see UTS#18, Section 1.2) both general categories +/// (an enumeration) and scripts (a catalog) are supported as if each of their +/// possible values were a binary property. +/// +/// In all circumstances, property names and values are normalized and +/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. +/// +/// The lifetime `'a` refers to the shorter of the lifetimes of property name +/// and property value. +#[derive(Debug)] +pub enum ClassQuery<'a> { + /// Return a class corresponding to a Unicode binary property, named by + /// a single letter. + OneLetter(char), + /// Return a class corresponding to a Unicode binary property. + /// + /// Note that, by special exception (see UTS#18, Section 1.2), both + /// general category values and script values are permitted here as if + /// they were a binary property. + Binary(&'a str), + /// Return a class corresponding to all codepoints whose property + /// (identified by `property_name`) corresponds to the given value + /// (identified by `property_value`). + ByValue { + /// A property name. + property_name: &'a str, + /// A property value. + property_value: &'a str, + }, +} + +impl<'a> ClassQuery<'a> { + fn canonicalize(&self) -> Result { + match *self { + ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), + ClassQuery::Binary(name) => self.canonical_binary(name), + ClassQuery::ByValue { property_name, property_value } => { + let property_name = symbolic_name_normalize(property_name); + let property_value = symbolic_name_normalize(property_value); + + let canon_name = match canonical_prop(&property_name)? { + None => return Err(Error::PropertyNotFound), + Some(canon_name) => canon_name, + }; + Ok(match canon_name { + "General_Category" => { + let canon = match canonical_gencat(&property_value)? { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::GeneralCategory(canon) + } + "Script" => { + let canon = match canonical_script(&property_value)? { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::Script(canon) + } + _ => { + let vals = match property_values(canon_name)? { + None => return Err(Error::PropertyValueNotFound), + Some(vals) => vals, + }; + let canon_val = + match canonical_value(vals, &property_value) { + None => { + return Err(Error::PropertyValueNotFound) + } + Some(canon_val) => canon_val, + }; + CanonicalClassQuery::ByValue { + property_name: canon_name, + property_value: canon_val, + } + } + }) + } + } + } + + fn canonical_binary( + &self, + name: &str, + ) -> Result { + let norm = symbolic_name_normalize(name); + + // This is a special case where 'cf' refers to the 'Format' general + // category, but where the 'cf' abbreviation is also an abbreviation + // for the 'Case_Folding' property. But we want to treat it as + // a general category. (Currently, we don't even support the + // 'Case_Folding' property. But if we do in the future, users will be + // required to spell it out.) + // + // Also 'sc' refers to the 'Currency_Symbol' general category, but is + // also the abbreviation for the 'Script' property. So we avoid calling + // 'canonical_prop' for it too, which would erroneously normalize it + // to 'Script'. + // + // Another case: 'lc' is an abbreviation for the 'Cased_Letter' + // general category, but is also an abbreviation for the 'Lowercase_Mapping' + // property. We don't currently support the latter, so as with 'cf' + // above, we treat 'lc' as 'Cased_Letter'. + if norm != "cf" && norm != "sc" && norm != "lc" { + if let Some(canon) = canonical_prop(&norm)? { + return Ok(CanonicalClassQuery::Binary(canon)); + } + } + if let Some(canon) = canonical_gencat(&norm)? { + return Ok(CanonicalClassQuery::GeneralCategory(canon)); + } + if let Some(canon) = canonical_script(&norm)? { + return Ok(CanonicalClassQuery::Script(canon)); + } + Err(Error::PropertyNotFound) + } +} + +/// Like ClassQuery, but its parameters have been canonicalized. This also +/// differentiates binary properties from flattened general categories and +/// scripts. +#[derive(Debug, Eq, PartialEq)] +enum CanonicalClassQuery { + /// The canonical binary property name. + Binary(&'static str), + /// The canonical general category name. + GeneralCategory(&'static str), + /// The canonical script name. + Script(&'static str), + /// An arbitrary association between property and value, both of which + /// have been canonicalized. + /// + /// Note that by construction, the property name of ByValue will never + /// be General_Category or Script. Those two cases are subsumed by the + /// eponymous variants. + ByValue { + /// The canonical property name. + property_name: &'static str, + /// The canonical property value. + property_value: &'static str, + }, +} + +/// Looks up a Unicode class given a query. If one doesn't exist, then +/// `None` is returned. +pub fn class(query: ClassQuery<'_>) -> Result { + use self::CanonicalClassQuery::*; + + match query.canonicalize()? { + Binary(name) => bool_property(name), + GeneralCategory(name) => gencat(name), + Script(name) => script(name), + ByValue { property_name: "Age", property_value } => { + let mut class = hir::ClassUnicode::empty(); + for set in ages(property_value)? { + class.union(&hir_class(set)); + } + Ok(class) + } + ByValue { property_name: "Script_Extensions", property_value } => { + script_extension(property_value) + } + ByValue { + property_name: "Grapheme_Cluster_Break", + property_value, + } => gcb(property_value), + ByValue { property_name: "Sentence_Break", property_value } => { + sb(property_value) + } + ByValue { property_name: "Word_Break", property_value } => { + wb(property_value) + } + _ => { + // What else should we support? + Err(Error::PropertyNotFound) + } + } +} + +/// Returns a Unicode aware class for \w. +/// +/// This returns an error if the data is not available for \w. +pub fn perl_word() -> Result { + #[cfg(not(feature = "unicode-perl"))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(feature = "unicode-perl")] + fn imp() -> Result { + use crate::unicode_tables::perl_word::PERL_WORD; + Ok(hir_class(PERL_WORD)) + } + + imp() +} + +/// Returns a Unicode aware class for \s. +/// +/// This returns an error if the data is not available for \s. +pub fn perl_space() -> Result { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] + fn imp() -> Result { + use crate::unicode_tables::perl_space::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + #[cfg(feature = "unicode-bool")] + fn imp() -> Result { + use crate::unicode_tables::property_bool::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + imp() +} + +/// Returns a Unicode aware class for \d. +/// +/// This returns an error if the data is not available for \d. +pub fn perl_digit() -> Result { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] + fn imp() -> Result { + use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + #[cfg(feature = "unicode-gencat")] + fn imp() -> Result { + use crate::unicode_tables::general_category::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + imp() +} + +/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. +pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { + let hir_ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::ClassUnicode::new(hir_ranges) +} + +/// Returns true only if the given codepoint is in the `\w` character class. +/// +/// If the `unicode-perl` feature is not enabled, then this returns an error. +pub fn is_word_character(c: char) -> Result { + #[cfg(not(feature = "unicode-perl"))] + fn imp(_: char) -> Result { + Err(UnicodeWordError(())) + } + + #[cfg(feature = "unicode-perl")] + fn imp(c: char) -> Result { + use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; + + if u8::try_from(c).map_or(false, is_word_byte) { + return Ok(true); + } + Ok(PERL_WORD + .binary_search_by(|&(start, end)| { + use core::cmp::Ordering; + + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(c) +} + +/// A mapping of property values for a specific property. +/// +/// The first element of each tuple is a normalized property value while the +/// second element of each tuple is the corresponding canonical property +/// value. +type PropertyValues = &'static [(&'static str, &'static str)]; + +fn canonical_gencat( + normalized_value: &str, +) -> Result, Error> { + Ok(match normalized_value { + "any" => Some("Any"), + "assigned" => Some("Assigned"), + "ascii" => Some("ASCII"), + _ => { + let gencats = property_values("General_Category")?.unwrap(); + canonical_value(gencats, normalized_value) + } + }) +} + +fn canonical_script( + normalized_value: &str, +) -> Result, Error> { + let scripts = property_values("Script")?.unwrap(); + Ok(canonical_value(scripts, normalized_value)) +} + +/// Find the canonical property name for the given normalized property name. +/// +/// If no such property exists, then `None` is returned. +/// +/// The normalized property name must have been normalized according to +/// UAX44 LM3, which can be done using `symbolic_name_normalize`. +/// +/// If the property names data is not available, then an error is returned. +fn canonical_prop( + normalized_name: &str, +) -> Result, Error> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &str) -> Result, Error> { + Err(Error::PropertyNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &str) -> Result, Error> { + use crate::unicode_tables::property_names::PROPERTY_NAMES; + + Ok(PROPERTY_NAMES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_NAMES[i].1)) + } + + imp(normalized_name) +} + +/// Find the canonical property value for the given normalized property +/// value. +/// +/// The given property values should correspond to the values for the property +/// under question, which can be found using `property_values`. +/// +/// If no such property value exists, then `None` is returned. +/// +/// The normalized property value must have been normalized according to +/// UAX44 LM3, which can be done using `symbolic_name_normalize`. +fn canonical_value( + vals: PropertyValues, + normalized_value: &str, +) -> Option<&'static str> { + vals.binary_search_by_key(&normalized_value, |&(n, _)| n) + .ok() + .map(|i| vals[i].1) +} + +/// Return the table of property values for the given property name. +/// +/// If the property values data is not available, then an error is returned. +fn property_values( + canonical_property_name: &'static str, +) -> Result, Error> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &'static str) -> Result, Error> { + Err(Error::PropertyValueNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &'static str) -> Result, Error> { + use crate::unicode_tables::property_values::PROPERTY_VALUES; + + Ok(PROPERTY_VALUES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_VALUES[i].1)) + } + + imp(canonical_property_name) +} + +// This is only used in some cases, but small enough to just let it be dead +// instead of figuring out (and maintaining) the right set of features. +#[allow(dead_code)] +fn property_set( + name_map: &'static [(&'static str, Range)], + canonical: &'static str, +) -> Option { + name_map + .binary_search_by_key(&canonical, |x| x.0) + .ok() + .map(|i| name_map[i].1) +} + +/// Returns an iterator over Unicode Age sets. Each item corresponds to a set +/// of codepoints that were added in a particular revision of Unicode. The +/// iterator yields items in chronological order. +/// +/// If the given age value isn't valid or if the data isn't available, then an +/// error is returned instead. +fn ages(canonical_age: &str) -> Result, Error> { + #[cfg(not(feature = "unicode-age"))] + fn imp(_: &str) -> Result, Error> { + use core::option::IntoIter; + Err::, _>(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-age")] + fn imp(canonical_age: &str) -> Result, Error> { + use crate::unicode_tables::age; + + const AGES: &[(&str, Range)] = &[ + ("V1_1", age::V1_1), + ("V2_0", age::V2_0), + ("V2_1", age::V2_1), + ("V3_0", age::V3_0), + ("V3_1", age::V3_1), + ("V3_2", age::V3_2), + ("V4_0", age::V4_0), + ("V4_1", age::V4_1), + ("V5_0", age::V5_0), + ("V5_1", age::V5_1), + ("V5_2", age::V5_2), + ("V6_0", age::V6_0), + ("V6_1", age::V6_1), + ("V6_2", age::V6_2), + ("V6_3", age::V6_3), + ("V7_0", age::V7_0), + ("V8_0", age::V8_0), + ("V9_0", age::V9_0), + ("V10_0", age::V10_0), + ("V11_0", age::V11_0), + ("V12_0", age::V12_0), + ("V12_1", age::V12_1), + ("V13_0", age::V13_0), + ("V14_0", age::V14_0), + ("V15_0", age::V15_0), + ]; + assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); + + let pos = AGES.iter().position(|&(age, _)| canonical_age == age); + match pos { + None => Err(Error::PropertyValueNotFound), + Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), + } + } + + imp(canonical_age) +} + +/// Returns the Unicode HIR class corresponding to the given general category. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given general category could not be found, or if the general +/// category data is not available, then an error is returned. +fn gencat(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-gencat"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-gencat")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::general_category::BY_NAME; + match name { + "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), + "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), + "Assigned" => { + let mut cls = gencat("Unassigned")?; + cls.negate(); + Ok(cls) + } + name => property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound), + } + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given script. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script could not be found, or if the script data is not +/// available, then an error is returned. +fn script(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::script::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given script extension. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script extension could not be found, or if the script data is +/// not available, then an error is returned. +fn script_extension( + canonical_name: &'static str, +) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::script_extension::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given Unicode boolean +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given boolean property could not be found, or if the boolean +/// property data is not available, then an error is returned. +fn bool_property( + canonical_name: &'static str, +) -> Result { + #[cfg(not(feature = "unicode-bool"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-bool")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::property_bool::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + "White_Space" => perl_space(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given grapheme cluster +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn gcb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::grapheme_cluster_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given word break +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn wb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::word_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given sentence +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn sb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::sentence_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Like symbolic_name_normalize_bytes, but operates on a string. +fn symbolic_name_normalize(x: &str) -> String { + let mut tmp = x.as_bytes().to_vec(); + let len = symbolic_name_normalize_bytes(&mut tmp).len(); + tmp.truncate(len); + // This should always succeed because `symbolic_name_normalize_bytes` + // guarantees that `&tmp[..len]` is always valid UTF-8. + // + // N.B. We could avoid the additional UTF-8 check here, but it's unlikely + // to be worth skipping the additional safety check. A benchmark must + // justify it first. + String::from_utf8(tmp).unwrap() +} + +/// Normalize the given symbolic name in place according to UAX44-LM3. +/// +/// A "symbolic name" typically corresponds to property names and property +/// value aliases. Note, though, that it should not be applied to property +/// string values. +/// +/// The slice returned is guaranteed to be valid UTF-8 for all possible values +/// of `slice`. +/// +/// See: https://unicode.org/reports/tr44/#UAX44-LM3 +fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { + // I couldn't find a place in the standard that specified that property + // names/aliases had a particular structure (unlike character names), but + // we assume that it's ASCII only and drop anything that isn't ASCII. + let mut start = 0; + let mut starts_with_is = false; + if slice.len() >= 2 { + // Ignore any "is" prefix. + starts_with_is = slice[0..2] == b"is"[..] + || slice[0..2] == b"IS"[..] + || slice[0..2] == b"iS"[..] + || slice[0..2] == b"Is"[..]; + if starts_with_is { + start = 2; + } + } + let mut next_write = 0; + for i in start..slice.len() { + // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid + // UTF-8, we ensure that the slice contains only ASCII bytes. In + // particular, we drop every non-ASCII byte from the normalized string. + let b = slice[i]; + if b == b' ' || b == b'_' || b == b'-' { + continue; + } else if b'A' <= b && b <= b'Z' { + slice[next_write] = b + (b'a' - b'A'); + next_write += 1; + } else if b <= 0x7F { + slice[next_write] = b; + next_write += 1; + } + } + // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally + // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross + // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it + // is actually an alias for the 'Other' general category. + if starts_with_is && next_write == 1 && slice[0] == b'c' { + slice[0] = b'i'; + slice[1] = b's'; + slice[2] = b'c'; + next_write = 3; + } + &mut slice[..next_write] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(feature = "unicode-case")] + fn simple_fold_ok(c: char) -> impl Iterator { + SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() + } + + #[cfg(feature = "unicode-case")] + fn contains_case_map(start: char, end: char) -> bool { + SimpleCaseFolder::new().unwrap().overlaps(start, end) + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_k() { + let xs: Vec = simple_fold_ok('k').collect(); + assert_eq!(xs, alloc::vec!['K', 'K']); + + let xs: Vec = simple_fold_ok('K').collect(); + assert_eq!(xs, alloc::vec!['k', 'K']); + + let xs: Vec = simple_fold_ok('K').collect(); + assert_eq!(xs, alloc::vec!['K', 'k']); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_a() { + let xs: Vec = simple_fold_ok('a').collect(); + assert_eq!(xs, alloc::vec!['A']); + + let xs: Vec = simple_fold_ok('A').collect(); + assert_eq!(xs, alloc::vec!['a']); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn simple_fold_disabled() { + assert!(SimpleCaseFolder::new().is_err()); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn range_contains() { + assert!(contains_case_map('A', 'A')); + assert!(contains_case_map('Z', 'Z')); + assert!(contains_case_map('A', 'Z')); + assert!(contains_case_map('@', 'A')); + assert!(contains_case_map('Z', '[')); + assert!(contains_case_map('☃', 'Ⰰ')); + + assert!(!contains_case_map('[', '[')); + assert!(!contains_case_map('[', '`')); + + assert!(!contains_case_map('☃', '☃')); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn regression_466() { + use super::{CanonicalClassQuery, ClassQuery}; + + let q = ClassQuery::OneLetter('C'); + assert_eq!( + q.canonicalize().unwrap(), + CanonicalClassQuery::GeneralCategory("Other") + ); + } + + #[test] + fn sym_normalize() { + let sym_norm = symbolic_name_normalize; + + assert_eq!(sym_norm("Line_Break"), "linebreak"); + assert_eq!(sym_norm("Line-break"), "linebreak"); + assert_eq!(sym_norm("linebreak"), "linebreak"); + assert_eq!(sym_norm("BA"), "ba"); + assert_eq!(sym_norm("ba"), "ba"); + assert_eq!(sym_norm("Greek"), "greek"); + assert_eq!(sym_norm("isGreek"), "greek"); + assert_eq!(sym_norm("IS_Greek"), "greek"); + assert_eq!(sym_norm("isc"), "isc"); + assert_eq!(sym_norm("is c"), "isc"); + assert_eq!(sym_norm("is_c"), "isc"); + } + + #[test] + fn valid_utf8_symbolic() { + let mut x = b"abc\xFFxyz".to_vec(); + let y = symbolic_name_normalize_bytes(&mut x); + assert_eq!(y, b"abcxyz"); + } +} diff --git a/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE b/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE new file mode 100644 index 0000000..b82826b --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE @@ -0,0 +1,57 @@ +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +Unicode Data Files include all data files under the directories +http://www.unicode.org/Public/, http://www.unicode.org/reports/, +http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and +http://www.unicode.org/utility/trac/browser/. + +Unicode Data Files do not include PDF online code charts under the +directory http://www.unicode.org/Public/. + +Software includes any source code published in the Unicode Standard +or under the directories +http://www.unicode.org/Public/, http://www.unicode.org/reports/, +http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and +http://www.unicode.org/utility/trac/browser/. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2018 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in http://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. diff --git a/vendor/regex-syntax/src/unicode_tables/age.rs b/vendor/regex-syntax/src/unicode_tables/age.rs new file mode 100644 index 0000000..71f4861 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/age.rs @@ -0,0 +1,1791 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate age ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("V10_0", V10_0), + ("V11_0", V11_0), + ("V12_0", V12_0), + ("V12_1", V12_1), + ("V13_0", V13_0), + ("V14_0", V14_0), + ("V15_0", V15_0), + ("V1_1", V1_1), + ("V2_0", V2_0), + ("V2_1", V2_1), + ("V3_0", V3_0), + ("V3_1", V3_1), + ("V3_2", V3_2), + ("V4_0", V4_0), + ("V4_1", V4_1), + ("V5_0", V5_0), + ("V5_1", V5_1), + ("V5_2", V5_2), + ("V6_0", V6_0), + ("V6_1", V6_1), + ("V6_2", V6_2), + ("V6_3", V6_3), + ("V7_0", V7_0), + ("V8_0", V8_0), + ("V9_0", V9_0), +]; + +pub const V10_0: &'static [(char, char)] = &[ + ('ࡠ', 'ࡪ'), + ('ৼ', '৽'), + ('\u{afa}', '\u{aff}'), + ('\u{d00}', '\u{d00}'), + ('\u{d3b}', '\u{d3c}'), + ('᳷', '᳷'), + ('\u{1df6}', '\u{1df9}'), + ('₿', '₿'), + ('⏿', '⏿'), + ('⯒', '⯒'), + ('⹅', '⹉'), + ('ㄮ', 'ㄮ'), + ('鿖', '鿪'), + ('𐌭', '𐌯'), + ('𑨀', '\u{11a47}'), + ('𑩐', '𑪃'), + ('𑪆', '𑪜'), + ('𑪞', '𑪢'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𖿡', '𖿡'), + ('𛀂', '𛄞'), + ('𛅰', '𛋻'), + ('🉠', '🉥'), + ('🛓', '🛔'), + ('🛷', '🛸'), + ('🤀', '🤋'), + ('🤟', '🤟'), + ('🤨', '🤯'), + ('🤱', '🤲'), + ('🥌', '🥌'), + ('🥟', '🥫'), + ('🦒', '🦗'), + ('🧐', '🧦'), + ('𬺰', '𮯠'), +]; + +pub const V11_0: &'static [(char, char)] = &[ + ('ՠ', 'ՠ'), + ('ֈ', 'ֈ'), + ('ׯ', 'ׯ'), + ('\u{7fd}', '߿'), + ('\u{8d3}', '\u{8d3}'), + ('\u{9fe}', '\u{9fe}'), + ('੶', '੶'), + ('\u{c04}', '\u{c04}'), + ('಄', '಄'), + ('ᡸ', 'ᡸ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('⮺', '⮼'), + ('⯓', '⯫'), + ('⯰', '⯾'), + ('⹊', '⹎'), + ('ㄯ', 'ㄯ'), + ('鿫', '鿯'), + ('ꞯ', 'ꞯ'), + ('Ꞹ', 'ꞹ'), + ('ꣾ', '\u{a8ff}'), + ('𐨴', '𐨵'), + ('𐩈', '𐩈'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐼀', '𐼧'), + ('𐼰', '𐽙'), + ('\u{110cd}', '\u{110cd}'), + ('𑅄', '𑅆'), + ('\u{1133b}', '\u{1133b}'), + ('\u{1145e}', '\u{1145e}'), + ('𑜚', '𑜚'), + ('𑠀', '𑠻'), + ('𑪝', '𑪝'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻸'), + ('𖹀', '𖺚'), + ('𘟭', '𘟱'), + ('𝋠', '𝋳'), + ('𝍲', '𝍸'), + ('𞱱', '𞲴'), + ('🄯', '🄯'), + ('🛹', '🛹'), + ('🟕', '🟘'), + ('🥍', '🥏'), + ('🥬', '🥰'), + ('🥳', '🥶'), + ('🥺', '🥺'), + ('🥼', '🥿'), + ('🦘', '🦢'), + ('🦰', '🦹'), + ('🧁', '🧂'), + ('🧧', '🧿'), + ('🩠', '🩭'), +]; + +pub const V12_0: &'static [(char, char)] = &[ + ('౷', '౷'), + ('ຆ', 'ຆ'), + ('ຉ', 'ຉ'), + ('ຌ', 'ຌ'), + ('ຎ', 'ຓ'), + ('ຘ', 'ຘ'), + ('ຠ', 'ຠ'), + ('ຨ', 'ຩ'), + ('ຬ', 'ຬ'), + ('\u{eba}', '\u{eba}'), + ('ᳺ', 'ᳺ'), + ('⯉', '⯉'), + ('⯿', '⯿'), + ('⹏', '⹏'), + ('Ꞻ', 'ꞿ'), + ('Ꟃ', 'Ᶎ'), + ('ꭦ', 'ꭧ'), + ('𐿠', '𐿶'), + ('𑑟', '𑑟'), + ('𑚸', '𑚸'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧤'), + ('𑪄', '𑪅'), + ('𑿀', '𑿱'), + ('𑿿', '𑿿'), + ('\u{13430}', '\u{13438}'), + ('𖽅', '𖽊'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽿', '𖾇'), + ('𖿢', '𖿣'), + ('𘟲', '𘟷'), + ('𛅐', '𛅒'), + ('𛅤', '𛅧'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅏'), + ('𞋀', '𞋹'), + ('𞋿', '𞋿'), + ('𞥋', '𞥋'), + ('𞴁', '𞴽'), + ('🅬', '🅬'), + ('🛕', '🛕'), + ('🛺', '🛺'), + ('🟠', '🟫'), + ('🤍', '🤏'), + ('🤿', '🤿'), + ('🥱', '🥱'), + ('🥻', '🥻'), + ('🦥', '🦪'), + ('🦮', '🦯'), + ('🦺', '🦿'), + ('🧃', '🧊'), + ('🧍', '🧏'), + ('🨀', '🩓'), + ('🩰', '🩳'), + ('🩸', '🩺'), + ('🪀', '🪂'), + ('🪐', '🪕'), +]; + +pub const V12_1: &'static [(char, char)] = &[('㋿', '㋿')]; + +pub const V13_0: &'static [(char, char)] = &[ + ('ࢾ', 'ࣇ'), + ('\u{b55}', '\u{b55}'), + ('ഄ', 'ഄ'), + ('\u{d81}', '\u{d81}'), + ('\u{1abf}', '\u{1ac0}'), + ('⮗', '⮗'), + ('⹐', '⹒'), + ('ㆻ', 'ㆿ'), + ('䶶', '䶿'), + ('鿰', '鿼'), + ('Ꟈ', 'ꟊ'), + ('Ꟶ', 'ꟶ'), + ('\u{a82c}', '\u{a82c}'), + ('ꭨ', '꭫'), + ('𐆜', '𐆜'), + ('𐺀', '𐺩'), + ('\u{10eab}', '𐺭'), + ('𐺰', '𐺱'), + ('𐾰', '𐿋'), + ('𑅇', '𑅇'), + ('𑇎', '\u{111cf}'), + ('𑑚', '𑑚'), + ('𑑠', '𑑡'), + ('𑤀', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '𑥆'), + ('𑥐', '𑥙'), + ('𑾰', '𑾰'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𘫳', '𘳕'), + ('𘴀', '𘴈'), + ('🄍', '🄏'), + ('🅭', '🅯'), + ('🆭', '🆭'), + ('🛖', '🛗'), + ('🛻', '🛼'), + ('🢰', '🢱'), + ('🤌', '🤌'), + ('🥲', '🥲'), + ('🥷', '🥸'), + ('🦣', '🦤'), + ('🦫', '🦭'), + ('🧋', '🧋'), + ('🩴', '🩴'), + ('🪃', '🪆'), + ('🪖', '🪨'), + ('🪰', '🪶'), + ('🫀', '🫂'), + ('🫐', '🫖'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('𪛗', '𪛝'), + ('𰀀', '𱍊'), +]; + +pub const V14_0: &'static [(char, char)] = &[ + ('؝', '؝'), + ('ࡰ', 'ࢎ'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{89f}'), + ('ࢵ', 'ࢵ'), + ('ࣈ', '\u{8d2}'), + ('\u{c3c}', '\u{c3c}'), + ('ౝ', 'ౝ'), + ('ೝ', 'ೝ'), + ('ᜍ', 'ᜍ'), + ('᜕', '᜕'), + ('ᜟ', 'ᜟ'), + ('\u{180f}', '\u{180f}'), + ('\u{1ac1}', '\u{1ace}'), + ('ᭌ', 'ᭌ'), + ('᭽', '᭾'), + ('\u{1dfa}', '\u{1dfa}'), + ('⃀', '⃀'), + ('Ⱟ', 'Ⱟ'), + ('ⱟ', 'ⱟ'), + ('⹓', '⹝'), + ('鿽', '鿿'), + ('Ꟁ', 'ꟁ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟴ'), + ('﯂', '﯂'), + ('﵀', '﵏'), + ('﷏', '﷏'), + ('﷾', '﷿'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐽰', '𐾉'), + ('\u{11070}', '𑁵'), + ('\u{110c2}', '\u{110c2}'), + ('𑚹', '𑚹'), + ('𑝀', '𑝆'), + ('𑪰', '𑪿'), + ('𒾐', '𒿲'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛄟', '𛄢'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('𜽐', '𜿃'), + ('𝇩', '𝇪'), + ('𝼀', '𝼞'), + ('𞊐', '\u{1e2ae}'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('🛝', '🛟'), + ('🟰', '🟰'), + ('🥹', '🥹'), + ('🧌', '🧌'), + ('🩻', '🩼'), + ('🪩', '🪬'), + ('🪷', '🪺'), + ('🫃', '🫅'), + ('🫗', '🫙'), + ('🫠', '🫧'), + ('🫰', '🫶'), + ('𪛞', '𪛟'), + ('𫜵', '𫜸'), +]; + +pub const V15_0: &'static [(char, char)] = &[ + ('ೳ', 'ೳ'), + ('\u{ece}', '\u{ece}'), + ('\u{10efd}', '\u{10eff}'), + ('𑈿', '\u{11241}'), + ('𑬀', '𑬉'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '𑽙'), + ('𓐯', '𓐯'), + ('\u{13439}', '\u{13455}'), + ('𛄲', '𛄲'), + ('𛅕', '𛅕'), + ('𝋀', '𝋓'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞓐', '𞓹'), + ('🛜', '🛜'), + ('🝴', '🝶'), + ('🝻', '🝿'), + ('🟙', '🟙'), + ('🩵', '🩷'), + ('🪇', '🪈'), + ('🪭', '🪯'), + ('🪻', '🪽'), + ('🪿', '🪿'), + ('🫎', '🫏'), + ('🫚', '🫛'), + ('🫨', '🫨'), + ('🫷', '🫸'), + ('𫜹', '𫜹'), + ('𱍐', '𲎯'), +]; + +pub const V1_1: &'static [(char, char)] = &[ + ('\0', 'ǵ'), + ('Ǻ', 'ȗ'), + ('ɐ', 'ʨ'), + ('ʰ', '˞'), + ('ˠ', '˩'), + ('\u{300}', '\u{345}'), + ('\u{360}', '\u{361}'), + ('ʹ', '͵'), + ('ͺ', 'ͺ'), + (';', ';'), + ('΄', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ώ'), + ('ϐ', 'ϖ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'ϳ'), + ('Ё', 'Ќ'), + ('Ў', 'я'), + ('ё', 'ќ'), + ('ў', '\u{486}'), + ('Ґ', 'ӄ'), + ('Ӈ', 'ӈ'), + ('Ӌ', 'ӌ'), + ('Ӑ', 'ӫ'), + ('Ӯ', 'ӵ'), + ('Ӹ', 'ӹ'), + ('Ա', 'Ֆ'), + ('ՙ', '՟'), + ('ա', 'և'), + ('։', '։'), + ('\u{5b0}', '\u{5b9}'), + ('\u{5bb}', '׃'), + ('א', 'ת'), + ('װ', '״'), + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('ء', 'غ'), + ('ـ', '\u{652}'), + ('٠', '٭'), + ('\u{670}', 'ڷ'), + ('ں', 'ھ'), + ('ۀ', 'ێ'), + ('ې', '\u{6ed}'), + ('۰', '۹'), + ('\u{901}', 'ः'), + ('अ', 'ह'), + ('\u{93c}', '\u{94d}'), + ('ॐ', '\u{954}'), + ('क़', '॰'), + ('\u{981}', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', '৺'), + ('\u{a02}', '\u{a02}'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', 'ੴ'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઋ'), + ('ઍ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૠ'), + ('૦', '૯'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଶ', 'ହ'), + ('\u{b3c}', '\u{b43}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b56}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('୦', '୰'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'வ'), + ('ஷ', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('௧', '௲'), + ('ఁ', 'ః'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'ళ'), + ('వ', 'హ'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౠ', 'ౡ'), + ('౦', '౯'), + ('ಂ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೞ', 'ೞ'), + ('ೠ', 'ೡ'), + ('೦', '೯'), + ('ം', 'ഃ'), + ('അ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ന'), + ('പ', 'ഹ'), + ('\u{d3e}', '\u{d43}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('ൠ', 'ൡ'), + ('൦', '൯'), + ('ก', '\u{e3a}'), + ('฿', '๛'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), + ('ດ', 'ທ'), + ('ນ', 'ຟ'), + ('ມ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ວ'), + ('ສ', 'ຫ'), + ('ອ', '\u{eb9}'), + ('\u{ebb}', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ecd}'), + ('໐', '໙'), + ('ໜ', 'ໝ'), + ('Ⴀ', 'Ⴥ'), + ('ა', 'ჶ'), + ('჻', '჻'), + ('ᄀ', 'ᅙ'), + ('ᅟ', 'ᆢ'), + ('ᆨ', 'ᇹ'), + ('Ḁ', 'ẚ'), + ('Ạ', 'ỹ'), + ('ἀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('\u{2000}', '\u{202e}'), + ('‰', '⁆'), + ('\u{206a}', '⁰'), + ('⁴', '₎'), + ('₠', '₪'), + ('\u{20d0}', '\u{20e1}'), + ('℀', 'ℸ'), + ('⅓', 'ↂ'), + ('←', '⇪'), + ('∀', '⋱'), + ('⌀', '⌀'), + ('⌂', '⍺'), + ('␀', '␤'), + ('⑀', '⑊'), + ('①', '⓪'), + ('─', '▕'), + ('■', '◯'), + ('☀', '☓'), + ('☚', '♯'), + ('✁', '✄'), + ('✆', '✉'), + ('✌', '✧'), + ('✩', '❋'), + ('❍', '❍'), + ('❏', '❒'), + ('❖', '❖'), + ('❘', '❞'), + ('❡', '❧'), + ('❶', '➔'), + ('➘', '➯'), + ('➱', '➾'), + ('\u{3000}', '〷'), + ('〿', '〿'), + ('ぁ', 'ゔ'), + ('\u{3099}', 'ゞ'), + ('ァ', 'ヾ'), + ('ㄅ', 'ㄬ'), + ('ㄱ', 'ㆎ'), + ('㆐', '㆟'), + ('㈀', '㈜'), + ('㈠', '㉃'), + ('㉠', '㉻'), + ('㉿', '㊰'), + ('㋀', '㋋'), + ('㋐', '㋾'), + ('㌀', '㍶'), + ('㍻', '㏝'), + ('㏠', '㏾'), + ('一', '龥'), + ('\u{e000}', '鶴'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('\u{fb1e}', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', '﴿'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe20}', '\u{fe23}'), + ('︰', '﹄'), + ('﹉', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('ﹰ', 'ﹲ'), + ('ﹴ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('\u{feff}', '\u{feff}'), + ('!', '~'), + ('。', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('¢', '₩'), + ('│', '○'), + ('�', '\u{ffff}'), +]; + +pub const V2_0: &'static [(char, char)] = &[ + ('\u{591}', '\u{5a1}'), + ('\u{5a3}', '\u{5af}'), + ('\u{5c4}', '\u{5c4}'), + ('ༀ', 'ཇ'), + ('ཉ', 'ཀྵ'), + ('\u{f71}', 'ྋ'), + ('\u{f90}', '\u{f95}'), + ('\u{f97}', '\u{f97}'), + ('\u{f99}', '\u{fad}'), + ('\u{fb1}', '\u{fb7}'), + ('\u{fb9}', '\u{fb9}'), + ('ẛ', 'ẛ'), + ('₫', '₫'), + ('가', '힣'), + ('\u{1fffe}', '\u{1ffff}'), + ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), + ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), + ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), + ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), + ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), + ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), + ('\u{efffe}', '\u{10ffff}'), +]; + +pub const V2_1: &'static [(char, char)] = &[('€', '€'), ('', '')]; + +pub const V3_0: &'static [(char, char)] = &[ + ('Ƕ', 'ǹ'), + ('Ș', 'ȟ'), + ('Ȣ', 'ȳ'), + ('ʩ', 'ʭ'), + ('˟', '˟'), + ('˪', 'ˮ'), + ('\u{346}', '\u{34e}'), + ('\u{362}', '\u{362}'), + ('ϗ', 'ϗ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('Ѐ', 'Ѐ'), + ('Ѝ', 'Ѝ'), + ('ѐ', 'ѐ'), + ('ѝ', 'ѝ'), + ('\u{488}', '\u{489}'), + ('Ҍ', 'ҏ'), + ('Ӭ', 'ӭ'), + ('֊', '֊'), + ('\u{653}', '\u{655}'), + ('ڸ', 'ڹ'), + ('ڿ', 'ڿ'), + ('ۏ', 'ۏ'), + ('ۺ', '۾'), + ('܀', '܍'), + ('\u{70f}', 'ܬ'), + ('\u{730}', '\u{74a}'), + ('ހ', '\u{7b0}'), + ('ං', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', '෴'), + ('ཪ', 'ཪ'), + ('\u{f96}', '\u{f96}'), + ('\u{fae}', '\u{fb0}'), + ('\u{fb8}', '\u{fb8}'), + ('\u{fba}', '\u{fbc}'), + ('྾', '࿌'), + ('࿏', '࿏'), + ('က', 'အ'), + ('ဣ', 'ဧ'), + ('ဩ', 'ဪ'), + ('ာ', '\u{1032}'), + ('\u{1036}', '\u{1039}'), + ('၀', '\u{1059}'), + ('ሀ', 'ሆ'), + ('ለ', 'ቆ'), + ('ቈ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኆ'), + ('ኈ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኮ'), + ('ኰ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዎ'), + ('ዐ', 'ዖ'), + ('ዘ', 'ዮ'), + ('ደ', 'ጎ'), + ('ጐ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ጞ'), + ('ጠ', 'ፆ'), + ('ፈ', 'ፚ'), + ('፡', '፼'), + ('Ꭰ', 'Ᏼ'), + ('ᐁ', 'ᙶ'), + ('\u{1680}', '᚜'), + ('ᚠ', 'ᛰ'), + ('ក', 'ៜ'), + ('០', '៩'), + ('᠀', '\u{180e}'), + ('᠐', '᠙'), + ('ᠠ', 'ᡷ'), + ('ᢀ', '\u{18a9}'), + ('\u{202f}', '\u{202f}'), + ('⁈', '⁍'), + ('₭', '₯'), + ('\u{20e2}', '\u{20e3}'), + ('ℹ', '℺'), + ('Ↄ', 'Ↄ'), + ('⇫', '⇳'), + ('⌁', '⌁'), + ('⍻', '⍻'), + ('⍽', '⎚'), + ('␥', '␦'), + ('◰', '◷'), + ('☙', '☙'), + ('♰', '♱'), + ('⠀', '⣿'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('〸', '〺'), + ('〾', '〾'), + ('ㆠ', 'ㆷ'), + ('㐀', '䶵'), + ('ꀀ', 'ꒌ'), + ('꒐', '꒡'), + ('꒤', '꒳'), + ('꒵', '꓀'), + ('꓂', '꓄'), + ('꓆', '꓆'), + ('יִ', 'יִ'), + ('\u{fff9}', '\u{fffb}'), +]; + +pub const V3_1: &'static [(char, char)] = &[ + ('ϴ', 'ϵ'), + ('\u{fdd0}', '\u{fdef}'), + ('𐌀', '𐌞'), + ('𐌠', '𐌣'), + ('𐌰', '𐍊'), + ('𐐀', '𐐥'), + ('𐐨', '𐑍'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄪', '𝇝'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓀'), + ('𝓂', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚣'), + ('𝚨', '𝟉'), + ('𝟎', '𝟿'), + ('𠀀', '𪛖'), + ('丽', '𪘀'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const V3_2: &'static [(char, char)] = &[ + ('Ƞ', 'Ƞ'), + ('\u{34f}', '\u{34f}'), + ('\u{363}', '\u{36f}'), + ('Ϙ', 'ϙ'), + ('϶', '϶'), + ('Ҋ', 'ҋ'), + ('Ӆ', 'ӆ'), + ('Ӊ', 'ӊ'), + ('Ӎ', 'ӎ'), + ('Ԁ', 'ԏ'), + ('ٮ', 'ٯ'), + ('ޱ', 'ޱ'), + ('ჷ', 'ჸ'), + ('ᜀ', 'ᜌ'), + ('ᜎ', '\u{1714}'), + ('ᜠ', '᜶'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('⁇', '⁇'), + ('⁎', '⁒'), + ('⁗', '⁗'), + ('\u{205f}', '\u{2063}'), + ('ⁱ', 'ⁱ'), + ('₰', '₱'), + ('\u{20e4}', '\u{20ea}'), + ('ℽ', '⅋'), + ('⇴', '⇿'), + ('⋲', '⋿'), + ('⍼', '⍼'), + ('⎛', '⏎'), + ('⓫', '⓾'), + ('▖', '▟'), + ('◸', '◿'), + ('☖', '☗'), + ('♲', '♽'), + ('⚀', '⚉'), + ('❨', '❵'), + ('⟐', '⟫'), + ('⟰', '⟿'), + ('⤀', '⫿'), + ('〻', '〽'), + ('ゕ', 'ゖ'), + ('ゟ', '゠'), + ('ヿ', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㉑', '㉟'), + ('㊱', '㊿'), + ('꒢', '꒣'), + ('꒴', '꒴'), + ('꓁', '꓁'), + ('꓅', '꓅'), + ('侮', '頻'), + ('﷼', '﷼'), + ('\u{fe00}', '\u{fe0f}'), + ('﹅', '﹆'), + ('ﹳ', 'ﹳ'), + ('⦅', '⦆'), +]; + +pub const V4_0: &'static [(char, char)] = &[ + ('ȡ', 'ȡ'), + ('ȴ', 'ȶ'), + ('ʮ', 'ʯ'), + ('˯', '˿'), + ('\u{350}', '\u{357}'), + ('\u{35d}', '\u{35f}'), + ('Ϸ', 'ϻ'), + ('\u{600}', '\u{603}'), + ('؍', '\u{615}'), + ('\u{656}', '\u{658}'), + ('ۮ', 'ۯ'), + ('ۿ', 'ۿ'), + ('ܭ', 'ܯ'), + ('ݍ', 'ݏ'), + ('ऄ', 'ऄ'), + ('ঽ', 'ঽ'), + ('\u{a01}', '\u{a01}'), + ('ਃ', 'ਃ'), + ('ઌ', 'ઌ'), + ('ૡ', '\u{ae3}'), + ('૱', '૱'), + ('ଵ', 'ଵ'), + ('ୱ', 'ୱ'), + ('௳', '௺'), + ('\u{cbc}', 'ಽ'), + ('\u{17dd}', '\u{17dd}'), + ('៰', '៹'), + ('ᤀ', 'ᤜ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥀', '᥀'), + ('᥄', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('᧠', '᧿'), + ('ᴀ', 'ᵫ'), + ('⁓', '⁔'), + ('℻', '℻'), + ('⏏', '⏐'), + ('⓿', '⓿'), + ('☔', '☕'), + ('⚊', '⚑'), + ('⚠', '⚡'), + ('⬀', '⬍'), + ('㈝', '㈞'), + ('㉐', '㉐'), + ('㉼', '㉽'), + ('㋌', '㋏'), + ('㍷', '㍺'), + ('㏞', '㏟'), + ('㏿', '㏿'), + ('䷀', '䷿'), + ('﷽', '﷽'), + ('﹇', '﹈'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), + ('𐎀', '𐎝'), + ('𐎟', '𐎟'), + ('𐐦', '𐐧'), + ('𐑎', '𐒝'), + ('𐒠', '𐒩'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐠿'), + ('𝌀', '𝍖'), + ('𝓁', '𝓁'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const V4_1: &'static [(char, char)] = &[ + ('ȷ', 'Ɂ'), + ('\u{358}', '\u{35c}'), + ('ϼ', 'Ͽ'), + ('Ӷ', 'ӷ'), + ('\u{5a2}', '\u{5a2}'), + ('\u{5c5}', '\u{5c7}'), + ('؋', '؋'), + ('؞', '؞'), + ('\u{659}', '\u{65e}'), + ('ݐ', 'ݭ'), + ('ॽ', 'ॽ'), + ('ৎ', 'ৎ'), + ('ஶ', 'ஶ'), + ('௦', '௦'), + ('࿐', '࿑'), + ('ჹ', 'ჺ'), + ('ჼ', 'ჼ'), + ('ሇ', 'ሇ'), + ('ቇ', 'ቇ'), + ('ኇ', 'ኇ'), + ('ኯ', 'ኯ'), + ('ዏ', 'ዏ'), + ('ዯ', 'ዯ'), + ('ጏ', 'ጏ'), + ('ጟ', 'ጟ'), + ('ፇ', 'ፇ'), + ('\u{135f}', '፠'), + ('ᎀ', '᎙'), + ('ᦀ', 'ᦩ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('᧞', '᧟'), + ('ᨀ', '\u{1a1b}'), + ('᨞', '᨟'), + ('ᵬ', '\u{1dc3}'), + ('⁕', '⁖'), + ('⁘', '⁞'), + ('ₐ', 'ₔ'), + ('₲', '₵'), + ('\u{20eb}', '\u{20eb}'), + ('ℼ', 'ℼ'), + ('⅌', '⅌'), + ('⏑', '⏛'), + ('☘', '☘'), + ('♾', '♿'), + ('⚒', '⚜'), + ('⚢', '⚱'), + ('⟀', '⟆'), + ('⬎', '⬓'), + ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), + ('Ⲁ', '⳪'), + ('⳹', 'ⴥ'), + ('ⴰ', 'ⵥ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('⸀', '⸗'), + ('⸜', '⸝'), + ('㇀', '㇏'), + ('㉾', '㉾'), + ('龦', '龻'), + ('꜀', '꜖'), + ('ꠀ', '꠫'), + ('並', '龎'), + ('︐', '︙'), + ('𐅀', '𐆊'), + ('𐎠', '𐏃'), + ('𐏈', '𐏕'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨳'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '𐩇'), + ('𐩐', '𐩘'), + ('𝈀', '𝉅'), + ('𝚤', '𝚥'), +]; + +pub const V5_0: &'static [(char, char)] = &[ + ('ɂ', 'ɏ'), + ('ͻ', 'ͽ'), + ('ӏ', 'ӏ'), + ('Ӻ', 'ӿ'), + ('Ԑ', 'ԓ'), + ('\u{5ba}', '\u{5ba}'), + ('߀', 'ߺ'), + ('ॻ', 'ॼ'), + ('ॾ', 'ॿ'), + ('\u{ce2}', '\u{ce3}'), + ('ೱ', 'ೲ'), + ('\u{1b00}', 'ᭋ'), + ('᭐', '᭼'), + ('\u{1dc4}', '\u{1dca}'), + ('\u{1dfe}', '\u{1dff}'), + ('\u{20ec}', '\u{20ef}'), + ('⅍', 'ⅎ'), + ('ↄ', 'ↄ'), + ('⏜', '⏧'), + ('⚲', '⚲'), + ('⟇', '⟊'), + ('⬔', '⬚'), + ('⬠', '⬣'), + ('Ⱡ', 'ⱬ'), + ('ⱴ', 'ⱷ'), + ('ꜗ', 'ꜚ'), + ('꜠', '꜡'), + ('ꡀ', '꡷'), + ('𐤀', '𐤙'), + ('𐤟', '𐤟'), + ('𒀀', '𒍮'), + ('𒐀', '𒑢'), + ('𒑰', '𒑳'), + ('𝍠', '𝍱'), + ('𝟊', '𝟋'), +]; + +pub const V5_1: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('Ϗ', 'Ϗ'), + ('\u{487}', '\u{487}'), + ('Ԕ', 'ԣ'), + ('؆', '؊'), + ('\u{616}', '\u{61a}'), + ('ػ', 'ؿ'), + ('ݮ', 'ݿ'), + ('ॱ', 'ॲ'), + ('\u{a51}', '\u{a51}'), + ('\u{a75}', '\u{a75}'), + ('\u{b44}', '\u{b44}'), + ('\u{b62}', '\u{b63}'), + ('ௐ', 'ௐ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౙ'), + ('\u{c62}', '\u{c63}'), + ('౸', '౿'), + ('ഽ', 'ഽ'), + ('\u{d44}', '\u{d44}'), + ('\u{d62}', '\u{d63}'), + ('൰', '൵'), + ('൹', 'ൿ'), + ('ཫ', 'ཬ'), + ('࿎', '࿎'), + ('࿒', '࿔'), + ('ဢ', 'ဢ'), + ('ဨ', 'ဨ'), + ('ါ', 'ါ'), + ('\u{1033}', '\u{1035}'), + ('\u{103a}', 'ဿ'), + ('ၚ', '႙'), + ('႞', '႟'), + ('ᢪ', 'ᢪ'), + ('\u{1b80}', '᮪'), + ('ᮮ', '᮹'), + ('ᰀ', '\u{1c37}'), + ('᰻', '᱉'), + ('ᱍ', '᱿'), + ('\u{1dcb}', '\u{1de6}'), + ('ẜ', 'ẟ'), + ('Ỻ', 'ỿ'), + ('\u{2064}', '\u{2064}'), + ('\u{20f0}', '\u{20f0}'), + ('⅏', '⅏'), + ('ↅ', 'ↈ'), + ('⚝', '⚝'), + ('⚳', '⚼'), + ('⛀', '⛃'), + ('⟌', '⟌'), + ('⟬', '⟯'), + ('⬛', '⬟'), + ('⬤', '⭌'), + ('⭐', '⭔'), + ('Ɑ', 'Ɐ'), + ('ⱱ', 'ⱳ'), + ('ⱸ', 'ⱽ'), + ('\u{2de0}', '\u{2dff}'), + ('⸘', '⸛'), + ('⸞', '⸰'), + ('ㄭ', 'ㄭ'), + ('㇐', '㇣'), + ('龼', '鿃'), + ('ꔀ', 'ꘫ'), + ('Ꙁ', 'ꙟ'), + ('Ꙣ', '꙳'), + ('\u{a67c}', 'ꚗ'), + ('ꜛ', 'ꜟ'), + ('Ꜣ', 'ꞌ'), + ('ꟻ', 'ꟿ'), + ('ꢀ', '\u{a8c4}'), + ('꣎', '꣙'), + ('꤀', '꥓'), + ('꥟', '꥟'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('꩜', '꩟'), + ('\u{fe24}', '\u{fe26}'), + ('𐆐', '𐆛'), + ('𐇐', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐤠', '𐤹'), + ('𐤿', '𐤿'), + ('𝄩', '𝄩'), + ('🀀', '🀫'), + ('🀰', '🂓'), +]; + +pub const V5_2: &'static [(char, char)] = &[ + ('Ԥ', 'ԥ'), + ('ࠀ', '\u{82d}'), + ('࠰', '࠾'), + ('\u{900}', '\u{900}'), + ('ॎ', 'ॎ'), + ('\u{955}', '\u{955}'), + ('ॹ', 'ॺ'), + ('৻', '৻'), + ('࿕', '࿘'), + ('ႚ', '\u{109d}'), + ('ᅚ', 'ᅞ'), + ('ᆣ', 'ᆧ'), + ('ᇺ', 'ᇿ'), + ('᐀', '᐀'), + ('ᙷ', 'ᙿ'), + ('ᢰ', 'ᣵ'), + ('ᦪ', 'ᦫ'), + ('᧚', '᧚'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), + ('\u{1cd0}', 'ᳲ'), + ('\u{1dfd}', '\u{1dfd}'), + ('₶', '₸'), + ('⅐', '⅒'), + ('↉', '↉'), + ('⏨', '⏨'), + ('⚞', '⚟'), + ('⚽', '⚿'), + ('⛄', '⛍'), + ('⛏', '⛡'), + ('⛣', '⛣'), + ('⛨', '⛿'), + ('❗', '❗'), + ('⭕', '⭙'), + ('Ɒ', 'Ɒ'), + ('Ȿ', 'Ɀ'), + ('Ⳬ', '\u{2cf1}'), + ('⸱', '⸱'), + ('㉄', '㉏'), + ('鿄', '鿋'), + ('ꓐ', '꓿'), + ('ꚠ', '꛷'), + ('꠰', '꠹'), + ('\u{a8e0}', 'ꣻ'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧍'), + ('ꧏ', '꧙'), + ('꧞', '꧟'), + ('ꩠ', 'ꩻ'), + ('ꪀ', 'ꫂ'), + ('ꫛ', '꫟'), + ('ꯀ', '\u{abed}'), + ('꯰', '꯹'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('恵', '舘'), + ('𐡀', '𐡕'), + ('𐡗', '𐡟'), + ('𐤚', '𐤛'), + ('𐩠', '𐩿'), + ('𐬀', '𐬵'), + ('𐬹', '𐭕'), + ('𐭘', '𐭲'), + ('𐭸', '𐭿'), + ('𐰀', '𐱈'), + ('𐹠', '𐹾'), + ('\u{11080}', '𑃁'), + ('𓀀', '𓐮'), + ('🄀', '🄊'), + ('🄐', '🄮'), + ('🄱', '🄱'), + ('🄽', '🄽'), + ('🄿', '🄿'), + ('🅂', '🅂'), + ('🅆', '🅆'), + ('🅊', '🅎'), + ('🅗', '🅗'), + ('🅟', '🅟'), + ('🅹', '🅹'), + ('🅻', '🅼'), + ('🅿', '🅿'), + ('🆊', '🆍'), + ('🆐', '🆐'), + ('🈀', '🈀'), + ('🈐', '🈱'), + ('🉀', '🉈'), + ('𪜀', '𫜴'), +]; + +pub const V6_0: &'static [(char, char)] = &[ + ('Ԧ', 'ԧ'), + ('ؠ', 'ؠ'), + ('\u{65f}', '\u{65f}'), + ('ࡀ', '\u{85b}'), + ('࡞', '࡞'), + ('\u{93a}', 'ऻ'), + ('ॏ', 'ॏ'), + ('\u{956}', '\u{957}'), + ('ॳ', 'ॷ'), + ('୲', '୷'), + ('ഩ', 'ഩ'), + ('ഺ', 'ഺ'), + ('ൎ', 'ൎ'), + ('ྌ', '\u{f8f}'), + ('࿙', '࿚'), + ('\u{135d}', '\u{135e}'), + ('ᯀ', '᯳'), + ('᯼', '᯿'), + ('\u{1dfc}', '\u{1dfc}'), + ('ₕ', 'ₜ'), + ('₹', '₹'), + ('⏩', '⏳'), + ('⛎', '⛎'), + ('⛢', '⛢'), + ('⛤', '⛧'), + ('✅', '✅'), + ('✊', '✋'), + ('✨', '✨'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❟', '❠'), + ('➕', '➗'), + ('➰', '➰'), + ('➿', '➿'), + ('⟎', '⟏'), + ('⵰', '⵰'), + ('\u{2d7f}', '\u{2d7f}'), + ('ㆸ', 'ㆺ'), + ('Ꙡ', 'ꙡ'), + ('Ɥ', 'ꞎ'), + ('Ꞑ', 'ꞑ'), + ('Ꞡ', 'ꞩ'), + ('ꟺ', 'ꟺ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('﮲', '﯁'), + ('𑀀', '𑁍'), + ('𑁒', '𑁯'), + ('𖠀', '𖨸'), + ('𛀀', '𛀁'), + ('🂠', '🂮'), + ('🂱', '🂾'), + ('🃁', '🃏'), + ('🃑', '🃟'), + ('🄰', '🄰'), + ('🄲', '🄼'), + ('🄾', '🄾'), + ('🅀', '🅁'), + ('🅃', '🅅'), + ('🅇', '🅉'), + ('🅏', '🅖'), + ('🅘', '🅞'), + ('🅠', '🅩'), + ('🅰', '🅸'), + ('🅺', '🅺'), + ('🅽', '🅾'), + ('🆀', '🆉'), + ('🆎', '🆏'), + ('🆑', '🆚'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈲', '🈺'), + ('🉐', '🉑'), + ('🌀', '🌠'), + ('🌰', '🌵'), + ('🌷', '🍼'), + ('🎀', '🎓'), + ('🎠', '🏄'), + ('🏆', '🏊'), + ('🏠', '🏰'), + ('🐀', '🐾'), + ('👀', '👀'), + ('👂', '📷'), + ('📹', '📼'), + ('🔀', '🔽'), + ('🕐', '🕧'), + ('🗻', '🗿'), + ('😁', '😐'), + ('😒', '😔'), + ('😖', '😖'), + ('😘', '😘'), + ('😚', '😚'), + ('😜', '😞'), + ('😠', '😥'), + ('😨', '😫'), + ('😭', '😭'), + ('😰', '😳'), + ('😵', '🙀'), + ('🙅', '🙏'), + ('🚀', '🛅'), + ('🜀', '🝳'), + ('𫝀', '𫠝'), +]; + +pub const V6_1: &'static [(char, char)] = &[ + ('֏', '֏'), + ('\u{604}', '\u{604}'), + ('ࢠ', 'ࢠ'), + ('ࢢ', 'ࢬ'), + ('\u{8e4}', '\u{8fe}'), + ('૰', '૰'), + ('ໞ', 'ໟ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ჽ', 'ჿ'), + ('\u{1bab}', '\u{1bad}'), + ('ᮺ', 'ᮿ'), + ('᳀', '᳇'), + ('ᳳ', 'ᳶ'), + ('⟋', '⟋'), + ('⟍', '⟍'), + ('Ⳳ', 'ⳳ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⵦ', 'ⵧ'), + ('⸲', '⸻'), + ('鿌', '鿌'), + ('\u{a674}', '\u{a67b}'), + ('\u{a69f}', '\u{a69f}'), + ('Ꞓ', 'ꞓ'), + ('Ɦ', 'Ɦ'), + ('ꟸ', 'ꟹ'), + ('ꫠ', '\u{aaf6}'), + ('郞', '隷'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑅃'), + ('\u{11180}', '𑇈'), + ('𑇐', '𑇙'), + ('𑚀', '\u{116b7}'), + ('𑛀', '𑛉'), + ('𖼀', '𖽄'), + ('𖽐', '𖽾'), + ('\u{16f8f}', '𖾟'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), + ('🅪', '🅫'), + ('🕀', '🕃'), + ('😀', '😀'), + ('😑', '😑'), + ('😕', '😕'), + ('😗', '😗'), + ('😙', '😙'), + ('😛', '😛'), + ('😟', '😟'), + ('😦', '😧'), + ('😬', '😬'), + ('😮', '😯'), + ('😴', '😴'), +]; + +pub const V6_2: &'static [(char, char)] = &[('₺', '₺')]; + +pub const V6_3: &'static [(char, char)] = + &[('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}')]; + +pub const V7_0: &'static [(char, char)] = &[ + ('Ϳ', 'Ϳ'), + ('Ԩ', 'ԯ'), + ('֍', '֎'), + ('\u{605}', '\u{605}'), + ('ࢡ', 'ࢡ'), + ('ࢭ', 'ࢲ'), + ('\u{8ff}', '\u{8ff}'), + ('ॸ', 'ॸ'), + ('ঀ', 'ঀ'), + ('\u{c00}', '\u{c00}'), + ('ఴ', 'ఴ'), + ('\u{c81}', '\u{c81}'), + ('\u{d01}', '\u{d01}'), + ('෦', '෯'), + ('ᛱ', 'ᛸ'), + ('ᤝ', 'ᤞ'), + ('\u{1ab0}', '\u{1abe}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1de7}', '\u{1df5}'), + ('₻', '₽'), + ('⏴', '⏺'), + ('✀', '✀'), + ('⭍', '⭏'), + ('⭚', '⭳'), + ('⭶', '⮕'), + ('⮘', '⮹'), + ('⮽', '⯈'), + ('⯊', '⯑'), + ('⸼', '⹂'), + ('Ꚙ', 'ꚝ'), + ('ꞔ', 'ꞟ'), + ('Ɜ', 'Ɬ'), + ('Ʞ', 'Ʇ'), + ('ꟷ', 'ꟷ'), + ('ꧠ', 'ꧾ'), + ('\u{aa7c}', 'ꩿ'), + ('ꬰ', 'ꭟ'), + ('ꭤ', 'ꭥ'), + ('\u{fe27}', '\u{fe2d}'), + ('𐆋', '𐆌'), + ('𐆠', '𐆠'), + ('\u{102e0}', '𐋻'), + ('𐌟', '𐌟'), + ('𐍐', '\u{1037a}'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕯', '𐕯'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐡠', '𐢞'), + ('𐢧', '𐢯'), + ('𐪀', '𐪟'), + ('𐫀', '\u{10ae6}'), + ('𐫫', '𐫶'), + ('𐮀', '𐮑'), + ('𐮙', '𐮜'), + ('𐮩', '𐮯'), + ('\u{1107f}', '\u{1107f}'), + ('𑅐', '𑅶'), + ('𑇍', '𑇍'), + ('𑇚', '𑇚'), + ('𑇡', '𑇴'), + ('𑈀', '𑈑'), + ('𑈓', '𑈽'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11301}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133c}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑒀', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '𑗉'), + ('𑘀', '𑙄'), + ('𑙐', '𑙙'), + ('𑢠', '𑣲'), + ('𑣿', '𑣿'), + ('𑫀', '𑫸'), + ('𒍯', '𒎘'), + ('𒑣', '𒑮'), + ('𒑴', '𒑴'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩮', '𖩯'), + ('𖫐', '𖫭'), + ('\u{16af0}', '𖫵'), + ('𖬀', '𖭅'), + ('𖭐', '𖭙'), + ('𖭛', '𖭡'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𛲜', '\u{1bca3}'), + ('𞠀', '𞣄'), + ('𞣇', '\u{1e8d6}'), + ('🂿', '🂿'), + ('🃠', '🃵'), + ('🄋', '🄌'), + ('🌡', '🌬'), + ('🌶', '🌶'), + ('🍽', '🍽'), + ('🎔', '🎟'), + ('🏅', '🏅'), + ('🏋', '🏎'), + ('🏔', '🏟'), + ('🏱', '🏷'), + ('🐿', '🐿'), + ('👁', '👁'), + ('📸', '📸'), + ('📽', '📾'), + ('🔾', '🔿'), + ('🕄', '🕊'), + ('🕨', '🕹'), + ('🕻', '🖣'), + ('🖥', '🗺'), + ('🙁', '🙂'), + ('🙐', '🙿'), + ('🛆', '🛏'), + ('🛠', '🛬'), + ('🛰', '🛳'), + ('🞀', '🟔'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), +]; + +pub const V8_0: &'static [(char, char)] = &[ + ('ࢳ', 'ࢴ'), + ('\u{8e3}', '\u{8e3}'), + ('ૹ', 'ૹ'), + ('ౚ', 'ౚ'), + ('ൟ', 'ൟ'), + ('Ᏽ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('₾', '₾'), + ('↊', '↋'), + ('⯬', '⯯'), + ('鿍', '鿕'), + ('\u{a69e}', '\u{a69e}'), + ('ꞏ', 'ꞏ'), + ('Ʝ', 'ꞷ'), + ('꣼', 'ꣽ'), + ('ꭠ', 'ꭣ'), + ('ꭰ', 'ꮿ'), + ('\u{fe2e}', '\u{fe2f}'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐣻', '𐣿'), + ('𐦼', '𐦽'), + ('𐧀', '𐧏'), + ('𐧒', '𐧿'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐳺', '𐳿'), + ('\u{111c9}', '\u{111cc}'), + ('𑇛', '𑇟'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊩'), + ('\u{11300}', '\u{11300}'), + ('𑍐', '𑍐'), + ('𑗊', '\u{115dd}'), + ('𑜀', '𑜙'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜿'), + ('𒎙', '𒎙'), + ('𒒀', '𒕃'), + ('𔐀', '𔙆'), + ('𝇞', '𝇨'), + ('𝠀', '𝪋'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('🌭', '🌯'), + ('🍾', '🍿'), + ('🏏', '🏓'), + ('🏸', '🏿'), + ('📿', '📿'), + ('🕋', '🕏'), + ('🙃', '🙄'), + ('🛐', '🛐'), + ('🤐', '🤘'), + ('🦀', '🦄'), + ('🧀', '🧀'), + ('𫠠', '𬺡'), +]; + +pub const V9_0: &'static [(char, char)] = &[ + ('ࢶ', 'ࢽ'), + ('\u{8d4}', '\u{8e2}'), + ('ಀ', 'ಀ'), + ('൏', '൏'), + ('ൔ', 'ൖ'), + ('൘', '൞'), + ('൶', '൸'), + ('ᲀ', 'ᲈ'), + ('\u{1dfb}', '\u{1dfb}'), + ('⏻', '⏾'), + ('⹃', '⹄'), + ('Ɪ', 'Ɪ'), + ('\u{a8c5}', '\u{a8c5}'), + ('𐆍', '𐆎'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('\u{1123e}', '\u{1123e}'), + ('𑐀', '𑑙'), + ('𑑛', '𑑛'), + ('𑑝', '𑑝'), + ('𑙠', '𑙬'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱅'), + ('𑱐', '𑱬'), + ('𑱰', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𖿠', '𖿠'), + ('𗀀', '𘟬'), + ('𘠀', '𘫲'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞤀', '\u{1e94a}'), + ('𞥐', '𞥙'), + ('𞥞', '𞥟'), + ('🆛', '🆬'), + ('🈻', '🈻'), + ('🕺', '🕺'), + ('🖤', '🖤'), + ('🛑', '🛒'), + ('🛴', '🛶'), + ('🤙', '🤞'), + ('🤠', '🤧'), + ('🤰', '🤰'), + ('🤳', '🤾'), + ('🥀', '🥋'), + ('🥐', '🥞'), + ('🦅', '🦑'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs b/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs new file mode 100644 index 0000000..23f9364 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs @@ -0,0 +1,2888 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple ucd-15.0.0 --chars --all-pairs +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[ + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k', 'K']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s', 'ſ']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K', 'K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S', 'ſ']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('µ', &['Μ', 'μ']), + ('À', &['à']), + ('Á', &['á']), + ('Â', &['â']), + ('Ã', &['ã']), + ('Ä', &['ä']), + ('Å', &['å', 'Å']), + ('Æ', &['æ']), + ('Ç', &['ç']), + ('È', &['è']), + ('É', &['é']), + ('Ê', &['ê']), + ('Ë', &['ë']), + ('Ì', &['ì']), + ('Í', &['í']), + ('Î', &['î']), + ('Ï', &['ï']), + ('Ð', &['ð']), + ('Ñ', &['ñ']), + ('Ò', &['ò']), + ('Ó', &['ó']), + ('Ô', &['ô']), + ('Õ', &['õ']), + ('Ö', &['ö']), + ('Ø', &['ø']), + ('Ù', &['ù']), + ('Ú', &['ú']), + ('Û', &['û']), + ('Ü', &['ü']), + ('Ý', &['ý']), + ('Þ', &['þ']), + ('ß', &['ẞ']), + ('à', &['À']), + ('á', &['Á']), + ('â', &['Â']), + ('ã', &['Ã']), + ('ä', &['Ä']), + ('å', &['Å', 'Å']), + ('æ', &['Æ']), + ('ç', &['Ç']), + ('è', &['È']), + ('é', &['É']), + ('ê', &['Ê']), + ('ë', &['Ë']), + ('ì', &['Ì']), + ('í', &['Í']), + ('î', &['Î']), + ('ï', &['Ï']), + ('ð', &['Ð']), + ('ñ', &['Ñ']), + ('ò', &['Ò']), + ('ó', &['Ó']), + ('ô', &['Ô']), + ('õ', &['Õ']), + ('ö', &['Ö']), + ('ø', &['Ø']), + ('ù', &['Ù']), + ('ú', &['Ú']), + ('û', &['Û']), + ('ü', &['Ü']), + ('ý', &['Ý']), + ('þ', &['Þ']), + ('ÿ', &['Ÿ']), + ('Ā', &['ā']), + ('ā', &['Ā']), + ('Ă', &['ă']), + ('ă', &['Ă']), + ('Ą', &['ą']), + ('ą', &['Ą']), + ('Ć', &['ć']), + ('ć', &['Ć']), + ('Ĉ', &['ĉ']), + ('ĉ', &['Ĉ']), + ('Ċ', &['ċ']), + ('ċ', &['Ċ']), + ('Č', &['č']), + ('č', &['Č']), + ('Ď', &['ď']), + ('ď', &['Ď']), + ('Đ', &['đ']), + ('đ', &['Đ']), + ('Ē', &['ē']), + ('ē', &['Ē']), + ('Ĕ', &['ĕ']), + ('ĕ', &['Ĕ']), + ('Ė', &['ė']), + ('ė', &['Ė']), + ('Ę', &['ę']), + ('ę', &['Ę']), + ('Ě', &['ě']), + ('ě', &['Ě']), + ('Ĝ', &['ĝ']), + ('ĝ', &['Ĝ']), + ('Ğ', &['ğ']), + ('ğ', &['Ğ']), + ('Ġ', &['ġ']), + ('ġ', &['Ġ']), + ('Ģ', &['ģ']), + ('ģ', &['Ģ']), + ('Ĥ', &['ĥ']), + ('ĥ', &['Ĥ']), + ('Ħ', &['ħ']), + ('ħ', &['Ħ']), + ('Ĩ', &['ĩ']), + ('ĩ', &['Ĩ']), + ('Ī', &['ī']), + ('ī', &['Ī']), + ('Ĭ', &['ĭ']), + ('ĭ', &['Ĭ']), + ('Į', &['į']), + ('į', &['Į']), + ('IJ', &['ij']), + ('ij', &['IJ']), + ('Ĵ', &['ĵ']), + ('ĵ', &['Ĵ']), + ('Ķ', &['ķ']), + ('ķ', &['Ķ']), + ('Ĺ', &['ĺ']), + ('ĺ', &['Ĺ']), + ('Ļ', &['ļ']), + ('ļ', &['Ļ']), + ('Ľ', &['ľ']), + ('ľ', &['Ľ']), + ('Ŀ', &['ŀ']), + ('ŀ', &['Ŀ']), + ('Ł', &['ł']), + ('ł', &['Ł']), + ('Ń', &['ń']), + ('ń', &['Ń']), + ('Ņ', &['ņ']), + ('ņ', &['Ņ']), + ('Ň', &['ň']), + ('ň', &['Ň']), + ('Ŋ', &['ŋ']), + ('ŋ', &['Ŋ']), + ('Ō', &['ō']), + ('ō', &['Ō']), + ('Ŏ', &['ŏ']), + ('ŏ', &['Ŏ']), + ('Ő', &['ő']), + ('ő', &['Ő']), + ('Œ', &['œ']), + ('œ', &['Œ']), + ('Ŕ', &['ŕ']), + ('ŕ', &['Ŕ']), + ('Ŗ', &['ŗ']), + ('ŗ', &['Ŗ']), + ('Ř', &['ř']), + ('ř', &['Ř']), + ('Ś', &['ś']), + ('ś', &['Ś']), + ('Ŝ', &['ŝ']), + ('ŝ', &['Ŝ']), + ('Ş', &['ş']), + ('ş', &['Ş']), + ('Š', &['š']), + ('š', &['Š']), + ('Ţ', &['ţ']), + ('ţ', &['Ţ']), + ('Ť', &['ť']), + ('ť', &['Ť']), + ('Ŧ', &['ŧ']), + ('ŧ', &['Ŧ']), + ('Ũ', &['ũ']), + ('ũ', &['Ũ']), + ('Ū', &['ū']), + ('ū', &['Ū']), + ('Ŭ', &['ŭ']), + ('ŭ', &['Ŭ']), + ('Ů', &['ů']), + ('ů', &['Ů']), + ('Ű', &['ű']), + ('ű', &['Ű']), + ('Ų', &['ų']), + ('ų', &['Ų']), + ('Ŵ', &['ŵ']), + ('ŵ', &['Ŵ']), + ('Ŷ', &['ŷ']), + ('ŷ', &['Ŷ']), + ('Ÿ', &['ÿ']), + ('Ź', &['ź']), + ('ź', &['Ź']), + ('Ż', &['ż']), + ('ż', &['Ż']), + ('Ž', &['ž']), + ('ž', &['Ž']), + ('ſ', &['S', 's']), + ('ƀ', &['Ƀ']), + ('Ɓ', &['ɓ']), + ('Ƃ', &['ƃ']), + ('ƃ', &['Ƃ']), + ('Ƅ', &['ƅ']), + ('ƅ', &['Ƅ']), + ('Ɔ', &['ɔ']), + ('Ƈ', &['ƈ']), + ('ƈ', &['Ƈ']), + ('Ɖ', &['ɖ']), + ('Ɗ', &['ɗ']), + ('Ƌ', &['ƌ']), + ('ƌ', &['Ƌ']), + ('Ǝ', &['ǝ']), + ('Ə', &['ə']), + ('Ɛ', &['ɛ']), + ('Ƒ', &['ƒ']), + ('ƒ', &['Ƒ']), + ('Ɠ', &['ɠ']), + ('Ɣ', &['ɣ']), + ('ƕ', &['Ƕ']), + ('Ɩ', &['ɩ']), + ('Ɨ', &['ɨ']), + ('Ƙ', &['ƙ']), + ('ƙ', &['Ƙ']), + ('ƚ', &['Ƚ']), + ('Ɯ', &['ɯ']), + ('Ɲ', &['ɲ']), + ('ƞ', &['Ƞ']), + ('Ɵ', &['ɵ']), + ('Ơ', &['ơ']), + ('ơ', &['Ơ']), + ('Ƣ', &['ƣ']), + ('ƣ', &['Ƣ']), + ('Ƥ', &['ƥ']), + ('ƥ', &['Ƥ']), + ('Ʀ', &['ʀ']), + ('Ƨ', &['ƨ']), + ('ƨ', &['Ƨ']), + ('Ʃ', &['ʃ']), + ('Ƭ', &['ƭ']), + ('ƭ', &['Ƭ']), + ('Ʈ', &['ʈ']), + ('Ư', &['ư']), + ('ư', &['Ư']), + ('Ʊ', &['ʊ']), + ('Ʋ', &['ʋ']), + ('Ƴ', &['ƴ']), + ('ƴ', &['Ƴ']), + ('Ƶ', &['ƶ']), + ('ƶ', &['Ƶ']), + ('Ʒ', &['ʒ']), + ('Ƹ', &['ƹ']), + ('ƹ', &['Ƹ']), + ('Ƽ', &['ƽ']), + ('ƽ', &['Ƽ']), + ('ƿ', &['Ƿ']), + ('DŽ', &['Dž', 'dž']), + ('Dž', &['DŽ', 'dž']), + ('dž', &['DŽ', 'Dž']), + ('LJ', &['Lj', 'lj']), + ('Lj', &['LJ', 'lj']), + ('lj', &['LJ', 'Lj']), + ('NJ', &['Nj', 'nj']), + ('Nj', &['NJ', 'nj']), + ('nj', &['NJ', 'Nj']), + ('Ǎ', &['ǎ']), + ('ǎ', &['Ǎ']), + ('Ǐ', &['ǐ']), + ('ǐ', &['Ǐ']), + ('Ǒ', &['ǒ']), + ('ǒ', &['Ǒ']), + ('Ǔ', &['ǔ']), + ('ǔ', &['Ǔ']), + ('Ǖ', &['ǖ']), + ('ǖ', &['Ǖ']), + ('Ǘ', &['ǘ']), + ('ǘ', &['Ǘ']), + ('Ǚ', &['ǚ']), + ('ǚ', &['Ǚ']), + ('Ǜ', &['ǜ']), + ('ǜ', &['Ǜ']), + ('ǝ', &['Ǝ']), + ('Ǟ', &['ǟ']), + ('ǟ', &['Ǟ']), + ('Ǡ', &['ǡ']), + ('ǡ', &['Ǡ']), + ('Ǣ', &['ǣ']), + ('ǣ', &['Ǣ']), + ('Ǥ', &['ǥ']), + ('ǥ', &['Ǥ']), + ('Ǧ', &['ǧ']), + ('ǧ', &['Ǧ']), + ('Ǩ', &['ǩ']), + ('ǩ', &['Ǩ']), + ('Ǫ', &['ǫ']), + ('ǫ', &['Ǫ']), + ('Ǭ', &['ǭ']), + ('ǭ', &['Ǭ']), + ('Ǯ', &['ǯ']), + ('ǯ', &['Ǯ']), + ('DZ', &['Dz', 'dz']), + ('Dz', &['DZ', 'dz']), + ('dz', &['DZ', 'Dz']), + ('Ǵ', &['ǵ']), + ('ǵ', &['Ǵ']), + ('Ƕ', &['ƕ']), + ('Ƿ', &['ƿ']), + ('Ǹ', &['ǹ']), + ('ǹ', &['Ǹ']), + ('Ǻ', &['ǻ']), + ('ǻ', &['Ǻ']), + ('Ǽ', &['ǽ']), + ('ǽ', &['Ǽ']), + ('Ǿ', &['ǿ']), + ('ǿ', &['Ǿ']), + ('Ȁ', &['ȁ']), + ('ȁ', &['Ȁ']), + ('Ȃ', &['ȃ']), + ('ȃ', &['Ȃ']), + ('Ȅ', &['ȅ']), + ('ȅ', &['Ȅ']), + ('Ȇ', &['ȇ']), + ('ȇ', &['Ȇ']), + ('Ȉ', &['ȉ']), + ('ȉ', &['Ȉ']), + ('Ȋ', &['ȋ']), + ('ȋ', &['Ȋ']), + ('Ȍ', &['ȍ']), + ('ȍ', &['Ȍ']), + ('Ȏ', &['ȏ']), + ('ȏ', &['Ȏ']), + ('Ȑ', &['ȑ']), + ('ȑ', &['Ȑ']), + ('Ȓ', &['ȓ']), + ('ȓ', &['Ȓ']), + ('Ȕ', &['ȕ']), + ('ȕ', &['Ȕ']), + ('Ȗ', &['ȗ']), + ('ȗ', &['Ȗ']), + ('Ș', &['ș']), + ('ș', &['Ș']), + ('Ț', &['ț']), + ('ț', &['Ț']), + ('Ȝ', &['ȝ']), + ('ȝ', &['Ȝ']), + ('Ȟ', &['ȟ']), + ('ȟ', &['Ȟ']), + ('Ƞ', &['ƞ']), + ('Ȣ', &['ȣ']), + ('ȣ', &['Ȣ']), + ('Ȥ', &['ȥ']), + ('ȥ', &['Ȥ']), + ('Ȧ', &['ȧ']), + ('ȧ', &['Ȧ']), + ('Ȩ', &['ȩ']), + ('ȩ', &['Ȩ']), + ('Ȫ', &['ȫ']), + ('ȫ', &['Ȫ']), + ('Ȭ', &['ȭ']), + ('ȭ', &['Ȭ']), + ('Ȯ', &['ȯ']), + ('ȯ', &['Ȯ']), + ('Ȱ', &['ȱ']), + ('ȱ', &['Ȱ']), + ('Ȳ', &['ȳ']), + ('ȳ', &['Ȳ']), + ('Ⱥ', &['ⱥ']), + ('Ȼ', &['ȼ']), + ('ȼ', &['Ȼ']), + ('Ƚ', &['ƚ']), + ('Ⱦ', &['ⱦ']), + ('ȿ', &['Ȿ']), + ('ɀ', &['Ɀ']), + ('Ɂ', &['ɂ']), + ('ɂ', &['Ɂ']), + ('Ƀ', &['ƀ']), + ('Ʉ', &['ʉ']), + ('Ʌ', &['ʌ']), + ('Ɇ', &['ɇ']), + ('ɇ', &['Ɇ']), + ('Ɉ', &['ɉ']), + ('ɉ', &['Ɉ']), + ('Ɋ', &['ɋ']), + ('ɋ', &['Ɋ']), + ('Ɍ', &['ɍ']), + ('ɍ', &['Ɍ']), + ('Ɏ', &['ɏ']), + ('ɏ', &['Ɏ']), + ('ɐ', &['Ɐ']), + ('ɑ', &['Ɑ']), + ('ɒ', &['Ɒ']), + ('ɓ', &['Ɓ']), + ('ɔ', &['Ɔ']), + ('ɖ', &['Ɖ']), + ('ɗ', &['Ɗ']), + ('ə', &['Ə']), + ('ɛ', &['Ɛ']), + ('ɜ', &['Ɜ']), + ('ɠ', &['Ɠ']), + ('ɡ', &['Ɡ']), + ('ɣ', &['Ɣ']), + ('ɥ', &['Ɥ']), + ('ɦ', &['Ɦ']), + ('ɨ', &['Ɨ']), + ('ɩ', &['Ɩ']), + ('ɪ', &['Ɪ']), + ('ɫ', &['Ɫ']), + ('ɬ', &['Ɬ']), + ('ɯ', &['Ɯ']), + ('ɱ', &['Ɱ']), + ('ɲ', &['Ɲ']), + ('ɵ', &['Ɵ']), + ('ɽ', &['Ɽ']), + ('ʀ', &['Ʀ']), + ('ʂ', &['Ʂ']), + ('ʃ', &['Ʃ']), + ('ʇ', &['Ʇ']), + ('ʈ', &['Ʈ']), + ('ʉ', &['Ʉ']), + ('ʊ', &['Ʊ']), + ('ʋ', &['Ʋ']), + ('ʌ', &['Ʌ']), + ('ʒ', &['Ʒ']), + ('ʝ', &['Ʝ']), + ('ʞ', &['Ʞ']), + ('\u{345}', &['Ι', 'ι', 'ι']), + ('Ͱ', &['ͱ']), + ('ͱ', &['Ͱ']), + ('Ͳ', &['ͳ']), + ('ͳ', &['Ͳ']), + ('Ͷ', &['ͷ']), + ('ͷ', &['Ͷ']), + ('ͻ', &['Ͻ']), + ('ͼ', &['Ͼ']), + ('ͽ', &['Ͽ']), + ('Ϳ', &['ϳ']), + ('Ά', &['ά']), + ('Έ', &['έ']), + ('Ή', &['ή']), + ('Ί', &['ί']), + ('Ό', &['ό']), + ('Ύ', &['ύ']), + ('Ώ', &['ώ']), + ('Α', &['α']), + ('Β', &['β', 'ϐ']), + ('Γ', &['γ']), + ('Δ', &['δ']), + ('Ε', &['ε', 'ϵ']), + ('Ζ', &['ζ']), + ('Η', &['η']), + ('Θ', &['θ', 'ϑ', 'ϴ']), + ('Ι', &['\u{345}', 'ι', 'ι']), + ('Κ', &['κ', 'ϰ']), + ('Λ', &['λ']), + ('Μ', &['µ', 'μ']), + ('Ν', &['ν']), + ('Ξ', &['ξ']), + ('Ο', &['ο']), + ('Π', &['π', 'ϖ']), + ('Ρ', &['ρ', 'ϱ']), + ('Σ', &['ς', 'σ']), + ('Τ', &['τ']), + ('Υ', &['υ']), + ('Φ', &['φ', 'ϕ']), + ('Χ', &['χ']), + ('Ψ', &['ψ']), + ('Ω', &['ω', 'Ω']), + ('Ϊ', &['ϊ']), + ('Ϋ', &['ϋ']), + ('ά', &['Ά']), + ('έ', &['Έ']), + ('ή', &['Ή']), + ('ί', &['Ί']), + ('α', &['Α']), + ('β', &['Β', 'ϐ']), + ('γ', &['Γ']), + ('δ', &['Δ']), + ('ε', &['Ε', 'ϵ']), + ('ζ', &['Ζ']), + ('η', &['Η']), + ('θ', &['Θ', 'ϑ', 'ϴ']), + ('ι', &['\u{345}', 'Ι', 'ι']), + ('κ', &['Κ', 'ϰ']), + ('λ', &['Λ']), + ('μ', &['µ', 'Μ']), + ('ν', &['Ν']), + ('ξ', &['Ξ']), + ('ο', &['Ο']), + ('π', &['Π', 'ϖ']), + ('ρ', &['Ρ', 'ϱ']), + ('ς', &['Σ', 'σ']), + ('σ', &['Σ', 'ς']), + ('τ', &['Τ']), + ('υ', &['Υ']), + ('φ', &['Φ', 'ϕ']), + ('χ', &['Χ']), + ('ψ', &['Ψ']), + ('ω', &['Ω', 'Ω']), + ('ϊ', &['Ϊ']), + ('ϋ', &['Ϋ']), + ('ό', &['Ό']), + ('ύ', &['Ύ']), + ('ώ', &['Ώ']), + ('Ϗ', &['ϗ']), + ('ϐ', &['Β', 'β']), + ('ϑ', &['Θ', 'θ', 'ϴ']), + ('ϕ', &['Φ', 'φ']), + ('ϖ', &['Π', 'π']), + ('ϗ', &['Ϗ']), + ('Ϙ', &['ϙ']), + ('ϙ', &['Ϙ']), + ('Ϛ', &['ϛ']), + ('ϛ', &['Ϛ']), + ('Ϝ', &['ϝ']), + ('ϝ', &['Ϝ']), + ('Ϟ', &['ϟ']), + ('ϟ', &['Ϟ']), + ('Ϡ', &['ϡ']), + ('ϡ', &['Ϡ']), + ('Ϣ', &['ϣ']), + ('ϣ', &['Ϣ']), + ('Ϥ', &['ϥ']), + ('ϥ', &['Ϥ']), + ('Ϧ', &['ϧ']), + ('ϧ', &['Ϧ']), + ('Ϩ', &['ϩ']), + ('ϩ', &['Ϩ']), + ('Ϫ', &['ϫ']), + ('ϫ', &['Ϫ']), + ('Ϭ', &['ϭ']), + ('ϭ', &['Ϭ']), + ('Ϯ', &['ϯ']), + ('ϯ', &['Ϯ']), + ('ϰ', &['Κ', 'κ']), + ('ϱ', &['Ρ', 'ρ']), + ('ϲ', &['Ϲ']), + ('ϳ', &['Ϳ']), + ('ϴ', &['Θ', 'θ', 'ϑ']), + ('ϵ', &['Ε', 'ε']), + ('Ϸ', &['ϸ']), + ('ϸ', &['Ϸ']), + ('Ϲ', &['ϲ']), + ('Ϻ', &['ϻ']), + ('ϻ', &['Ϻ']), + ('Ͻ', &['ͻ']), + ('Ͼ', &['ͼ']), + ('Ͽ', &['ͽ']), + ('Ѐ', &['ѐ']), + ('Ё', &['ё']), + ('Ђ', &['ђ']), + ('Ѓ', &['ѓ']), + ('Є', &['є']), + ('Ѕ', &['ѕ']), + ('І', &['і']), + ('Ї', &['ї']), + ('Ј', &['ј']), + ('Љ', &['љ']), + ('Њ', &['њ']), + ('Ћ', &['ћ']), + ('Ќ', &['ќ']), + ('Ѝ', &['ѝ']), + ('Ў', &['ў']), + ('Џ', &['џ']), + ('А', &['а']), + ('Б', &['б']), + ('В', &['в', 'ᲀ']), + ('Г', &['г']), + ('Д', &['д', 'ᲁ']), + ('Е', &['е']), + ('Ж', &['ж']), + ('З', &['з']), + ('И', &['и']), + ('Й', &['й']), + ('К', &['к']), + ('Л', &['л']), + ('М', &['м']), + ('Н', &['н']), + ('О', &['о', 'ᲂ']), + ('П', &['п']), + ('Р', &['р']), + ('С', &['с', 'ᲃ']), + ('Т', &['т', 'ᲄ', 'ᲅ']), + ('У', &['у']), + ('Ф', &['ф']), + ('Х', &['х']), + ('Ц', &['ц']), + ('Ч', &['ч']), + ('Ш', &['ш']), + ('Щ', &['щ']), + ('Ъ', &['ъ', 'ᲆ']), + ('Ы', &['ы']), + ('Ь', &['ь']), + ('Э', &['э']), + ('Ю', &['ю']), + ('Я', &['я']), + ('а', &['А']), + ('б', &['Б']), + ('в', &['В', 'ᲀ']), + ('г', &['Г']), + ('д', &['Д', 'ᲁ']), + ('е', &['Е']), + ('ж', &['Ж']), + ('з', &['З']), + ('и', &['И']), + ('й', &['Й']), + ('к', &['К']), + ('л', &['Л']), + ('м', &['М']), + ('н', &['Н']), + ('о', &['О', 'ᲂ']), + ('п', &['П']), + ('р', &['Р']), + ('с', &['С', 'ᲃ']), + ('т', &['Т', 'ᲄ', 'ᲅ']), + ('у', &['У']), + ('ф', &['Ф']), + ('х', &['Х']), + ('ц', &['Ц']), + ('ч', &['Ч']), + ('ш', &['Ш']), + ('щ', &['Щ']), + ('ъ', &['Ъ', 'ᲆ']), + ('ы', &['Ы']), + ('ь', &['Ь']), + ('э', &['Э']), + ('ю', &['Ю']), + ('я', &['Я']), + ('ѐ', &['Ѐ']), + ('ё', &['Ё']), + ('ђ', &['Ђ']), + ('ѓ', &['Ѓ']), + ('є', &['Є']), + ('ѕ', &['Ѕ']), + ('і', &['І']), + ('ї', &['Ї']), + ('ј', &['Ј']), + ('љ', &['Љ']), + ('њ', &['Њ']), + ('ћ', &['Ћ']), + ('ќ', &['Ќ']), + ('ѝ', &['Ѝ']), + ('ў', &['Ў']), + ('џ', &['Џ']), + ('Ѡ', &['ѡ']), + ('ѡ', &['Ѡ']), + ('Ѣ', &['ѣ', 'ᲇ']), + ('ѣ', &['Ѣ', 'ᲇ']), + ('Ѥ', &['ѥ']), + ('ѥ', &['Ѥ']), + ('Ѧ', &['ѧ']), + ('ѧ', &['Ѧ']), + ('Ѩ', &['ѩ']), + ('ѩ', &['Ѩ']), + ('Ѫ', &['ѫ']), + ('ѫ', &['Ѫ']), + ('Ѭ', &['ѭ']), + ('ѭ', &['Ѭ']), + ('Ѯ', &['ѯ']), + ('ѯ', &['Ѯ']), + ('Ѱ', &['ѱ']), + ('ѱ', &['Ѱ']), + ('Ѳ', &['ѳ']), + ('ѳ', &['Ѳ']), + ('Ѵ', &['ѵ']), + ('ѵ', &['Ѵ']), + ('Ѷ', &['ѷ']), + ('ѷ', &['Ѷ']), + ('Ѹ', &['ѹ']), + ('ѹ', &['Ѹ']), + ('Ѻ', &['ѻ']), + ('ѻ', &['Ѻ']), + ('Ѽ', &['ѽ']), + ('ѽ', &['Ѽ']), + ('Ѿ', &['ѿ']), + ('ѿ', &['Ѿ']), + ('Ҁ', &['ҁ']), + ('ҁ', &['Ҁ']), + ('Ҋ', &['ҋ']), + ('ҋ', &['Ҋ']), + ('Ҍ', &['ҍ']), + ('ҍ', &['Ҍ']), + ('Ҏ', &['ҏ']), + ('ҏ', &['Ҏ']), + ('Ґ', &['ґ']), + ('ґ', &['Ґ']), + ('Ғ', &['ғ']), + ('ғ', &['Ғ']), + ('Ҕ', &['ҕ']), + ('ҕ', &['Ҕ']), + ('Җ', &['җ']), + ('җ', &['Җ']), + ('Ҙ', &['ҙ']), + ('ҙ', &['Ҙ']), + ('Қ', &['қ']), + ('қ', &['Қ']), + ('Ҝ', &['ҝ']), + ('ҝ', &['Ҝ']), + ('Ҟ', &['ҟ']), + ('ҟ', &['Ҟ']), + ('Ҡ', &['ҡ']), + ('ҡ', &['Ҡ']), + ('Ң', &['ң']), + ('ң', &['Ң']), + ('Ҥ', &['ҥ']), + ('ҥ', &['Ҥ']), + ('Ҧ', &['ҧ']), + ('ҧ', &['Ҧ']), + ('Ҩ', &['ҩ']), + ('ҩ', &['Ҩ']), + ('Ҫ', &['ҫ']), + ('ҫ', &['Ҫ']), + ('Ҭ', &['ҭ']), + ('ҭ', &['Ҭ']), + ('Ү', &['ү']), + ('ү', &['Ү']), + ('Ұ', &['ұ']), + ('ұ', &['Ұ']), + ('Ҳ', &['ҳ']), + ('ҳ', &['Ҳ']), + ('Ҵ', &['ҵ']), + ('ҵ', &['Ҵ']), + ('Ҷ', &['ҷ']), + ('ҷ', &['Ҷ']), + ('Ҹ', &['ҹ']), + ('ҹ', &['Ҹ']), + ('Һ', &['һ']), + ('һ', &['Һ']), + ('Ҽ', &['ҽ']), + ('ҽ', &['Ҽ']), + ('Ҿ', &['ҿ']), + ('ҿ', &['Ҿ']), + ('Ӏ', &['ӏ']), + ('Ӂ', &['ӂ']), + ('ӂ', &['Ӂ']), + ('Ӄ', &['ӄ']), + ('ӄ', &['Ӄ']), + ('Ӆ', &['ӆ']), + ('ӆ', &['Ӆ']), + ('Ӈ', &['ӈ']), + ('ӈ', &['Ӈ']), + ('Ӊ', &['ӊ']), + ('ӊ', &['Ӊ']), + ('Ӌ', &['ӌ']), + ('ӌ', &['Ӌ']), + ('Ӎ', &['ӎ']), + ('ӎ', &['Ӎ']), + ('ӏ', &['Ӏ']), + ('Ӑ', &['ӑ']), + ('ӑ', &['Ӑ']), + ('Ӓ', &['ӓ']), + ('ӓ', &['Ӓ']), + ('Ӕ', &['ӕ']), + ('ӕ', &['Ӕ']), + ('Ӗ', &['ӗ']), + ('ӗ', &['Ӗ']), + ('Ә', &['ә']), + ('ә', &['Ә']), + ('Ӛ', &['ӛ']), + ('ӛ', &['Ӛ']), + ('Ӝ', &['ӝ']), + ('ӝ', &['Ӝ']), + ('Ӟ', &['ӟ']), + ('ӟ', &['Ӟ']), + ('Ӡ', &['ӡ']), + ('ӡ', &['Ӡ']), + ('Ӣ', &['ӣ']), + ('ӣ', &['Ӣ']), + ('Ӥ', &['ӥ']), + ('ӥ', &['Ӥ']), + ('Ӧ', &['ӧ']), + ('ӧ', &['Ӧ']), + ('Ө', &['ө']), + ('ө', &['Ө']), + ('Ӫ', &['ӫ']), + ('ӫ', &['Ӫ']), + ('Ӭ', &['ӭ']), + ('ӭ', &['Ӭ']), + ('Ӯ', &['ӯ']), + ('ӯ', &['Ӯ']), + ('Ӱ', &['ӱ']), + ('ӱ', &['Ӱ']), + ('Ӳ', &['ӳ']), + ('ӳ', &['Ӳ']), + ('Ӵ', &['ӵ']), + ('ӵ', &['Ӵ']), + ('Ӷ', &['ӷ']), + ('ӷ', &['Ӷ']), + ('Ӹ', &['ӹ']), + ('ӹ', &['Ӹ']), + ('Ӻ', &['ӻ']), + ('ӻ', &['Ӻ']), + ('Ӽ', &['ӽ']), + ('ӽ', &['Ӽ']), + ('Ӿ', &['ӿ']), + ('ӿ', &['Ӿ']), + ('Ԁ', &['ԁ']), + ('ԁ', &['Ԁ']), + ('Ԃ', &['ԃ']), + ('ԃ', &['Ԃ']), + ('Ԅ', &['ԅ']), + ('ԅ', &['Ԅ']), + ('Ԇ', &['ԇ']), + ('ԇ', &['Ԇ']), + ('Ԉ', &['ԉ']), + ('ԉ', &['Ԉ']), + ('Ԋ', &['ԋ']), + ('ԋ', &['Ԋ']), + ('Ԍ', &['ԍ']), + ('ԍ', &['Ԍ']), + ('Ԏ', &['ԏ']), + ('ԏ', &['Ԏ']), + ('Ԑ', &['ԑ']), + ('ԑ', &['Ԑ']), + ('Ԓ', &['ԓ']), + ('ԓ', &['Ԓ']), + ('Ԕ', &['ԕ']), + ('ԕ', &['Ԕ']), + ('Ԗ', &['ԗ']), + ('ԗ', &['Ԗ']), + ('Ԙ', &['ԙ']), + ('ԙ', &['Ԙ']), + ('Ԛ', &['ԛ']), + ('ԛ', &['Ԛ']), + ('Ԝ', &['ԝ']), + ('ԝ', &['Ԝ']), + ('Ԟ', &['ԟ']), + ('ԟ', &['Ԟ']), + ('Ԡ', &['ԡ']), + ('ԡ', &['Ԡ']), + ('Ԣ', &['ԣ']), + ('ԣ', &['Ԣ']), + ('Ԥ', &['ԥ']), + ('ԥ', &['Ԥ']), + ('Ԧ', &['ԧ']), + ('ԧ', &['Ԧ']), + ('Ԩ', &['ԩ']), + ('ԩ', &['Ԩ']), + ('Ԫ', &['ԫ']), + ('ԫ', &['Ԫ']), + ('Ԭ', &['ԭ']), + ('ԭ', &['Ԭ']), + ('Ԯ', &['ԯ']), + ('ԯ', &['Ԯ']), + ('Ա', &['ա']), + ('Բ', &['բ']), + ('Գ', &['գ']), + ('Դ', &['դ']), + ('Ե', &['ե']), + ('Զ', &['զ']), + ('Է', &['է']), + ('Ը', &['ը']), + ('Թ', &['թ']), + ('Ժ', &['ժ']), + ('Ի', &['ի']), + ('Լ', &['լ']), + ('Խ', &['խ']), + ('Ծ', &['ծ']), + ('Կ', &['կ']), + ('Հ', &['հ']), + ('Ձ', &['ձ']), + ('Ղ', &['ղ']), + ('Ճ', &['ճ']), + ('Մ', &['մ']), + ('Յ', &['յ']), + ('Ն', &['ն']), + ('Շ', &['շ']), + ('Ո', &['ո']), + ('Չ', &['չ']), + ('Պ', &['պ']), + ('Ջ', &['ջ']), + ('Ռ', &['ռ']), + ('Ս', &['ս']), + ('Վ', &['վ']), + ('Տ', &['տ']), + ('Ր', &['ր']), + ('Ց', &['ց']), + ('Ւ', &['ւ']), + ('Փ', &['փ']), + ('Ք', &['ք']), + ('Օ', &['օ']), + ('Ֆ', &['ֆ']), + ('ա', &['Ա']), + ('բ', &['Բ']), + ('գ', &['Գ']), + ('դ', &['Դ']), + ('ե', &['Ե']), + ('զ', &['Զ']), + ('է', &['Է']), + ('ը', &['Ը']), + ('թ', &['Թ']), + ('ժ', &['Ժ']), + ('ի', &['Ի']), + ('լ', &['Լ']), + ('խ', &['Խ']), + ('ծ', &['Ծ']), + ('կ', &['Կ']), + ('հ', &['Հ']), + ('ձ', &['Ձ']), + ('ղ', &['Ղ']), + ('ճ', &['Ճ']), + ('մ', &['Մ']), + ('յ', &['Յ']), + ('ն', &['Ն']), + ('շ', &['Շ']), + ('ո', &['Ո']), + ('չ', &['Չ']), + ('պ', &['Պ']), + ('ջ', &['Ջ']), + ('ռ', &['Ռ']), + ('ս', &['Ս']), + ('վ', &['Վ']), + ('տ', &['Տ']), + ('ր', &['Ր']), + ('ց', &['Ց']), + ('ւ', &['Ւ']), + ('փ', &['Փ']), + ('ք', &['Ք']), + ('օ', &['Օ']), + ('ֆ', &['Ֆ']), + ('Ⴀ', &['ⴀ']), + ('Ⴁ', &['ⴁ']), + ('Ⴂ', &['ⴂ']), + ('Ⴃ', &['ⴃ']), + ('Ⴄ', &['ⴄ']), + ('Ⴅ', &['ⴅ']), + ('Ⴆ', &['ⴆ']), + ('Ⴇ', &['ⴇ']), + ('Ⴈ', &['ⴈ']), + ('Ⴉ', &['ⴉ']), + ('Ⴊ', &['ⴊ']), + ('Ⴋ', &['ⴋ']), + ('Ⴌ', &['ⴌ']), + ('Ⴍ', &['ⴍ']), + ('Ⴎ', &['ⴎ']), + ('Ⴏ', &['ⴏ']), + ('Ⴐ', &['ⴐ']), + ('Ⴑ', &['ⴑ']), + ('Ⴒ', &['ⴒ']), + ('Ⴓ', &['ⴓ']), + ('Ⴔ', &['ⴔ']), + ('Ⴕ', &['ⴕ']), + ('Ⴖ', &['ⴖ']), + ('Ⴗ', &['ⴗ']), + ('Ⴘ', &['ⴘ']), + ('Ⴙ', &['ⴙ']), + ('Ⴚ', &['ⴚ']), + ('Ⴛ', &['ⴛ']), + ('Ⴜ', &['ⴜ']), + ('Ⴝ', &['ⴝ']), + ('Ⴞ', &['ⴞ']), + ('Ⴟ', &['ⴟ']), + ('Ⴠ', &['ⴠ']), + ('Ⴡ', &['ⴡ']), + ('Ⴢ', &['ⴢ']), + ('Ⴣ', &['ⴣ']), + ('Ⴤ', &['ⴤ']), + ('Ⴥ', &['ⴥ']), + ('Ⴧ', &['ⴧ']), + ('Ⴭ', &['ⴭ']), + ('ა', &['Ა']), + ('ბ', &['Ბ']), + ('გ', &['Გ']), + ('დ', &['Დ']), + ('ე', &['Ე']), + ('ვ', &['Ვ']), + ('ზ', &['Ზ']), + ('თ', &['Თ']), + ('ი', &['Ი']), + ('კ', &['Კ']), + ('ლ', &['Ლ']), + ('მ', &['Მ']), + ('ნ', &['Ნ']), + ('ო', &['Ო']), + ('პ', &['Პ']), + ('ჟ', &['Ჟ']), + ('რ', &['Რ']), + ('ს', &['Ს']), + ('ტ', &['Ტ']), + ('უ', &['Უ']), + ('ფ', &['Ფ']), + ('ქ', &['Ქ']), + ('ღ', &['Ღ']), + ('ყ', &['Ყ']), + ('შ', &['Შ']), + ('ჩ', &['Ჩ']), + ('ც', &['Ც']), + ('ძ', &['Ძ']), + ('წ', &['Წ']), + ('ჭ', &['Ჭ']), + ('ხ', &['Ხ']), + ('ჯ', &['Ჯ']), + ('ჰ', &['Ჰ']), + ('ჱ', &['Ჱ']), + ('ჲ', &['Ჲ']), + ('ჳ', &['Ჳ']), + ('ჴ', &['Ჴ']), + ('ჵ', &['Ჵ']), + ('ჶ', &['Ჶ']), + ('ჷ', &['Ჷ']), + ('ჸ', &['Ჸ']), + ('ჹ', &['Ჹ']), + ('ჺ', &['Ჺ']), + ('ჽ', &['Ჽ']), + ('ჾ', &['Ჾ']), + ('ჿ', &['Ჿ']), + ('Ꭰ', &['ꭰ']), + ('Ꭱ', &['ꭱ']), + ('Ꭲ', &['ꭲ']), + ('Ꭳ', &['ꭳ']), + ('Ꭴ', &['ꭴ']), + ('Ꭵ', &['ꭵ']), + ('Ꭶ', &['ꭶ']), + ('Ꭷ', &['ꭷ']), + ('Ꭸ', &['ꭸ']), + ('Ꭹ', &['ꭹ']), + ('Ꭺ', &['ꭺ']), + ('Ꭻ', &['ꭻ']), + ('Ꭼ', &['ꭼ']), + ('Ꭽ', &['ꭽ']), + ('Ꭾ', &['ꭾ']), + ('Ꭿ', &['ꭿ']), + ('Ꮀ', &['ꮀ']), + ('Ꮁ', &['ꮁ']), + ('Ꮂ', &['ꮂ']), + ('Ꮃ', &['ꮃ']), + ('Ꮄ', &['ꮄ']), + ('Ꮅ', &['ꮅ']), + ('Ꮆ', &['ꮆ']), + ('Ꮇ', &['ꮇ']), + ('Ꮈ', &['ꮈ']), + ('Ꮉ', &['ꮉ']), + ('Ꮊ', &['ꮊ']), + ('Ꮋ', &['ꮋ']), + ('Ꮌ', &['ꮌ']), + ('Ꮍ', &['ꮍ']), + ('Ꮎ', &['ꮎ']), + ('Ꮏ', &['ꮏ']), + ('Ꮐ', &['ꮐ']), + ('Ꮑ', &['ꮑ']), + ('Ꮒ', &['ꮒ']), + ('Ꮓ', &['ꮓ']), + ('Ꮔ', &['ꮔ']), + ('Ꮕ', &['ꮕ']), + ('Ꮖ', &['ꮖ']), + ('Ꮗ', &['ꮗ']), + ('Ꮘ', &['ꮘ']), + ('Ꮙ', &['ꮙ']), + ('Ꮚ', &['ꮚ']), + ('Ꮛ', &['ꮛ']), + ('Ꮜ', &['ꮜ']), + ('Ꮝ', &['ꮝ']), + ('Ꮞ', &['ꮞ']), + ('Ꮟ', &['ꮟ']), + ('Ꮠ', &['ꮠ']), + ('Ꮡ', &['ꮡ']), + ('Ꮢ', &['ꮢ']), + ('Ꮣ', &['ꮣ']), + ('Ꮤ', &['ꮤ']), + ('Ꮥ', &['ꮥ']), + ('Ꮦ', &['ꮦ']), + ('Ꮧ', &['ꮧ']), + ('Ꮨ', &['ꮨ']), + ('Ꮩ', &['ꮩ']), + ('Ꮪ', &['ꮪ']), + ('Ꮫ', &['ꮫ']), + ('Ꮬ', &['ꮬ']), + ('Ꮭ', &['ꮭ']), + ('Ꮮ', &['ꮮ']), + ('Ꮯ', &['ꮯ']), + ('Ꮰ', &['ꮰ']), + ('Ꮱ', &['ꮱ']), + ('Ꮲ', &['ꮲ']), + ('Ꮳ', &['ꮳ']), + ('Ꮴ', &['ꮴ']), + ('Ꮵ', &['ꮵ']), + ('Ꮶ', &['ꮶ']), + ('Ꮷ', &['ꮷ']), + ('Ꮸ', &['ꮸ']), + ('Ꮹ', &['ꮹ']), + ('Ꮺ', &['ꮺ']), + ('Ꮻ', &['ꮻ']), + ('Ꮼ', &['ꮼ']), + ('Ꮽ', &['ꮽ']), + ('Ꮾ', &['ꮾ']), + ('Ꮿ', &['ꮿ']), + ('Ᏸ', &['ᏸ']), + ('Ᏹ', &['ᏹ']), + ('Ᏺ', &['ᏺ']), + ('Ᏻ', &['ᏻ']), + ('Ᏼ', &['ᏼ']), + ('Ᏽ', &['ᏽ']), + ('ᏸ', &['Ᏸ']), + ('ᏹ', &['Ᏹ']), + ('ᏺ', &['Ᏺ']), + ('ᏻ', &['Ᏻ']), + ('ᏼ', &['Ᏼ']), + ('ᏽ', &['Ᏽ']), + ('ᲀ', &['В', 'в']), + ('ᲁ', &['Д', 'д']), + ('ᲂ', &['О', 'о']), + ('ᲃ', &['С', 'с']), + ('ᲄ', &['Т', 'т', 'ᲅ']), + ('ᲅ', &['Т', 'т', 'ᲄ']), + ('ᲆ', &['Ъ', 'ъ']), + ('ᲇ', &['Ѣ', 'ѣ']), + ('ᲈ', &['Ꙋ', 'ꙋ']), + ('Ა', &['ა']), + ('Ბ', &['ბ']), + ('Გ', &['გ']), + ('Დ', &['დ']), + ('Ე', &['ე']), + ('Ვ', &['ვ']), + ('Ზ', &['ზ']), + ('Თ', &['თ']), + ('Ი', &['ი']), + ('Კ', &['კ']), + ('Ლ', &['ლ']), + ('Მ', &['მ']), + ('Ნ', &['ნ']), + ('Ო', &['ო']), + ('Პ', &['პ']), + ('Ჟ', &['ჟ']), + ('Რ', &['რ']), + ('Ს', &['ს']), + ('Ტ', &['ტ']), + ('Უ', &['უ']), + ('Ფ', &['ფ']), + ('Ქ', &['ქ']), + ('Ღ', &['ღ']), + ('Ყ', &['ყ']), + ('Შ', &['შ']), + ('Ჩ', &['ჩ']), + ('Ც', &['ც']), + ('Ძ', &['ძ']), + ('Წ', &['წ']), + ('Ჭ', &['ჭ']), + ('Ხ', &['ხ']), + ('Ჯ', &['ჯ']), + ('Ჰ', &['ჰ']), + ('Ჱ', &['ჱ']), + ('Ჲ', &['ჲ']), + ('Ჳ', &['ჳ']), + ('Ჴ', &['ჴ']), + ('Ჵ', &['ჵ']), + ('Ჶ', &['ჶ']), + ('Ჷ', &['ჷ']), + ('Ჸ', &['ჸ']), + ('Ჹ', &['ჹ']), + ('Ჺ', &['ჺ']), + ('Ჽ', &['ჽ']), + ('Ჾ', &['ჾ']), + ('Ჿ', &['ჿ']), + ('ᵹ', &['Ᵹ']), + ('ᵽ', &['Ᵽ']), + ('ᶎ', &['Ᶎ']), + ('Ḁ', &['ḁ']), + ('ḁ', &['Ḁ']), + ('Ḃ', &['ḃ']), + ('ḃ', &['Ḃ']), + ('Ḅ', &['ḅ']), + ('ḅ', &['Ḅ']), + ('Ḇ', &['ḇ']), + ('ḇ', &['Ḇ']), + ('Ḉ', &['ḉ']), + ('ḉ', &['Ḉ']), + ('Ḋ', &['ḋ']), + ('ḋ', &['Ḋ']), + ('Ḍ', &['ḍ']), + ('ḍ', &['Ḍ']), + ('Ḏ', &['ḏ']), + ('ḏ', &['Ḏ']), + ('Ḑ', &['ḑ']), + ('ḑ', &['Ḑ']), + ('Ḓ', &['ḓ']), + ('ḓ', &['Ḓ']), + ('Ḕ', &['ḕ']), + ('ḕ', &['Ḕ']), + ('Ḗ', &['ḗ']), + ('ḗ', &['Ḗ']), + ('Ḙ', &['ḙ']), + ('ḙ', &['Ḙ']), + ('Ḛ', &['ḛ']), + ('ḛ', &['Ḛ']), + ('Ḝ', &['ḝ']), + ('ḝ', &['Ḝ']), + ('Ḟ', &['ḟ']), + ('ḟ', &['Ḟ']), + ('Ḡ', &['ḡ']), + ('ḡ', &['Ḡ']), + ('Ḣ', &['ḣ']), + ('ḣ', &['Ḣ']), + ('Ḥ', &['ḥ']), + ('ḥ', &['Ḥ']), + ('Ḧ', &['ḧ']), + ('ḧ', &['Ḧ']), + ('Ḩ', &['ḩ']), + ('ḩ', &['Ḩ']), + ('Ḫ', &['ḫ']), + ('ḫ', &['Ḫ']), + ('Ḭ', &['ḭ']), + ('ḭ', &['Ḭ']), + ('Ḯ', &['ḯ']), + ('ḯ', &['Ḯ']), + ('Ḱ', &['ḱ']), + ('ḱ', &['Ḱ']), + ('Ḳ', &['ḳ']), + ('ḳ', &['Ḳ']), + ('Ḵ', &['ḵ']), + ('ḵ', &['Ḵ']), + ('Ḷ', &['ḷ']), + ('ḷ', &['Ḷ']), + ('Ḹ', &['ḹ']), + ('ḹ', &['Ḹ']), + ('Ḻ', &['ḻ']), + ('ḻ', &['Ḻ']), + ('Ḽ', &['ḽ']), + ('ḽ', &['Ḽ']), + ('Ḿ', &['ḿ']), + ('ḿ', &['Ḿ']), + ('Ṁ', &['ṁ']), + ('ṁ', &['Ṁ']), + ('Ṃ', &['ṃ']), + ('ṃ', &['Ṃ']), + ('Ṅ', &['ṅ']), + ('ṅ', &['Ṅ']), + ('Ṇ', &['ṇ']), + ('ṇ', &['Ṇ']), + ('Ṉ', &['ṉ']), + ('ṉ', &['Ṉ']), + ('Ṋ', &['ṋ']), + ('ṋ', &['Ṋ']), + ('Ṍ', &['ṍ']), + ('ṍ', &['Ṍ']), + ('Ṏ', &['ṏ']), + ('ṏ', &['Ṏ']), + ('Ṑ', &['ṑ']), + ('ṑ', &['Ṑ']), + ('Ṓ', &['ṓ']), + ('ṓ', &['Ṓ']), + ('Ṕ', &['ṕ']), + ('ṕ', &['Ṕ']), + ('Ṗ', &['ṗ']), + ('ṗ', &['Ṗ']), + ('Ṙ', &['ṙ']), + ('ṙ', &['Ṙ']), + ('Ṛ', &['ṛ']), + ('ṛ', &['Ṛ']), + ('Ṝ', &['ṝ']), + ('ṝ', &['Ṝ']), + ('Ṟ', &['ṟ']), + ('ṟ', &['Ṟ']), + ('Ṡ', &['ṡ', 'ẛ']), + ('ṡ', &['Ṡ', 'ẛ']), + ('Ṣ', &['ṣ']), + ('ṣ', &['Ṣ']), + ('Ṥ', &['ṥ']), + ('ṥ', &['Ṥ']), + ('Ṧ', &['ṧ']), + ('ṧ', &['Ṧ']), + ('Ṩ', &['ṩ']), + ('ṩ', &['Ṩ']), + ('Ṫ', &['ṫ']), + ('ṫ', &['Ṫ']), + ('Ṭ', &['ṭ']), + ('ṭ', &['Ṭ']), + ('Ṯ', &['ṯ']), + ('ṯ', &['Ṯ']), + ('Ṱ', &['ṱ']), + ('ṱ', &['Ṱ']), + ('Ṳ', &['ṳ']), + ('ṳ', &['Ṳ']), + ('Ṵ', &['ṵ']), + ('ṵ', &['Ṵ']), + ('Ṷ', &['ṷ']), + ('ṷ', &['Ṷ']), + ('Ṹ', &['ṹ']), + ('ṹ', &['Ṹ']), + ('Ṻ', &['ṻ']), + ('ṻ', &['Ṻ']), + ('Ṽ', &['ṽ']), + ('ṽ', &['Ṽ']), + ('Ṿ', &['ṿ']), + ('ṿ', &['Ṿ']), + ('Ẁ', &['ẁ']), + ('ẁ', &['Ẁ']), + ('Ẃ', &['ẃ']), + ('ẃ', &['Ẃ']), + ('Ẅ', &['ẅ']), + ('ẅ', &['Ẅ']), + ('Ẇ', &['ẇ']), + ('ẇ', &['Ẇ']), + ('Ẉ', &['ẉ']), + ('ẉ', &['Ẉ']), + ('Ẋ', &['ẋ']), + ('ẋ', &['Ẋ']), + ('Ẍ', &['ẍ']), + ('ẍ', &['Ẍ']), + ('Ẏ', &['ẏ']), + ('ẏ', &['Ẏ']), + ('Ẑ', &['ẑ']), + ('ẑ', &['Ẑ']), + ('Ẓ', &['ẓ']), + ('ẓ', &['Ẓ']), + ('Ẕ', &['ẕ']), + ('ẕ', &['Ẕ']), + ('ẛ', &['Ṡ', 'ṡ']), + ('ẞ', &['ß']), + ('Ạ', &['ạ']), + ('ạ', &['Ạ']), + ('Ả', &['ả']), + ('ả', &['Ả']), + ('Ấ', &['ấ']), + ('ấ', &['Ấ']), + ('Ầ', &['ầ']), + ('ầ', &['Ầ']), + ('Ẩ', &['ẩ']), + ('ẩ', &['Ẩ']), + ('Ẫ', &['ẫ']), + ('ẫ', &['Ẫ']), + ('Ậ', &['ậ']), + ('ậ', &['Ậ']), + ('Ắ', &['ắ']), + ('ắ', &['Ắ']), + ('Ằ', &['ằ']), + ('ằ', &['Ằ']), + ('Ẳ', &['ẳ']), + ('ẳ', &['Ẳ']), + ('Ẵ', &['ẵ']), + ('ẵ', &['Ẵ']), + ('Ặ', &['ặ']), + ('ặ', &['Ặ']), + ('Ẹ', &['ẹ']), + ('ẹ', &['Ẹ']), + ('Ẻ', &['ẻ']), + ('ẻ', &['Ẻ']), + ('Ẽ', &['ẽ']), + ('ẽ', &['Ẽ']), + ('Ế', &['ế']), + ('ế', &['Ế']), + ('Ề', &['ề']), + ('ề', &['Ề']), + ('Ể', &['ể']), + ('ể', &['Ể']), + ('Ễ', &['ễ']), + ('ễ', &['Ễ']), + ('Ệ', &['ệ']), + ('ệ', &['Ệ']), + ('Ỉ', &['ỉ']), + ('ỉ', &['Ỉ']), + ('Ị', &['ị']), + ('ị', &['Ị']), + ('Ọ', &['ọ']), + ('ọ', &['Ọ']), + ('Ỏ', &['ỏ']), + ('ỏ', &['Ỏ']), + ('Ố', &['ố']), + ('ố', &['Ố']), + ('Ồ', &['ồ']), + ('ồ', &['Ồ']), + ('Ổ', &['ổ']), + ('ổ', &['Ổ']), + ('Ỗ', &['ỗ']), + ('ỗ', &['Ỗ']), + ('Ộ', &['ộ']), + ('ộ', &['Ộ']), + ('Ớ', &['ớ']), + ('ớ', &['Ớ']), + ('Ờ', &['ờ']), + ('ờ', &['Ờ']), + ('Ở', &['ở']), + ('ở', &['Ở']), + ('Ỡ', &['ỡ']), + ('ỡ', &['Ỡ']), + ('Ợ', &['ợ']), + ('ợ', &['Ợ']), + ('Ụ', &['ụ']), + ('ụ', &['Ụ']), + ('Ủ', &['ủ']), + ('ủ', &['Ủ']), + ('Ứ', &['ứ']), + ('ứ', &['Ứ']), + ('Ừ', &['ừ']), + ('ừ', &['Ừ']), + ('Ử', &['ử']), + ('ử', &['Ử']), + ('Ữ', &['ữ']), + ('ữ', &['Ữ']), + ('Ự', &['ự']), + ('ự', &['Ự']), + ('Ỳ', &['ỳ']), + ('ỳ', &['Ỳ']), + ('Ỵ', &['ỵ']), + ('ỵ', &['Ỵ']), + ('Ỷ', &['ỷ']), + ('ỷ', &['Ỷ']), + ('Ỹ', &['ỹ']), + ('ỹ', &['Ỹ']), + ('Ỻ', &['ỻ']), + ('ỻ', &['Ỻ']), + ('Ỽ', &['ỽ']), + ('ỽ', &['Ỽ']), + ('Ỿ', &['ỿ']), + ('ỿ', &['Ỿ']), + ('ἀ', &['Ἀ']), + ('ἁ', &['Ἁ']), + ('ἂ', &['Ἂ']), + ('ἃ', &['Ἃ']), + ('ἄ', &['Ἄ']), + ('ἅ', &['Ἅ']), + ('ἆ', &['Ἆ']), + ('ἇ', &['Ἇ']), + ('Ἀ', &['ἀ']), + ('Ἁ', &['ἁ']), + ('Ἂ', &['ἂ']), + ('Ἃ', &['ἃ']), + ('Ἄ', &['ἄ']), + ('Ἅ', &['ἅ']), + ('Ἆ', &['ἆ']), + ('Ἇ', &['ἇ']), + ('ἐ', &['Ἐ']), + ('ἑ', &['Ἑ']), + ('ἒ', &['Ἒ']), + ('ἓ', &['Ἓ']), + ('ἔ', &['Ἔ']), + ('ἕ', &['Ἕ']), + ('Ἐ', &['ἐ']), + ('Ἑ', &['ἑ']), + ('Ἒ', &['ἒ']), + ('Ἓ', &['ἓ']), + ('Ἔ', &['ἔ']), + ('Ἕ', &['ἕ']), + ('ἠ', &['Ἠ']), + ('ἡ', &['Ἡ']), + ('ἢ', &['Ἢ']), + ('ἣ', &['Ἣ']), + ('ἤ', &['Ἤ']), + ('ἥ', &['Ἥ']), + ('ἦ', &['Ἦ']), + ('ἧ', &['Ἧ']), + ('Ἠ', &['ἠ']), + ('Ἡ', &['ἡ']), + ('Ἢ', &['ἢ']), + ('Ἣ', &['ἣ']), + ('Ἤ', &['ἤ']), + ('Ἥ', &['ἥ']), + ('Ἦ', &['ἦ']), + ('Ἧ', &['ἧ']), + ('ἰ', &['Ἰ']), + ('ἱ', &['Ἱ']), + ('ἲ', &['Ἲ']), + ('ἳ', &['Ἳ']), + ('ἴ', &['Ἴ']), + ('ἵ', &['Ἵ']), + ('ἶ', &['Ἶ']), + ('ἷ', &['Ἷ']), + ('Ἰ', &['ἰ']), + ('Ἱ', &['ἱ']), + ('Ἲ', &['ἲ']), + ('Ἳ', &['ἳ']), + ('Ἴ', &['ἴ']), + ('Ἵ', &['ἵ']), + ('Ἶ', &['ἶ']), + ('Ἷ', &['ἷ']), + ('ὀ', &['Ὀ']), + ('ὁ', &['Ὁ']), + ('ὂ', &['Ὂ']), + ('ὃ', &['Ὃ']), + ('ὄ', &['Ὄ']), + ('ὅ', &['Ὅ']), + ('Ὀ', &['ὀ']), + ('Ὁ', &['ὁ']), + ('Ὂ', &['ὂ']), + ('Ὃ', &['ὃ']), + ('Ὄ', &['ὄ']), + ('Ὅ', &['ὅ']), + ('ὑ', &['Ὑ']), + ('ὓ', &['Ὓ']), + ('ὕ', &['Ὕ']), + ('ὗ', &['Ὗ']), + ('Ὑ', &['ὑ']), + ('Ὓ', &['ὓ']), + ('Ὕ', &['ὕ']), + ('Ὗ', &['ὗ']), + ('ὠ', &['Ὠ']), + ('ὡ', &['Ὡ']), + ('ὢ', &['Ὢ']), + ('ὣ', &['Ὣ']), + ('ὤ', &['Ὤ']), + ('ὥ', &['Ὥ']), + ('ὦ', &['Ὦ']), + ('ὧ', &['Ὧ']), + ('Ὠ', &['ὠ']), + ('Ὡ', &['ὡ']), + ('Ὢ', &['ὢ']), + ('Ὣ', &['ὣ']), + ('Ὤ', &['ὤ']), + ('Ὥ', &['ὥ']), + ('Ὦ', &['ὦ']), + ('Ὧ', &['ὧ']), + ('ὰ', &['Ὰ']), + ('ά', &['Ά']), + ('ὲ', &['Ὲ']), + ('έ', &['Έ']), + ('ὴ', &['Ὴ']), + ('ή', &['Ή']), + ('ὶ', &['Ὶ']), + ('ί', &['Ί']), + ('ὸ', &['Ὸ']), + ('ό', &['Ό']), + ('ὺ', &['Ὺ']), + ('ύ', &['Ύ']), + ('ὼ', &['Ὼ']), + ('ώ', &['Ώ']), + ('ᾀ', &['ᾈ']), + ('ᾁ', &['ᾉ']), + ('ᾂ', &['ᾊ']), + ('ᾃ', &['ᾋ']), + ('ᾄ', &['ᾌ']), + ('ᾅ', &['ᾍ']), + ('ᾆ', &['ᾎ']), + ('ᾇ', &['ᾏ']), + ('ᾈ', &['ᾀ']), + ('ᾉ', &['ᾁ']), + ('ᾊ', &['ᾂ']), + ('ᾋ', &['ᾃ']), + ('ᾌ', &['ᾄ']), + ('ᾍ', &['ᾅ']), + ('ᾎ', &['ᾆ']), + ('ᾏ', &['ᾇ']), + ('ᾐ', &['ᾘ']), + ('ᾑ', &['ᾙ']), + ('ᾒ', &['ᾚ']), + ('ᾓ', &['ᾛ']), + ('ᾔ', &['ᾜ']), + ('ᾕ', &['ᾝ']), + ('ᾖ', &['ᾞ']), + ('ᾗ', &['ᾟ']), + ('ᾘ', &['ᾐ']), + ('ᾙ', &['ᾑ']), + ('ᾚ', &['ᾒ']), + ('ᾛ', &['ᾓ']), + ('ᾜ', &['ᾔ']), + ('ᾝ', &['ᾕ']), + ('ᾞ', &['ᾖ']), + ('ᾟ', &['ᾗ']), + ('ᾠ', &['ᾨ']), + ('ᾡ', &['ᾩ']), + ('ᾢ', &['ᾪ']), + ('ᾣ', &['ᾫ']), + ('ᾤ', &['ᾬ']), + ('ᾥ', &['ᾭ']), + ('ᾦ', &['ᾮ']), + ('ᾧ', &['ᾯ']), + ('ᾨ', &['ᾠ']), + ('ᾩ', &['ᾡ']), + ('ᾪ', &['ᾢ']), + ('ᾫ', &['ᾣ']), + ('ᾬ', &['ᾤ']), + ('ᾭ', &['ᾥ']), + ('ᾮ', &['ᾦ']), + ('ᾯ', &['ᾧ']), + ('ᾰ', &['Ᾰ']), + ('ᾱ', &['Ᾱ']), + ('ᾳ', &['ᾼ']), + ('Ᾰ', &['ᾰ']), + ('Ᾱ', &['ᾱ']), + ('Ὰ', &['ὰ']), + ('Ά', &['ά']), + ('ᾼ', &['ᾳ']), + ('ι', &['\u{345}', 'Ι', 'ι']), + ('ῃ', &['ῌ']), + ('Ὲ', &['ὲ']), + ('Έ', &['έ']), + ('Ὴ', &['ὴ']), + ('Ή', &['ή']), + ('ῌ', &['ῃ']), + ('ῐ', &['Ῐ']), + ('ῑ', &['Ῑ']), + ('Ῐ', &['ῐ']), + ('Ῑ', &['ῑ']), + ('Ὶ', &['ὶ']), + ('Ί', &['ί']), + ('ῠ', &['Ῠ']), + ('ῡ', &['Ῡ']), + ('ῥ', &['Ῥ']), + ('Ῠ', &['ῠ']), + ('Ῡ', &['ῡ']), + ('Ὺ', &['ὺ']), + ('Ύ', &['ύ']), + ('Ῥ', &['ῥ']), + ('ῳ', &['ῼ']), + ('Ὸ', &['ὸ']), + ('Ό', &['ό']), + ('Ὼ', &['ὼ']), + ('Ώ', &['ώ']), + ('ῼ', &['ῳ']), + ('Ω', &['Ω', 'ω']), + ('K', &['K', 'k']), + ('Å', &['Å', 'å']), + ('Ⅎ', &['ⅎ']), + ('ⅎ', &['Ⅎ']), + ('Ⅰ', &['ⅰ']), + ('Ⅱ', &['ⅱ']), + ('Ⅲ', &['ⅲ']), + ('Ⅳ', &['ⅳ']), + ('Ⅴ', &['ⅴ']), + ('Ⅵ', &['ⅵ']), + ('Ⅶ', &['ⅶ']), + ('Ⅷ', &['ⅷ']), + ('Ⅸ', &['ⅸ']), + ('Ⅹ', &['ⅹ']), + ('Ⅺ', &['ⅺ']), + ('Ⅻ', &['ⅻ']), + ('Ⅼ', &['ⅼ']), + ('Ⅽ', &['ⅽ']), + ('Ⅾ', &['ⅾ']), + ('Ⅿ', &['ⅿ']), + ('ⅰ', &['Ⅰ']), + ('ⅱ', &['Ⅱ']), + ('ⅲ', &['Ⅲ']), + ('ⅳ', &['Ⅳ']), + ('ⅴ', &['Ⅴ']), + ('ⅵ', &['Ⅵ']), + ('ⅶ', &['Ⅶ']), + ('ⅷ', &['Ⅷ']), + ('ⅸ', &['Ⅸ']), + ('ⅹ', &['Ⅹ']), + ('ⅺ', &['Ⅺ']), + ('ⅻ', &['Ⅻ']), + ('ⅼ', &['Ⅼ']), + ('ⅽ', &['Ⅽ']), + ('ⅾ', &['Ⅾ']), + ('ⅿ', &['Ⅿ']), + ('Ↄ', &['ↄ']), + ('ↄ', &['Ↄ']), + ('Ⓐ', &['ⓐ']), + ('Ⓑ', &['ⓑ']), + ('Ⓒ', &['ⓒ']), + ('Ⓓ', &['ⓓ']), + ('Ⓔ', &['ⓔ']), + ('Ⓕ', &['ⓕ']), + ('Ⓖ', &['ⓖ']), + ('Ⓗ', &['ⓗ']), + ('Ⓘ', &['ⓘ']), + ('Ⓙ', &['ⓙ']), + ('Ⓚ', &['ⓚ']), + ('Ⓛ', &['ⓛ']), + ('Ⓜ', &['ⓜ']), + ('Ⓝ', &['ⓝ']), + ('Ⓞ', &['ⓞ']), + ('Ⓟ', &['ⓟ']), + ('Ⓠ', &['ⓠ']), + ('Ⓡ', &['ⓡ']), + ('Ⓢ', &['ⓢ']), + ('Ⓣ', &['ⓣ']), + ('Ⓤ', &['ⓤ']), + ('Ⓥ', &['ⓥ']), + ('Ⓦ', &['ⓦ']), + ('Ⓧ', &['ⓧ']), + ('Ⓨ', &['ⓨ']), + ('Ⓩ', &['ⓩ']), + ('ⓐ', &['Ⓐ']), + ('ⓑ', &['Ⓑ']), + ('ⓒ', &['Ⓒ']), + ('ⓓ', &['Ⓓ']), + ('ⓔ', &['Ⓔ']), + ('ⓕ', &['Ⓕ']), + ('ⓖ', &['Ⓖ']), + ('ⓗ', &['Ⓗ']), + ('ⓘ', &['Ⓘ']), + ('ⓙ', &['Ⓙ']), + ('ⓚ', &['Ⓚ']), + ('ⓛ', &['Ⓛ']), + ('ⓜ', &['Ⓜ']), + ('ⓝ', &['Ⓝ']), + ('ⓞ', &['Ⓞ']), + ('ⓟ', &['Ⓟ']), + ('ⓠ', &['Ⓠ']), + ('ⓡ', &['Ⓡ']), + ('ⓢ', &['Ⓢ']), + ('ⓣ', &['Ⓣ']), + ('ⓤ', &['Ⓤ']), + ('ⓥ', &['Ⓥ']), + ('ⓦ', &['Ⓦ']), + ('ⓧ', &['Ⓧ']), + ('ⓨ', &['Ⓨ']), + ('ⓩ', &['Ⓩ']), + ('Ⰰ', &['ⰰ']), + ('Ⰱ', &['ⰱ']), + ('Ⰲ', &['ⰲ']), + ('Ⰳ', &['ⰳ']), + ('Ⰴ', &['ⰴ']), + ('Ⰵ', &['ⰵ']), + ('Ⰶ', &['ⰶ']), + ('Ⰷ', &['ⰷ']), + ('Ⰸ', &['ⰸ']), + ('Ⰹ', &['ⰹ']), + ('Ⰺ', &['ⰺ']), + ('Ⰻ', &['ⰻ']), + ('Ⰼ', &['ⰼ']), + ('Ⰽ', &['ⰽ']), + ('Ⰾ', &['ⰾ']), + ('Ⰿ', &['ⰿ']), + ('Ⱀ', &['ⱀ']), + ('Ⱁ', &['ⱁ']), + ('Ⱂ', &['ⱂ']), + ('Ⱃ', &['ⱃ']), + ('Ⱄ', &['ⱄ']), + ('Ⱅ', &['ⱅ']), + ('Ⱆ', &['ⱆ']), + ('Ⱇ', &['ⱇ']), + ('Ⱈ', &['ⱈ']), + ('Ⱉ', &['ⱉ']), + ('Ⱊ', &['ⱊ']), + ('Ⱋ', &['ⱋ']), + ('Ⱌ', &['ⱌ']), + ('Ⱍ', &['ⱍ']), + ('Ⱎ', &['ⱎ']), + ('Ⱏ', &['ⱏ']), + ('Ⱐ', &['ⱐ']), + ('Ⱑ', &['ⱑ']), + ('Ⱒ', &['ⱒ']), + ('Ⱓ', &['ⱓ']), + ('Ⱔ', &['ⱔ']), + ('Ⱕ', &['ⱕ']), + ('Ⱖ', &['ⱖ']), + ('Ⱗ', &['ⱗ']), + ('Ⱘ', &['ⱘ']), + ('Ⱙ', &['ⱙ']), + ('Ⱚ', &['ⱚ']), + ('Ⱛ', &['ⱛ']), + ('Ⱜ', &['ⱜ']), + ('Ⱝ', &['ⱝ']), + ('Ⱞ', &['ⱞ']), + ('Ⱟ', &['ⱟ']), + ('ⰰ', &['Ⰰ']), + ('ⰱ', &['Ⰱ']), + ('ⰲ', &['Ⰲ']), + ('ⰳ', &['Ⰳ']), + ('ⰴ', &['Ⰴ']), + ('ⰵ', &['Ⰵ']), + ('ⰶ', &['Ⰶ']), + ('ⰷ', &['Ⰷ']), + ('ⰸ', &['Ⰸ']), + ('ⰹ', &['Ⰹ']), + ('ⰺ', &['Ⰺ']), + ('ⰻ', &['Ⰻ']), + ('ⰼ', &['Ⰼ']), + ('ⰽ', &['Ⰽ']), + ('ⰾ', &['Ⰾ']), + ('ⰿ', &['Ⰿ']), + ('ⱀ', &['Ⱀ']), + ('ⱁ', &['Ⱁ']), + ('ⱂ', &['Ⱂ']), + ('ⱃ', &['Ⱃ']), + ('ⱄ', &['Ⱄ']), + ('ⱅ', &['Ⱅ']), + ('ⱆ', &['Ⱆ']), + ('ⱇ', &['Ⱇ']), + ('ⱈ', &['Ⱈ']), + ('ⱉ', &['Ⱉ']), + ('ⱊ', &['Ⱊ']), + ('ⱋ', &['Ⱋ']), + ('ⱌ', &['Ⱌ']), + ('ⱍ', &['Ⱍ']), + ('ⱎ', &['Ⱎ']), + ('ⱏ', &['Ⱏ']), + ('ⱐ', &['Ⱐ']), + ('ⱑ', &['Ⱑ']), + ('ⱒ', &['Ⱒ']), + ('ⱓ', &['Ⱓ']), + ('ⱔ', &['Ⱔ']), + ('ⱕ', &['Ⱕ']), + ('ⱖ', &['Ⱖ']), + ('ⱗ', &['Ⱗ']), + ('ⱘ', &['Ⱘ']), + ('ⱙ', &['Ⱙ']), + ('ⱚ', &['Ⱚ']), + ('ⱛ', &['Ⱛ']), + ('ⱜ', &['Ⱜ']), + ('ⱝ', &['Ⱝ']), + ('ⱞ', &['Ⱞ']), + ('ⱟ', &['Ⱟ']), + ('Ⱡ', &['ⱡ']), + ('ⱡ', &['Ⱡ']), + ('Ɫ', &['ɫ']), + ('Ᵽ', &['ᵽ']), + ('Ɽ', &['ɽ']), + ('ⱥ', &['Ⱥ']), + ('ⱦ', &['Ⱦ']), + ('Ⱨ', &['ⱨ']), + ('ⱨ', &['Ⱨ']), + ('Ⱪ', &['ⱪ']), + ('ⱪ', &['Ⱪ']), + ('Ⱬ', &['ⱬ']), + ('ⱬ', &['Ⱬ']), + ('Ɑ', &['ɑ']), + ('Ɱ', &['ɱ']), + ('Ɐ', &['ɐ']), + ('Ɒ', &['ɒ']), + ('Ⱳ', &['ⱳ']), + ('ⱳ', &['Ⱳ']), + ('Ⱶ', &['ⱶ']), + ('ⱶ', &['Ⱶ']), + ('Ȿ', &['ȿ']), + ('Ɀ', &['ɀ']), + ('Ⲁ', &['ⲁ']), + ('ⲁ', &['Ⲁ']), + ('Ⲃ', &['ⲃ']), + ('ⲃ', &['Ⲃ']), + ('Ⲅ', &['ⲅ']), + ('ⲅ', &['Ⲅ']), + ('Ⲇ', &['ⲇ']), + ('ⲇ', &['Ⲇ']), + ('Ⲉ', &['ⲉ']), + ('ⲉ', &['Ⲉ']), + ('Ⲋ', &['ⲋ']), + ('ⲋ', &['Ⲋ']), + ('Ⲍ', &['ⲍ']), + ('ⲍ', &['Ⲍ']), + ('Ⲏ', &['ⲏ']), + ('ⲏ', &['Ⲏ']), + ('Ⲑ', &['ⲑ']), + ('ⲑ', &['Ⲑ']), + ('Ⲓ', &['ⲓ']), + ('ⲓ', &['Ⲓ']), + ('Ⲕ', &['ⲕ']), + ('ⲕ', &['Ⲕ']), + ('Ⲗ', &['ⲗ']), + ('ⲗ', &['Ⲗ']), + ('Ⲙ', &['ⲙ']), + ('ⲙ', &['Ⲙ']), + ('Ⲛ', &['ⲛ']), + ('ⲛ', &['Ⲛ']), + ('Ⲝ', &['ⲝ']), + ('ⲝ', &['Ⲝ']), + ('Ⲟ', &['ⲟ']), + ('ⲟ', &['Ⲟ']), + ('Ⲡ', &['ⲡ']), + ('ⲡ', &['Ⲡ']), + ('Ⲣ', &['ⲣ']), + ('ⲣ', &['Ⲣ']), + ('Ⲥ', &['ⲥ']), + ('ⲥ', &['Ⲥ']), + ('Ⲧ', &['ⲧ']), + ('ⲧ', &['Ⲧ']), + ('Ⲩ', &['ⲩ']), + ('ⲩ', &['Ⲩ']), + ('Ⲫ', &['ⲫ']), + ('ⲫ', &['Ⲫ']), + ('Ⲭ', &['ⲭ']), + ('ⲭ', &['Ⲭ']), + ('Ⲯ', &['ⲯ']), + ('ⲯ', &['Ⲯ']), + ('Ⲱ', &['ⲱ']), + ('ⲱ', &['Ⲱ']), + ('Ⲳ', &['ⲳ']), + ('ⲳ', &['Ⲳ']), + ('Ⲵ', &['ⲵ']), + ('ⲵ', &['Ⲵ']), + ('Ⲷ', &['ⲷ']), + ('ⲷ', &['Ⲷ']), + ('Ⲹ', &['ⲹ']), + ('ⲹ', &['Ⲹ']), + ('Ⲻ', &['ⲻ']), + ('ⲻ', &['Ⲻ']), + ('Ⲽ', &['ⲽ']), + ('ⲽ', &['Ⲽ']), + ('Ⲿ', &['ⲿ']), + ('ⲿ', &['Ⲿ']), + ('Ⳁ', &['ⳁ']), + ('ⳁ', &['Ⳁ']), + ('Ⳃ', &['ⳃ']), + ('ⳃ', &['Ⳃ']), + ('Ⳅ', &['ⳅ']), + ('ⳅ', &['Ⳅ']), + ('Ⳇ', &['ⳇ']), + ('ⳇ', &['Ⳇ']), + ('Ⳉ', &['ⳉ']), + ('ⳉ', &['Ⳉ']), + ('Ⳋ', &['ⳋ']), + ('ⳋ', &['Ⳋ']), + ('Ⳍ', &['ⳍ']), + ('ⳍ', &['Ⳍ']), + ('Ⳏ', &['ⳏ']), + ('ⳏ', &['Ⳏ']), + ('Ⳑ', &['ⳑ']), + ('ⳑ', &['Ⳑ']), + ('Ⳓ', &['ⳓ']), + ('ⳓ', &['Ⳓ']), + ('Ⳕ', &['ⳕ']), + ('ⳕ', &['Ⳕ']), + ('Ⳗ', &['ⳗ']), + ('ⳗ', &['Ⳗ']), + ('Ⳙ', &['ⳙ']), + ('ⳙ', &['Ⳙ']), + ('Ⳛ', &['ⳛ']), + ('ⳛ', &['Ⳛ']), + ('Ⳝ', &['ⳝ']), + ('ⳝ', &['Ⳝ']), + ('Ⳟ', &['ⳟ']), + ('ⳟ', &['Ⳟ']), + ('Ⳡ', &['ⳡ']), + ('ⳡ', &['Ⳡ']), + ('Ⳣ', &['ⳣ']), + ('ⳣ', &['Ⳣ']), + ('Ⳬ', &['ⳬ']), + ('ⳬ', &['Ⳬ']), + ('Ⳮ', &['ⳮ']), + ('ⳮ', &['Ⳮ']), + ('Ⳳ', &['ⳳ']), + ('ⳳ', &['Ⳳ']), + ('ⴀ', &['Ⴀ']), + ('ⴁ', &['Ⴁ']), + ('ⴂ', &['Ⴂ']), + ('ⴃ', &['Ⴃ']), + ('ⴄ', &['Ⴄ']), + ('ⴅ', &['Ⴅ']), + ('ⴆ', &['Ⴆ']), + ('ⴇ', &['Ⴇ']), + ('ⴈ', &['Ⴈ']), + ('ⴉ', &['Ⴉ']), + ('ⴊ', &['Ⴊ']), + ('ⴋ', &['Ⴋ']), + ('ⴌ', &['Ⴌ']), + ('ⴍ', &['Ⴍ']), + ('ⴎ', &['Ⴎ']), + ('ⴏ', &['Ⴏ']), + ('ⴐ', &['Ⴐ']), + ('ⴑ', &['Ⴑ']), + ('ⴒ', &['Ⴒ']), + ('ⴓ', &['Ⴓ']), + ('ⴔ', &['Ⴔ']), + ('ⴕ', &['Ⴕ']), + ('ⴖ', &['Ⴖ']), + ('ⴗ', &['Ⴗ']), + ('ⴘ', &['Ⴘ']), + ('ⴙ', &['Ⴙ']), + ('ⴚ', &['Ⴚ']), + ('ⴛ', &['Ⴛ']), + ('ⴜ', &['Ⴜ']), + ('ⴝ', &['Ⴝ']), + ('ⴞ', &['Ⴞ']), + ('ⴟ', &['Ⴟ']), + ('ⴠ', &['Ⴠ']), + ('ⴡ', &['Ⴡ']), + ('ⴢ', &['Ⴢ']), + ('ⴣ', &['Ⴣ']), + ('ⴤ', &['Ⴤ']), + ('ⴥ', &['Ⴥ']), + ('ⴧ', &['Ⴧ']), + ('ⴭ', &['Ⴭ']), + ('Ꙁ', &['ꙁ']), + ('ꙁ', &['Ꙁ']), + ('Ꙃ', &['ꙃ']), + ('ꙃ', &['Ꙃ']), + ('Ꙅ', &['ꙅ']), + ('ꙅ', &['Ꙅ']), + ('Ꙇ', &['ꙇ']), + ('ꙇ', &['Ꙇ']), + ('Ꙉ', &['ꙉ']), + ('ꙉ', &['Ꙉ']), + ('Ꙋ', &['ᲈ', 'ꙋ']), + ('ꙋ', &['ᲈ', 'Ꙋ']), + ('Ꙍ', &['ꙍ']), + ('ꙍ', &['Ꙍ']), + ('Ꙏ', &['ꙏ']), + ('ꙏ', &['Ꙏ']), + ('Ꙑ', &['ꙑ']), + ('ꙑ', &['Ꙑ']), + ('Ꙓ', &['ꙓ']), + ('ꙓ', &['Ꙓ']), + ('Ꙕ', &['ꙕ']), + ('ꙕ', &['Ꙕ']), + ('Ꙗ', &['ꙗ']), + ('ꙗ', &['Ꙗ']), + ('Ꙙ', &['ꙙ']), + ('ꙙ', &['Ꙙ']), + ('Ꙛ', &['ꙛ']), + ('ꙛ', &['Ꙛ']), + ('Ꙝ', &['ꙝ']), + ('ꙝ', &['Ꙝ']), + ('Ꙟ', &['ꙟ']), + ('ꙟ', &['Ꙟ']), + ('Ꙡ', &['ꙡ']), + ('ꙡ', &['Ꙡ']), + ('Ꙣ', &['ꙣ']), + ('ꙣ', &['Ꙣ']), + ('Ꙥ', &['ꙥ']), + ('ꙥ', &['Ꙥ']), + ('Ꙧ', &['ꙧ']), + ('ꙧ', &['Ꙧ']), + ('Ꙩ', &['ꙩ']), + ('ꙩ', &['Ꙩ']), + ('Ꙫ', &['ꙫ']), + ('ꙫ', &['Ꙫ']), + ('Ꙭ', &['ꙭ']), + ('ꙭ', &['Ꙭ']), + ('Ꚁ', &['ꚁ']), + ('ꚁ', &['Ꚁ']), + ('Ꚃ', &['ꚃ']), + ('ꚃ', &['Ꚃ']), + ('Ꚅ', &['ꚅ']), + ('ꚅ', &['Ꚅ']), + ('Ꚇ', &['ꚇ']), + ('ꚇ', &['Ꚇ']), + ('Ꚉ', &['ꚉ']), + ('ꚉ', &['Ꚉ']), + ('Ꚋ', &['ꚋ']), + ('ꚋ', &['Ꚋ']), + ('Ꚍ', &['ꚍ']), + ('ꚍ', &['Ꚍ']), + ('Ꚏ', &['ꚏ']), + ('ꚏ', &['Ꚏ']), + ('Ꚑ', &['ꚑ']), + ('ꚑ', &['Ꚑ']), + ('Ꚓ', &['ꚓ']), + ('ꚓ', &['Ꚓ']), + ('Ꚕ', &['ꚕ']), + ('ꚕ', &['Ꚕ']), + ('Ꚗ', &['ꚗ']), + ('ꚗ', &['Ꚗ']), + ('Ꚙ', &['ꚙ']), + ('ꚙ', &['Ꚙ']), + ('Ꚛ', &['ꚛ']), + ('ꚛ', &['Ꚛ']), + ('Ꜣ', &['ꜣ']), + ('ꜣ', &['Ꜣ']), + ('Ꜥ', &['ꜥ']), + ('ꜥ', &['Ꜥ']), + ('Ꜧ', &['ꜧ']), + ('ꜧ', &['Ꜧ']), + ('Ꜩ', &['ꜩ']), + ('ꜩ', &['Ꜩ']), + ('Ꜫ', &['ꜫ']), + ('ꜫ', &['Ꜫ']), + ('Ꜭ', &['ꜭ']), + ('ꜭ', &['Ꜭ']), + ('Ꜯ', &['ꜯ']), + ('ꜯ', &['Ꜯ']), + ('Ꜳ', &['ꜳ']), + ('ꜳ', &['Ꜳ']), + ('Ꜵ', &['ꜵ']), + ('ꜵ', &['Ꜵ']), + ('Ꜷ', &['ꜷ']), + ('ꜷ', &['Ꜷ']), + ('Ꜹ', &['ꜹ']), + ('ꜹ', &['Ꜹ']), + ('Ꜻ', &['ꜻ']), + ('ꜻ', &['Ꜻ']), + ('Ꜽ', &['ꜽ']), + ('ꜽ', &['Ꜽ']), + ('Ꜿ', &['ꜿ']), + ('ꜿ', &['Ꜿ']), + ('Ꝁ', &['ꝁ']), + ('ꝁ', &['Ꝁ']), + ('Ꝃ', &['ꝃ']), + ('ꝃ', &['Ꝃ']), + ('Ꝅ', &['ꝅ']), + ('ꝅ', &['Ꝅ']), + ('Ꝇ', &['ꝇ']), + ('ꝇ', &['Ꝇ']), + ('Ꝉ', &['ꝉ']), + ('ꝉ', &['Ꝉ']), + ('Ꝋ', &['ꝋ']), + ('ꝋ', &['Ꝋ']), + ('Ꝍ', &['ꝍ']), + ('ꝍ', &['Ꝍ']), + ('Ꝏ', &['ꝏ']), + ('ꝏ', &['Ꝏ']), + ('Ꝑ', &['ꝑ']), + ('ꝑ', &['Ꝑ']), + ('Ꝓ', &['ꝓ']), + ('ꝓ', &['Ꝓ']), + ('Ꝕ', &['ꝕ']), + ('ꝕ', &['Ꝕ']), + ('Ꝗ', &['ꝗ']), + ('ꝗ', &['Ꝗ']), + ('Ꝙ', &['ꝙ']), + ('ꝙ', &['Ꝙ']), + ('Ꝛ', &['ꝛ']), + ('ꝛ', &['Ꝛ']), + ('Ꝝ', &['ꝝ']), + ('ꝝ', &['Ꝝ']), + ('Ꝟ', &['ꝟ']), + ('ꝟ', &['Ꝟ']), + ('Ꝡ', &['ꝡ']), + ('ꝡ', &['Ꝡ']), + ('Ꝣ', &['ꝣ']), + ('ꝣ', &['Ꝣ']), + ('Ꝥ', &['ꝥ']), + ('ꝥ', &['Ꝥ']), + ('Ꝧ', &['ꝧ']), + ('ꝧ', &['Ꝧ']), + ('Ꝩ', &['ꝩ']), + ('ꝩ', &['Ꝩ']), + ('Ꝫ', &['ꝫ']), + ('ꝫ', &['Ꝫ']), + ('Ꝭ', &['ꝭ']), + ('ꝭ', &['Ꝭ']), + ('Ꝯ', &['ꝯ']), + ('ꝯ', &['Ꝯ']), + ('Ꝺ', &['ꝺ']), + ('ꝺ', &['Ꝺ']), + ('Ꝼ', &['ꝼ']), + ('ꝼ', &['Ꝼ']), + ('Ᵹ', &['ᵹ']), + ('Ꝿ', &['ꝿ']), + ('ꝿ', &['Ꝿ']), + ('Ꞁ', &['ꞁ']), + ('ꞁ', &['Ꞁ']), + ('Ꞃ', &['ꞃ']), + ('ꞃ', &['Ꞃ']), + ('Ꞅ', &['ꞅ']), + ('ꞅ', &['Ꞅ']), + ('Ꞇ', &['ꞇ']), + ('ꞇ', &['Ꞇ']), + ('Ꞌ', &['ꞌ']), + ('ꞌ', &['Ꞌ']), + ('Ɥ', &['ɥ']), + ('Ꞑ', &['ꞑ']), + ('ꞑ', &['Ꞑ']), + ('Ꞓ', &['ꞓ']), + ('ꞓ', &['Ꞓ']), + ('ꞔ', &['Ꞔ']), + ('Ꞗ', &['ꞗ']), + ('ꞗ', &['Ꞗ']), + ('Ꞙ', &['ꞙ']), + ('ꞙ', &['Ꞙ']), + ('Ꞛ', &['ꞛ']), + ('ꞛ', &['Ꞛ']), + ('Ꞝ', &['ꞝ']), + ('ꞝ', &['Ꞝ']), + ('Ꞟ', &['ꞟ']), + ('ꞟ', &['Ꞟ']), + ('Ꞡ', &['ꞡ']), + ('ꞡ', &['Ꞡ']), + ('Ꞣ', &['ꞣ']), + ('ꞣ', &['Ꞣ']), + ('Ꞥ', &['ꞥ']), + ('ꞥ', &['Ꞥ']), + ('Ꞧ', &['ꞧ']), + ('ꞧ', &['Ꞧ']), + ('Ꞩ', &['ꞩ']), + ('ꞩ', &['Ꞩ']), + ('Ɦ', &['ɦ']), + ('Ɜ', &['ɜ']), + ('Ɡ', &['ɡ']), + ('Ɬ', &['ɬ']), + ('Ɪ', &['ɪ']), + ('Ʞ', &['ʞ']), + ('Ʇ', &['ʇ']), + ('Ʝ', &['ʝ']), + ('Ꭓ', &['ꭓ']), + ('Ꞵ', &['ꞵ']), + ('ꞵ', &['Ꞵ']), + ('Ꞷ', &['ꞷ']), + ('ꞷ', &['Ꞷ']), + ('Ꞹ', &['ꞹ']), + ('ꞹ', &['Ꞹ']), + ('Ꞻ', &['ꞻ']), + ('ꞻ', &['Ꞻ']), + ('Ꞽ', &['ꞽ']), + ('ꞽ', &['Ꞽ']), + ('Ꞿ', &['ꞿ']), + ('ꞿ', &['Ꞿ']), + ('Ꟁ', &['ꟁ']), + ('ꟁ', &['Ꟁ']), + ('Ꟃ', &['ꟃ']), + ('ꟃ', &['Ꟃ']), + ('Ꞔ', &['ꞔ']), + ('Ʂ', &['ʂ']), + ('Ᶎ', &['ᶎ']), + ('Ꟈ', &['ꟈ']), + ('ꟈ', &['Ꟈ']), + ('Ꟊ', &['ꟊ']), + ('ꟊ', &['Ꟊ']), + ('Ꟑ', &['ꟑ']), + ('ꟑ', &['Ꟑ']), + ('Ꟗ', &['ꟗ']), + ('ꟗ', &['Ꟗ']), + ('Ꟙ', &['ꟙ']), + ('ꟙ', &['Ꟙ']), + ('Ꟶ', &['ꟶ']), + ('ꟶ', &['Ꟶ']), + ('ꭓ', &['Ꭓ']), + ('ꭰ', &['Ꭰ']), + ('ꭱ', &['Ꭱ']), + ('ꭲ', &['Ꭲ']), + ('ꭳ', &['Ꭳ']), + ('ꭴ', &['Ꭴ']), + ('ꭵ', &['Ꭵ']), + ('ꭶ', &['Ꭶ']), + ('ꭷ', &['Ꭷ']), + ('ꭸ', &['Ꭸ']), + ('ꭹ', &['Ꭹ']), + ('ꭺ', &['Ꭺ']), + ('ꭻ', &['Ꭻ']), + ('ꭼ', &['Ꭼ']), + ('ꭽ', &['Ꭽ']), + ('ꭾ', &['Ꭾ']), + ('ꭿ', &['Ꭿ']), + ('ꮀ', &['Ꮀ']), + ('ꮁ', &['Ꮁ']), + ('ꮂ', &['Ꮂ']), + ('ꮃ', &['Ꮃ']), + ('ꮄ', &['Ꮄ']), + ('ꮅ', &['Ꮅ']), + ('ꮆ', &['Ꮆ']), + ('ꮇ', &['Ꮇ']), + ('ꮈ', &['Ꮈ']), + ('ꮉ', &['Ꮉ']), + ('ꮊ', &['Ꮊ']), + ('ꮋ', &['Ꮋ']), + ('ꮌ', &['Ꮌ']), + ('ꮍ', &['Ꮍ']), + ('ꮎ', &['Ꮎ']), + ('ꮏ', &['Ꮏ']), + ('ꮐ', &['Ꮐ']), + ('ꮑ', &['Ꮑ']), + ('ꮒ', &['Ꮒ']), + ('ꮓ', &['Ꮓ']), + ('ꮔ', &['Ꮔ']), + ('ꮕ', &['Ꮕ']), + ('ꮖ', &['Ꮖ']), + ('ꮗ', &['Ꮗ']), + ('ꮘ', &['Ꮘ']), + ('ꮙ', &['Ꮙ']), + ('ꮚ', &['Ꮚ']), + ('ꮛ', &['Ꮛ']), + ('ꮜ', &['Ꮜ']), + ('ꮝ', &['Ꮝ']), + ('ꮞ', &['Ꮞ']), + ('ꮟ', &['Ꮟ']), + ('ꮠ', &['Ꮠ']), + ('ꮡ', &['Ꮡ']), + ('ꮢ', &['Ꮢ']), + ('ꮣ', &['Ꮣ']), + ('ꮤ', &['Ꮤ']), + ('ꮥ', &['Ꮥ']), + ('ꮦ', &['Ꮦ']), + ('ꮧ', &['Ꮧ']), + ('ꮨ', &['Ꮨ']), + ('ꮩ', &['Ꮩ']), + ('ꮪ', &['Ꮪ']), + ('ꮫ', &['Ꮫ']), + ('ꮬ', &['Ꮬ']), + ('ꮭ', &['Ꮭ']), + ('ꮮ', &['Ꮮ']), + ('ꮯ', &['Ꮯ']), + ('ꮰ', &['Ꮰ']), + ('ꮱ', &['Ꮱ']), + ('ꮲ', &['Ꮲ']), + ('ꮳ', &['Ꮳ']), + ('ꮴ', &['Ꮴ']), + ('ꮵ', &['Ꮵ']), + ('ꮶ', &['Ꮶ']), + ('ꮷ', &['Ꮷ']), + ('ꮸ', &['Ꮸ']), + ('ꮹ', &['Ꮹ']), + ('ꮺ', &['Ꮺ']), + ('ꮻ', &['Ꮻ']), + ('ꮼ', &['Ꮼ']), + ('ꮽ', &['Ꮽ']), + ('ꮾ', &['Ꮾ']), + ('ꮿ', &['Ꮿ']), + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('𐐀', &['𐐨']), + ('𐐁', &['𐐩']), + ('𐐂', &['𐐪']), + ('𐐃', &['𐐫']), + ('𐐄', &['𐐬']), + ('𐐅', &['𐐭']), + ('𐐆', &['𐐮']), + ('𐐇', &['𐐯']), + ('𐐈', &['𐐰']), + ('𐐉', &['𐐱']), + ('𐐊', &['𐐲']), + ('𐐋', &['𐐳']), + ('𐐌', &['𐐴']), + ('𐐍', &['𐐵']), + ('𐐎', &['𐐶']), + ('𐐏', &['𐐷']), + ('𐐐', &['𐐸']), + ('𐐑', &['𐐹']), + ('𐐒', &['𐐺']), + ('𐐓', &['𐐻']), + ('𐐔', &['𐐼']), + ('𐐕', &['𐐽']), + ('𐐖', &['𐐾']), + ('𐐗', &['𐐿']), + ('𐐘', &['𐑀']), + ('𐐙', &['𐑁']), + ('𐐚', &['𐑂']), + ('𐐛', &['𐑃']), + ('𐐜', &['𐑄']), + ('𐐝', &['𐑅']), + ('𐐞', &['𐑆']), + ('𐐟', &['𐑇']), + ('𐐠', &['𐑈']), + ('𐐡', &['𐑉']), + ('𐐢', &['𐑊']), + ('𐐣', &['𐑋']), + ('𐐤', &['𐑌']), + ('𐐥', &['𐑍']), + ('𐐦', &['𐑎']), + ('𐐧', &['𐑏']), + ('𐐨', &['𐐀']), + ('𐐩', &['𐐁']), + ('𐐪', &['𐐂']), + ('𐐫', &['𐐃']), + ('𐐬', &['𐐄']), + ('𐐭', &['𐐅']), + ('𐐮', &['𐐆']), + ('𐐯', &['𐐇']), + ('𐐰', &['𐐈']), + ('𐐱', &['𐐉']), + ('𐐲', &['𐐊']), + ('𐐳', &['𐐋']), + ('𐐴', &['𐐌']), + ('𐐵', &['𐐍']), + ('𐐶', &['𐐎']), + ('𐐷', &['𐐏']), + ('𐐸', &['𐐐']), + ('𐐹', &['𐐑']), + ('𐐺', &['𐐒']), + ('𐐻', &['𐐓']), + ('𐐼', &['𐐔']), + ('𐐽', &['𐐕']), + ('𐐾', &['𐐖']), + ('𐐿', &['𐐗']), + ('𐑀', &['𐐘']), + ('𐑁', &['𐐙']), + ('𐑂', &['𐐚']), + ('𐑃', &['𐐛']), + ('𐑄', &['𐐜']), + ('𐑅', &['𐐝']), + ('𐑆', &['𐐞']), + ('𐑇', &['𐐟']), + ('𐑈', &['𐐠']), + ('𐑉', &['𐐡']), + ('𐑊', &['𐐢']), + ('𐑋', &['𐐣']), + ('𐑌', &['𐐤']), + ('𐑍', &['𐐥']), + ('𐑎', &['𐐦']), + ('𐑏', &['𐐧']), + ('𐒰', &['𐓘']), + ('𐒱', &['𐓙']), + ('𐒲', &['𐓚']), + ('𐒳', &['𐓛']), + ('𐒴', &['𐓜']), + ('𐒵', &['𐓝']), + ('𐒶', &['𐓞']), + ('𐒷', &['𐓟']), + ('𐒸', &['𐓠']), + ('𐒹', &['𐓡']), + ('𐒺', &['𐓢']), + ('𐒻', &['𐓣']), + ('𐒼', &['𐓤']), + ('𐒽', &['𐓥']), + ('𐒾', &['𐓦']), + ('𐒿', &['𐓧']), + ('𐓀', &['𐓨']), + ('𐓁', &['𐓩']), + ('𐓂', &['𐓪']), + ('𐓃', &['𐓫']), + ('𐓄', &['𐓬']), + ('𐓅', &['𐓭']), + ('𐓆', &['𐓮']), + ('𐓇', &['𐓯']), + ('𐓈', &['𐓰']), + ('𐓉', &['𐓱']), + ('𐓊', &['𐓲']), + ('𐓋', &['𐓳']), + ('𐓌', &['𐓴']), + ('𐓍', &['𐓵']), + ('𐓎', &['𐓶']), + ('𐓏', &['𐓷']), + ('𐓐', &['𐓸']), + ('𐓑', &['𐓹']), + ('𐓒', &['𐓺']), + ('𐓓', &['𐓻']), + ('𐓘', &['𐒰']), + ('𐓙', &['𐒱']), + ('𐓚', &['𐒲']), + ('𐓛', &['𐒳']), + ('𐓜', &['𐒴']), + ('𐓝', &['𐒵']), + ('𐓞', &['𐒶']), + ('𐓟', &['𐒷']), + ('𐓠', &['𐒸']), + ('𐓡', &['𐒹']), + ('𐓢', &['𐒺']), + ('𐓣', &['𐒻']), + ('𐓤', &['𐒼']), + ('𐓥', &['𐒽']), + ('𐓦', &['𐒾']), + ('𐓧', &['𐒿']), + ('𐓨', &['𐓀']), + ('𐓩', &['𐓁']), + ('𐓪', &['𐓂']), + ('𐓫', &['𐓃']), + ('𐓬', &['𐓄']), + ('𐓭', &['𐓅']), + ('𐓮', &['𐓆']), + ('𐓯', &['𐓇']), + ('𐓰', &['𐓈']), + ('𐓱', &['𐓉']), + ('𐓲', &['𐓊']), + ('𐓳', &['𐓋']), + ('𐓴', &['𐓌']), + ('𐓵', &['𐓍']), + ('𐓶', &['𐓎']), + ('𐓷', &['𐓏']), + ('𐓸', &['𐓐']), + ('𐓹', &['𐓑']), + ('𐓺', &['𐓒']), + ('𐓻', &['𐓓']), + ('𐕰', &['𐖗']), + ('𐕱', &['𐖘']), + ('𐕲', &['𐖙']), + ('𐕳', &['𐖚']), + ('𐕴', &['𐖛']), + ('𐕵', &['𐖜']), + ('𐕶', &['𐖝']), + ('𐕷', &['𐖞']), + ('𐕸', &['𐖟']), + ('𐕹', &['𐖠']), + ('𐕺', &['𐖡']), + ('𐕼', &['𐖣']), + ('𐕽', &['𐖤']), + ('𐕾', &['𐖥']), + ('𐕿', &['𐖦']), + ('𐖀', &['𐖧']), + ('𐖁', &['𐖨']), + ('𐖂', &['𐖩']), + ('𐖃', &['𐖪']), + ('𐖄', &['𐖫']), + ('𐖅', &['𐖬']), + ('𐖆', &['𐖭']), + ('𐖇', &['𐖮']), + ('𐖈', &['𐖯']), + ('𐖉', &['𐖰']), + ('𐖊', &['𐖱']), + ('𐖌', &['𐖳']), + ('𐖍', &['𐖴']), + ('𐖎', &['𐖵']), + ('𐖏', &['𐖶']), + ('𐖐', &['𐖷']), + ('𐖑', &['𐖸']), + ('𐖒', &['𐖹']), + ('𐖔', &['𐖻']), + ('𐖕', &['𐖼']), + ('𐖗', &['𐕰']), + ('𐖘', &['𐕱']), + ('𐖙', &['𐕲']), + ('𐖚', &['𐕳']), + ('𐖛', &['𐕴']), + ('𐖜', &['𐕵']), + ('𐖝', &['𐕶']), + ('𐖞', &['𐕷']), + ('𐖟', &['𐕸']), + ('𐖠', &['𐕹']), + ('𐖡', &['𐕺']), + ('𐖣', &['𐕼']), + ('𐖤', &['𐕽']), + ('𐖥', &['𐕾']), + ('𐖦', &['𐕿']), + ('𐖧', &['𐖀']), + ('𐖨', &['𐖁']), + ('𐖩', &['𐖂']), + ('𐖪', &['𐖃']), + ('𐖫', &['𐖄']), + ('𐖬', &['𐖅']), + ('𐖭', &['𐖆']), + ('𐖮', &['𐖇']), + ('𐖯', &['𐖈']), + ('𐖰', &['𐖉']), + ('𐖱', &['𐖊']), + ('𐖳', &['𐖌']), + ('𐖴', &['𐖍']), + ('𐖵', &['𐖎']), + ('𐖶', &['𐖏']), + ('𐖷', &['𐖐']), + ('𐖸', &['𐖑']), + ('𐖹', &['𐖒']), + ('𐖻', &['𐖔']), + ('𐖼', &['𐖕']), + ('𐲀', &['𐳀']), + ('𐲁', &['𐳁']), + ('𐲂', &['𐳂']), + ('𐲃', &['𐳃']), + ('𐲄', &['𐳄']), + ('𐲅', &['𐳅']), + ('𐲆', &['𐳆']), + ('𐲇', &['𐳇']), + ('𐲈', &['𐳈']), + ('𐲉', &['𐳉']), + ('𐲊', &['𐳊']), + ('𐲋', &['𐳋']), + ('𐲌', &['𐳌']), + ('𐲍', &['𐳍']), + ('𐲎', &['𐳎']), + ('𐲏', &['𐳏']), + ('𐲐', &['𐳐']), + ('𐲑', &['𐳑']), + ('𐲒', &['𐳒']), + ('𐲓', &['𐳓']), + ('𐲔', &['𐳔']), + ('𐲕', &['𐳕']), + ('𐲖', &['𐳖']), + ('𐲗', &['𐳗']), + ('𐲘', &['𐳘']), + ('𐲙', &['𐳙']), + ('𐲚', &['𐳚']), + ('𐲛', &['𐳛']), + ('𐲜', &['𐳜']), + ('𐲝', &['𐳝']), + ('𐲞', &['𐳞']), + ('𐲟', &['𐳟']), + ('𐲠', &['𐳠']), + ('𐲡', &['𐳡']), + ('𐲢', &['𐳢']), + ('𐲣', &['𐳣']), + ('𐲤', &['𐳤']), + ('𐲥', &['𐳥']), + ('𐲦', &['𐳦']), + ('𐲧', &['𐳧']), + ('𐲨', &['𐳨']), + ('𐲩', &['𐳩']), + ('𐲪', &['𐳪']), + ('𐲫', &['𐳫']), + ('𐲬', &['𐳬']), + ('𐲭', &['𐳭']), + ('𐲮', &['𐳮']), + ('𐲯', &['𐳯']), + ('𐲰', &['𐳰']), + ('𐲱', &['𐳱']), + ('𐲲', &['𐳲']), + ('𐳀', &['𐲀']), + ('𐳁', &['𐲁']), + ('𐳂', &['𐲂']), + ('𐳃', &['𐲃']), + ('𐳄', &['𐲄']), + ('𐳅', &['𐲅']), + ('𐳆', &['𐲆']), + ('𐳇', &['𐲇']), + ('𐳈', &['𐲈']), + ('𐳉', &['𐲉']), + ('𐳊', &['𐲊']), + ('𐳋', &['𐲋']), + ('𐳌', &['𐲌']), + ('𐳍', &['𐲍']), + ('𐳎', &['𐲎']), + ('𐳏', &['𐲏']), + ('𐳐', &['𐲐']), + ('𐳑', &['𐲑']), + ('𐳒', &['𐲒']), + ('𐳓', &['𐲓']), + ('𐳔', &['𐲔']), + ('𐳕', &['𐲕']), + ('𐳖', &['𐲖']), + ('𐳗', &['𐲗']), + ('𐳘', &['𐲘']), + ('𐳙', &['𐲙']), + ('𐳚', &['𐲚']), + ('𐳛', &['𐲛']), + ('𐳜', &['𐲜']), + ('𐳝', &['𐲝']), + ('𐳞', &['𐲞']), + ('𐳟', &['𐲟']), + ('𐳠', &['𐲠']), + ('𐳡', &['𐲡']), + ('𐳢', &['𐲢']), + ('𐳣', &['𐲣']), + ('𐳤', &['𐲤']), + ('𐳥', &['𐲥']), + ('𐳦', &['𐲦']), + ('𐳧', &['𐲧']), + ('𐳨', &['𐲨']), + ('𐳩', &['𐲩']), + ('𐳪', &['𐲪']), + ('𐳫', &['𐲫']), + ('𐳬', &['𐲬']), + ('𐳭', &['𐲭']), + ('𐳮', &['𐲮']), + ('𐳯', &['𐲯']), + ('𐳰', &['𐲰']), + ('𐳱', &['𐲱']), + ('𐳲', &['𐲲']), + ('𑢠', &['𑣀']), + ('𑢡', &['𑣁']), + ('𑢢', &['𑣂']), + ('𑢣', &['𑣃']), + ('𑢤', &['𑣄']), + ('𑢥', &['𑣅']), + ('𑢦', &['𑣆']), + ('𑢧', &['𑣇']), + ('𑢨', &['𑣈']), + ('𑢩', &['𑣉']), + ('𑢪', &['𑣊']), + ('𑢫', &['𑣋']), + ('𑢬', &['𑣌']), + ('𑢭', &['𑣍']), + ('𑢮', &['𑣎']), + ('𑢯', &['𑣏']), + ('𑢰', &['𑣐']), + ('𑢱', &['𑣑']), + ('𑢲', &['𑣒']), + ('𑢳', &['𑣓']), + ('𑢴', &['𑣔']), + ('𑢵', &['𑣕']), + ('𑢶', &['𑣖']), + ('𑢷', &['𑣗']), + ('𑢸', &['𑣘']), + ('𑢹', &['𑣙']), + ('𑢺', &['𑣚']), + ('𑢻', &['𑣛']), + ('𑢼', &['𑣜']), + ('𑢽', &['𑣝']), + ('𑢾', &['𑣞']), + ('𑢿', &['𑣟']), + ('𑣀', &['𑢠']), + ('𑣁', &['𑢡']), + ('𑣂', &['𑢢']), + ('𑣃', &['𑢣']), + ('𑣄', &['𑢤']), + ('𑣅', &['𑢥']), + ('𑣆', &['𑢦']), + ('𑣇', &['𑢧']), + ('𑣈', &['𑢨']), + ('𑣉', &['𑢩']), + ('𑣊', &['𑢪']), + ('𑣋', &['𑢫']), + ('𑣌', &['𑢬']), + ('𑣍', &['𑢭']), + ('𑣎', &['𑢮']), + ('𑣏', &['𑢯']), + ('𑣐', &['𑢰']), + ('𑣑', &['𑢱']), + ('𑣒', &['𑢲']), + ('𑣓', &['𑢳']), + ('𑣔', &['𑢴']), + ('𑣕', &['𑢵']), + ('𑣖', &['𑢶']), + ('𑣗', &['𑢷']), + ('𑣘', &['𑢸']), + ('𑣙', &['𑢹']), + ('𑣚', &['𑢺']), + ('𑣛', &['𑢻']), + ('𑣜', &['𑢼']), + ('𑣝', &['𑢽']), + ('𑣞', &['𑢾']), + ('𑣟', &['𑢿']), + ('𖹀', &['𖹠']), + ('𖹁', &['𖹡']), + ('𖹂', &['𖹢']), + ('𖹃', &['𖹣']), + ('𖹄', &['𖹤']), + ('𖹅', &['𖹥']), + ('𖹆', &['𖹦']), + ('𖹇', &['𖹧']), + ('𖹈', &['𖹨']), + ('𖹉', &['𖹩']), + ('𖹊', &['𖹪']), + ('𖹋', &['𖹫']), + ('𖹌', &['𖹬']), + ('𖹍', &['𖹭']), + ('𖹎', &['𖹮']), + ('𖹏', &['𖹯']), + ('𖹐', &['𖹰']), + ('𖹑', &['𖹱']), + ('𖹒', &['𖹲']), + ('𖹓', &['𖹳']), + ('𖹔', &['𖹴']), + ('𖹕', &['𖹵']), + ('𖹖', &['𖹶']), + ('𖹗', &['𖹷']), + ('𖹘', &['𖹸']), + ('𖹙', &['𖹹']), + ('𖹚', &['𖹺']), + ('𖹛', &['𖹻']), + ('𖹜', &['𖹼']), + ('𖹝', &['𖹽']), + ('𖹞', &['𖹾']), + ('𖹟', &['𖹿']), + ('𖹠', &['𖹀']), + ('𖹡', &['𖹁']), + ('𖹢', &['𖹂']), + ('𖹣', &['𖹃']), + ('𖹤', &['𖹄']), + ('𖹥', &['𖹅']), + ('𖹦', &['𖹆']), + ('𖹧', &['𖹇']), + ('𖹨', &['𖹈']), + ('𖹩', &['𖹉']), + ('𖹪', &['𖹊']), + ('𖹫', &['𖹋']), + ('𖹬', &['𖹌']), + ('𖹭', &['𖹍']), + ('𖹮', &['𖹎']), + ('𖹯', &['𖹏']), + ('𖹰', &['𖹐']), + ('𖹱', &['𖹑']), + ('𖹲', &['𖹒']), + ('𖹳', &['𖹓']), + ('𖹴', &['𖹔']), + ('𖹵', &['𖹕']), + ('𖹶', &['𖹖']), + ('𖹷', &['𖹗']), + ('𖹸', &['𖹘']), + ('𖹹', &['𖹙']), + ('𖹺', &['𖹚']), + ('𖹻', &['𖹛']), + ('𖹼', &['𖹜']), + ('𖹽', &['𖹝']), + ('𖹾', &['𖹞']), + ('𖹿', &['𖹟']), + ('𞤀', &['𞤢']), + ('𞤁', &['𞤣']), + ('𞤂', &['𞤤']), + ('𞤃', &['𞤥']), + ('𞤄', &['𞤦']), + ('𞤅', &['𞤧']), + ('𞤆', &['𞤨']), + ('𞤇', &['𞤩']), + ('𞤈', &['𞤪']), + ('𞤉', &['𞤫']), + ('𞤊', &['𞤬']), + ('𞤋', &['𞤭']), + ('𞤌', &['𞤮']), + ('𞤍', &['𞤯']), + ('𞤎', &['𞤰']), + ('𞤏', &['𞤱']), + ('𞤐', &['𞤲']), + ('𞤑', &['𞤳']), + ('𞤒', &['𞤴']), + ('𞤓', &['𞤵']), + ('𞤔', &['𞤶']), + ('𞤕', &['𞤷']), + ('𞤖', &['𞤸']), + ('𞤗', &['𞤹']), + ('𞤘', &['𞤺']), + ('𞤙', &['𞤻']), + ('𞤚', &['𞤼']), + ('𞤛', &['𞤽']), + ('𞤜', &['𞤾']), + ('𞤝', &['𞤿']), + ('𞤞', &['𞥀']), + ('𞤟', &['𞥁']), + ('𞤠', &['𞥂']), + ('𞤡', &['𞥃']), + ('𞤢', &['𞤀']), + ('𞤣', &['𞤁']), + ('𞤤', &['𞤂']), + ('𞤥', &['𞤃']), + ('𞤦', &['𞤄']), + ('𞤧', &['𞤅']), + ('𞤨', &['𞤆']), + ('𞤩', &['𞤇']), + ('𞤪', &['𞤈']), + ('𞤫', &['𞤉']), + ('𞤬', &['𞤊']), + ('𞤭', &['𞤋']), + ('𞤮', &['𞤌']), + ('𞤯', &['𞤍']), + ('𞤰', &['𞤎']), + ('𞤱', &['𞤏']), + ('𞤲', &['𞤐']), + ('𞤳', &['𞤑']), + ('𞤴', &['𞤒']), + ('𞤵', &['𞤓']), + ('𞤶', &['𞤔']), + ('𞤷', &['𞤕']), + ('𞤸', &['𞤖']), + ('𞤹', &['𞤗']), + ('𞤺', &['𞤘']), + ('𞤻', &['𞤙']), + ('𞤼', &['𞤚']), + ('𞤽', &['𞤛']), + ('𞤾', &['𞤜']), + ('𞤿', &['𞤝']), + ('𞥀', &['𞤞']), + ('𞥁', &['𞤟']), + ('𞥂', &['𞤠']), + ('𞥃', &['𞤡']), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/general_category.rs b/vendor/regex-syntax/src/unicode_tables/general_category.rs new file mode 100644 index 0000000..8fc9289 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/general_category.rs @@ -0,0 +1,6552 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category ucd-15.0.0 --chars --exclude surrogate +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Cased_Letter", CASED_LETTER), + ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), + ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), + ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), + ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), + ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), + ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), + ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), + ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), + ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), + ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), + ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), + ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), + ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), + ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), + ("Spacing_Mark", SPACING_MARK), + ("Symbol", SYMBOL), + ("Titlecase_Letter", TITLECASE_LETTER), + ("Unassigned", UNASSIGNED), + ("Uppercase_Letter", UPPERCASE_LETTER), +]; + +pub const CASED_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('µ', 'µ'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ƺ'), + ('Ƽ', 'ƿ'), + ('DŽ', 'ʓ'), + ('ʕ', 'ʯ'), + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՠ', 'ֈ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᴀ', 'ᴫ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶚ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ↄ', 'ↄ'), + ('Ⰰ', 'ⱻ'), + ('Ȿ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚛ'), + ('Ꜣ', 'ꝯ'), + ('ꝱ', 'ꞇ'), + ('Ꞌ', 'ꞎ'), + ('Ꞑ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('Ꟶ', 'ꟶ'), + ('ꟺ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭠ', 'ꭨ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('a', 'z'), + ('𐐀', '𐑏'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𑢠', '𑣟'), + ('𖹀', '𖹿'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞤀', '𞥃'), +]; + +pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[ + (')', ')'), + (']', ']'), + ('}', '}'), + ('༻', '༻'), + ('༽', '༽'), + ('᚜', '᚜'), + ('⁆', '⁆'), + ('⁾', '⁾'), + ('₎', '₎'), + ('⌉', '⌉'), + ('⌋', '⌋'), + ('〉', '〉'), + ('❩', '❩'), + ('❫', '❫'), + ('❭', '❭'), + ('❯', '❯'), + ('❱', '❱'), + ('❳', '❳'), + ('❵', '❵'), + ('⟆', '⟆'), + ('⟧', '⟧'), + ('⟩', '⟩'), + ('⟫', '⟫'), + ('⟭', '⟭'), + ('⟯', '⟯'), + ('⦄', '⦄'), + ('⦆', '⦆'), + ('⦈', '⦈'), + ('⦊', '⦊'), + ('⦌', '⦌'), + ('⦎', '⦎'), + ('⦐', '⦐'), + ('⦒', '⦒'), + ('⦔', '⦔'), + ('⦖', '⦖'), + ('⦘', '⦘'), + ('⧙', '⧙'), + ('⧛', '⧛'), + ('⧽', '⧽'), + ('⸣', '⸣'), + ('⸥', '⸥'), + ('⸧', '⸧'), + ('⸩', '⸩'), + ('⹖', '⹖'), + ('⹘', '⹘'), + ('⹚', '⹚'), + ('⹜', '⹜'), + ('〉', '〉'), + ('》', '》'), + ('」', '」'), + ('』', '』'), + ('】', '】'), + ('〕', '〕'), + ('〗', '〗'), + ('〙', '〙'), + ('〛', '〛'), + ('〞', '〟'), + ('﴾', '﴾'), + ('︘', '︘'), + ('︶', '︶'), + ('︸', '︸'), + ('︺', '︺'), + ('︼', '︼'), + ('︾', '︾'), + ('﹀', '﹀'), + ('﹂', '﹂'), + ('﹄', '﹄'), + ('﹈', '﹈'), + ('﹚', '﹚'), + ('﹜', '﹜'), + ('﹞', '﹞'), + (')', ')'), + (']', ']'), + ('}', '}'), + ('⦆', '⦆'), + ('」', '」'), +]; + +pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[ + ('_', '_'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('︳', '︴'), + ('﹍', '﹏'), + ('_', '_'), +]; + +pub const CONTROL: &'static [(char, char)] = + &[('\0', '\u{1f}'), ('\u{7f}', '\u{9f}')]; + +pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), + ('¢', '¥'), + ('֏', '֏'), + ('؋', '؋'), + ('߾', '߿'), + ('৲', '৳'), + ('৻', '৻'), + ('૱', '૱'), + ('௹', '௹'), + ('฿', '฿'), + ('៛', '៛'), + ('₠', '⃀'), + ('꠸', '꠸'), + ('﷼', '﷼'), + ('﹩', '﹩'), + ('$', '$'), + ('¢', '£'), + ('¥', '₩'), + ('𑿝', '𑿠'), + ('𞋿', '𞋿'), + ('𞲰', '𞲰'), +]; + +pub const DASH_PUNCTUATION: &'static [(char, char)] = &[ + ('-', '-'), + ('֊', '֊'), + ('־', '־'), + ('᐀', '᐀'), + ('᠆', '᠆'), + ('‐', '―'), + ('⸗', '⸗'), + ('⸚', '⸚'), + ('⸺', '⸻'), + ('⹀', '⹀'), + ('⹝', '⹝'), + ('〜', '〜'), + ('〰', '〰'), + ('゠', '゠'), + ('︱', '︲'), + ('﹘', '﹘'), + ('﹣', '﹣'), + ('-', '-'), + ('𐺭', '𐺭'), +]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; + +pub const ENCLOSING_MARK: &'static [(char, char)] = &[ + ('\u{488}', '\u{489}'), + ('\u{1abe}', '\u{1abe}'), + ('\u{20dd}', '\u{20e0}'), + ('\u{20e2}', '\u{20e4}'), + ('\u{a670}', '\u{a672}'), +]; + +pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('»', '»'), + ('’', '’'), + ('”', '”'), + ('›', '›'), + ('⸃', '⸃'), + ('⸅', '⸅'), + ('⸊', '⸊'), + ('⸍', '⸍'), + ('⸝', '⸝'), + ('⸡', '⸡'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[ + ('«', '«'), + ('‘', '‘'), + ('‛', '“'), + ('‟', '‟'), + ('‹', '‹'), + ('⸂', '⸂'), + ('⸄', '⸄'), + ('⸉', '⸉'), + ('⸌', '⸌'), + ('⸜', '⸜'), + ('⸠', '⸠'), +]; + +pub const LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛱ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ↄ', 'ↄ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ⸯ', 'ⸯ'), + ('々', '〆'), + ('〱', '〵'), + ('〻', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛥ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍀'), + ('𐍂', '𐍉'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const LETTER_NUMBER: &'static [(char, char)] = &[ + ('ᛮ', 'ᛰ'), + ('Ⅰ', 'ↂ'), + ('ↅ', 'ↈ'), + ('〇', '〇'), + ('〡', '〩'), + ('〸', '〺'), + ('ꛦ', 'ꛯ'), + ('𐅀', '𐅴'), + ('𐍁', '𐍁'), + ('𐍊', '𐍊'), + ('𐏑', '𐏕'), + ('𒐀', '𒑮'), +]; + +pub const LINE_SEPARATOR: &'static [(char, char)] = + &[('\u{2028}', '\u{2028}')]; + +pub const LOWERCASE_LETTER: &'static [(char, char)] = &[ + ('a', 'z'), + ('µ', 'µ'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ĸ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), + ('ƽ', 'ƿ'), + ('dž', 'dž'), + ('lj', 'lj'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȹ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ʓ'), + ('ʕ', 'ʯ'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϼ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ՠ', 'ֈ'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᴀ', 'ᴫ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶚ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ℊ', 'ℊ'), + ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), + ('ℴ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('ↄ', 'ↄ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), + ('ⱳ', 'ⱴ'), + ('ⱶ', 'ⱻ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝯ'), + ('ꝱ', 'ꝸ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞯ', 'ꞯ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟕ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟶ', 'ꟶ'), + ('ꟺ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭠ', 'ꭨ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𝐚', '𝐳'), + ('𝑎', '𝑔'), + ('𝑖', '𝑧'), + ('𝒂', '𝒛'), + ('𝒶', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝓏'), + ('𝓪', '𝔃'), + ('𝔞', '𝔷'), + ('𝕒', '𝕫'), + ('𝖆', '𝖟'), + ('𝖺', '𝗓'), + ('𝗮', '𝘇'), + ('𝘢', '𝘻'), + ('𝙖', '𝙯'), + ('𝚊', '𝚥'), + ('𝛂', '𝛚'), + ('𝛜', '𝛡'), + ('𝛼', '𝜔'), + ('𝜖', '𝜛'), + ('𝜶', '𝝎'), + ('𝝐', '𝝕'), + ('𝝰', '𝞈'), + ('𝞊', '𝞏'), + ('𝞪', '𝟂'), + ('𝟄', '𝟉'), + ('𝟋', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞤢', '𞥃'), +]; + +pub const MARK: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', 'ः'), + ('\u{93a}', '\u{93c}'), + ('ा', 'ॏ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('\u{abc}', '\u{abc}'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('\u{cbc}', '\u{cbc}'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('ါ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '᜕'), + ('\u{1732}', '᜴'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b34}', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1bad}'), + ('\u{1be6}', '᯳'), + ('ᰤ', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꢀ', 'ꢁ'), + ('ꢴ', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '꥓'), + ('\u{a980}', 'ꦃ'), + ('\u{a9b3}', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', '\u{aaf6}'), + ('ꯣ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '𑂂'), + ('𑂰', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('𑅅', '𑅆'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑇀'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐵', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('𑠬', '\u{1183a}'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193e}'), + ('𑥀', '𑥀'), + ('𑥂', '\u{11943}'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '\u{11d97}'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const MATH_SYMBOL: &'static [(char, char)] = &[ + ('+', '+'), + ('<', '>'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('±', '±'), + ('×', '×'), + ('÷', '÷'), + ('϶', '϶'), + ('؆', '؈'), + ('⁄', '⁄'), + ('⁒', '⁒'), + ('⁺', '⁼'), + ('₊', '₌'), + ('℘', '℘'), + ('⅀', '⅄'), + ('⅋', '⅋'), + ('←', '↔'), + ('↚', '↛'), + ('↠', '↠'), + ('↣', '↣'), + ('↦', '↦'), + ('↮', '↮'), + ('⇎', '⇏'), + ('⇒', '⇒'), + ('⇔', '⇔'), + ('⇴', '⋿'), + ('⌠', '⌡'), + ('⍼', '⍼'), + ('⎛', '⎳'), + ('⏜', '⏡'), + ('▷', '▷'), + ('◁', '◁'), + ('◸', '◿'), + ('♯', '♯'), + ('⟀', '⟄'), + ('⟇', '⟥'), + ('⟰', '⟿'), + ('⤀', '⦂'), + ('⦙', '⧗'), + ('⧜', '⧻'), + ('⧾', '⫿'), + ('⬰', '⭄'), + ('⭇', '⭌'), + ('﬩', '﬩'), + ('﹢', '﹢'), + ('﹤', '﹦'), + ('+', '+'), + ('<', '>'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('←', '↓'), + ('𝛁', '𝛁'), + ('𝛛', '𝛛'), + ('𝛻', '𝛻'), + ('𝜕', '𝜕'), + ('𝜵', '𝜵'), + ('𝝏', '𝝏'), + ('𝝯', '𝝯'), + ('𝞉', '𝞉'), + ('𝞩', '𝞩'), + ('𝟃', '𝟃'), + ('𞻰', '𞻱'), +]; + +pub const MODIFIER_LETTER: &'static [(char, char)] = &[ + ('ʰ', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('ʹ', 'ʹ'), + ('ͺ', 'ͺ'), + ('ՙ', 'ՙ'), + ('ـ', 'ـ'), + ('ۥ', 'ۦ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࣉ', 'ࣉ'), + ('ॱ', 'ॱ'), + ('ๆ', 'ๆ'), + ('ໆ', 'ໆ'), + ('ჼ', 'ჼ'), + ('ៗ', 'ៗ'), + ('ᡃ', 'ᡃ'), + ('ᪧ', 'ᪧ'), + ('ᱸ', 'ᱽ'), + ('ᴬ', 'ᵪ'), + ('ᵸ', 'ᵸ'), + ('ᶛ', 'ᶿ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ⱼ', 'ⱽ'), + ('ⵯ', 'ⵯ'), + ('ⸯ', 'ⸯ'), + ('々', '々'), + ('〱', '〵'), + ('〻', '〻'), + ('ゝ', 'ゞ'), + ('ー', 'ヾ'), + ('ꀕ', 'ꀕ'), + ('ꓸ', 'ꓽ'), + ('ꘌ', 'ꘌ'), + ('ꙿ', 'ꙿ'), + ('ꚜ', 'ꚝ'), + ('ꜗ', 'ꜟ'), + ('ꝰ', 'ꝰ'), + ('ꞈ', 'ꞈ'), + ('ꟲ', 'ꟴ'), + ('ꟸ', 'ꟹ'), + ('ꧏ', 'ꧏ'), + ('ꧦ', 'ꧦ'), + ('ꩰ', 'ꩰ'), + ('ꫝ', 'ꫝ'), + ('ꫳ', 'ꫴ'), + ('ꭜ', 'ꭟ'), + ('ꭩ', 'ꭩ'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𖭀', '𖭃'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𞀰', '𞁭'), + ('𞄷', '𞄽'), + ('𞓫', '𞓫'), + ('𞥋', '𞥋'), +]; + +pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[ + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('¯', '¯'), + ('´', '´'), + ('¸', '¸'), + ('˂', '˅'), + ('˒', '˟'), + ('˥', '˫'), + ('˭', '˭'), + ('˯', '˿'), + ('͵', '͵'), + ('΄', '΅'), + ('࢈', '࢈'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('゛', '゜'), + ('꜀', '꜖'), + ('꜠', '꜡'), + ('꞉', '꞊'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('﮲', '﯂'), + ('^', '^'), + ('`', '`'), + (' ̄', ' ̄'), + ('🏻', '🏿'), +]; + +pub const NONSPACING_MARK: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{487}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3f}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b56}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b34}'), + ('\u{1b36}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302d}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{11340}', '\u{11340}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('²', '³'), + ('¹', '¹'), + ('¼', '¾'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('৴', '৹'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('୲', '୷'), + ('௦', '௲'), + ('౦', '౯'), + ('౸', '౾'), + ('೦', '೯'), + ('൘', '൞'), + ('൦', '൸'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༳'), + ('၀', '၉'), + ('႐', '႙'), + ('፩', '፼'), + ('ᛮ', 'ᛰ'), + ('០', '៩'), + ('៰', '៹'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧚'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('⁰', '⁰'), + ('⁴', '⁹'), + ('₀', '₉'), + ('⅐', 'ↂ'), + ('ↅ', '↉'), + ('①', '⒛'), + ('⓪', '⓿'), + ('❶', '➓'), + ('⳽', '⳽'), + ('〇', '〇'), + ('〡', '〩'), + ('〸', '〺'), + ('㆒', '㆕'), + ('㈠', '㈩'), + ('㉈', '㉏'), + ('㉑', '㉟'), + ('㊀', '㊉'), + ('㊱', '㊿'), + ('꘠', '꘩'), + ('ꛦ', 'ꛯ'), + ('꠰', '꠵'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐄇', '𐄳'), + ('𐅀', '𐅸'), + ('𐆊', '𐆋'), + ('𐋡', '𐋻'), + ('𐌠', '𐌣'), + ('𐍁', '𐍁'), + ('𐍊', '𐍊'), + ('𐏑', '𐏕'), + ('𐒠', '𐒩'), + ('𐡘', '𐡟'), + ('𐡹', '𐡿'), + ('𐢧', '𐢯'), + ('𐣻', '𐣿'), + ('𐤖', '𐤛'), + ('𐦼', '𐦽'), + ('𐧀', '𐧏'), + ('𐧒', '𐧿'), + ('𐩀', '𐩈'), + ('𐩽', '𐩾'), + ('𐪝', '𐪟'), + ('𐫫', '𐫯'), + ('𐭘', '𐭟'), + ('𐭸', '𐭿'), + ('𐮩', '𐮯'), + ('𐳺', '𐳿'), + ('𐴰', '𐴹'), + ('𐹠', '𐹾'), + ('𐼝', '𐼦'), + ('𐽑', '𐽔'), + ('𐿅', '𐿋'), + ('𑁒', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑇡', '𑇴'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜻'), + ('𑣠', '𑣲'), + ('𑥐', '𑥙'), + ('𑱐', '𑱬'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𑿀', '𑿔'), + ('𒐀', '𒑮'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𖭛', '𖭡'), + ('𖺀', '𖺖'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝍠', '𝍸'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞣇', '𞣏'), + ('𞥐', '𞥙'), + ('𞱱', '𞲫'), + ('𞲭', '𞲯'), + ('𞲱', '𞲴'), + ('𞴁', '𞴭'), + ('𞴯', '𞴽'), + ('🄀', '🄌'), + ('🯰', '🯹'), +]; + +pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[ + ('(', '('), + ('[', '['), + ('{', '{'), + ('༺', '༺'), + ('༼', '༼'), + ('᚛', '᚛'), + ('‚', '‚'), + ('„', '„'), + ('⁅', '⁅'), + ('⁽', '⁽'), + ('₍', '₍'), + ('⌈', '⌈'), + ('⌊', '⌊'), + ('〈', '〈'), + ('❨', '❨'), + ('❪', '❪'), + ('❬', '❬'), + ('❮', '❮'), + ('❰', '❰'), + ('❲', '❲'), + ('❴', '❴'), + ('⟅', '⟅'), + ('⟦', '⟦'), + ('⟨', '⟨'), + ('⟪', '⟪'), + ('⟬', '⟬'), + ('⟮', '⟮'), + ('⦃', '⦃'), + ('⦅', '⦅'), + ('⦇', '⦇'), + ('⦉', '⦉'), + ('⦋', '⦋'), + ('⦍', '⦍'), + ('⦏', '⦏'), + ('⦑', '⦑'), + ('⦓', '⦓'), + ('⦕', '⦕'), + ('⦗', '⦗'), + ('⧘', '⧘'), + ('⧚', '⧚'), + ('⧼', '⧼'), + ('⸢', '⸢'), + ('⸤', '⸤'), + ('⸦', '⸦'), + ('⸨', '⸨'), + ('⹂', '⹂'), + ('⹕', '⹕'), + ('⹗', '⹗'), + ('⹙', '⹙'), + ('⹛', '⹛'), + ('〈', '〈'), + ('《', '《'), + ('「', '「'), + ('『', '『'), + ('【', '【'), + ('〔', '〔'), + ('〖', '〖'), + ('〘', '〘'), + ('〚', '〚'), + ('〝', '〝'), + ('﴿', '﴿'), + ('︗', '︗'), + ('︵', '︵'), + ('︷', '︷'), + ('︹', '︹'), + ('︻', '︻'), + ('︽', '︽'), + ('︿', '︿'), + ('﹁', '﹁'), + ('﹃', '﹃'), + ('﹇', '﹇'), + ('﹙', '﹙'), + ('﹛', '﹛'), + ('﹝', '﹝'), + ('(', '('), + ('[', '['), + ('{', '{'), + ('⦅', '⦅'), + ('「', '「'), +]; + +pub const OTHER: &'static [(char, char)] = &[ + ('\0', '\u{1f}'), + ('\u{7f}', '\u{9f}'), + ('\u{ad}', '\u{ad}'), + ('\u{378}', '\u{379}'), + ('\u{380}', '\u{383}'), + ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), + ('\u{3a2}', '\u{3a2}'), + ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), + ('\u{58b}', '\u{58c}'), + ('\u{590}', '\u{590}'), + ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ee}'), + ('\u{5f5}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70e}', '\u{70f}'), + ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7fc}'), + ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), + ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{86f}'), + ('\u{88f}', '\u{897}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), + ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), + ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), + ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), + ('\u{9e4}', '\u{9e5}'), + ('\u{9ff}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), + ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), + ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), + ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), + ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), + ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), + ('\u{a77}', '\u{a80}'), + ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), + ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), + ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), + ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), + ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), + ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), + ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), + ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b54}'), + ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), + ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), + ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), + ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), + ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), + ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), + ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), + ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), + ('\u{c3a}', '\u{c3b}'), + ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), + ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5c}'), + ('\u{c5e}', '\u{c5f}'), + ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c76}'), + ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), + ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), + ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdc}'), + ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), + ('\u{cf4}', '\u{cff}'), + ('\u{d0d}', '\u{d0d}'), + ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), + ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d80}'), + ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), + ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), + ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), + ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), + ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), + ('\u{e85}', '\u{e85}'), + ('\u{e8b}', '\u{e8b}'), + ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), + ('\u{ebe}', '\u{ebf}'), + ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), + ('\u{ecf}', '\u{ecf}'), + ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), + ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), + ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), + ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), + ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), + ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), + ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), + ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), + ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), + ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), + ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), + ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), + ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), + ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), + ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), + ('\u{1716}', '\u{171e}'), + ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), + ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), + ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), + ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), + ('\u{180e}', '\u{180e}'), + ('\u{181a}', '\u{181f}'), + ('\u{1879}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), + ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), + ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), + ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), + ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), + ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), + ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), + ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), + ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), + ('\u{1acf}', '\u{1aff}'), + ('\u{1b4d}', '\u{1b4f}'), + ('\u{1b7f}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), + ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), + ('\u{1c89}', '\u{1c8f}'), + ('\u{1cbb}', '\u{1cbc}'), + ('\u{1cc8}', '\u{1ccf}'), + ('\u{1cfb}', '\u{1cff}'), + ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), + ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), + ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), + ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), + ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), + ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), + ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), + ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('\u{2072}', '\u{2073}'), + ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), + ('\u{20c1}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), + ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), + ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), + ('\u{2b96}', '\u{2b96}'), + ('\u{2cf4}', '\u{2cf8}'), + ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), + ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), + ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), + ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), + ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), + ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), + ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), + ('\u{2e5e}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), + ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), + ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), + ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), + ('\u{3130}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), + ('\u{31e4}', '\u{31ef}'), + ('\u{321f}', '\u{321f}'), + ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), + ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), + ('\u{a7cb}', '\u{a7cf}'), + ('\u{a7d2}', '\u{a7d2}'), + ('\u{a7d4}', '\u{a7d4}'), + ('\u{a7da}', '\u{a7f1}'), + ('\u{a82d}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), + ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), + ('\u{a8da}', '\u{a8df}'), + ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), + ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), + ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), + ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), + ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), + ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), + ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), + ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab6c}', '\u{ab6f}'), + ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), + ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), + ('\u{d7fc}', '\u{f8ff}'), + ('\u{fa6e}', '\u{fa6f}'), + ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), + ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), + ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), + ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), + ('\u{fbc3}', '\u{fbd2}'), + ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdce}'), + ('\u{fdd0}', '\u{fdef}'), + ('\u{fe1a}', '\u{fe1f}'), + ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), + ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), + ('\u{fefd}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fffb}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), + ('\u{1019d}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), + ('\u{1057b}', '\u{1057b}'), + ('\u{1058b}', '\u{1058b}'), + ('\u{10593}', '\u{10593}'), + ('\u{10596}', '\u{10596}'), + ('\u{105a2}', '\u{105a2}'), + ('\u{105b2}', '\u{105b2}'), + ('\u{105ba}', '\u{105ba}'), + ('\u{105bd}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{1077f}'), + ('\u{10786}', '\u{10786}'), + ('\u{107b1}', '\u{107b1}'), + ('\u{107bb}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), + ('\u{10a36}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a49}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d28}', '\u{10d2f}'), + ('\u{10d3a}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10e7f}'), + ('\u{10eaa}', '\u{10eaa}'), + ('\u{10eae}', '\u{10eaf}'), + ('\u{10eb2}', '\u{10efc}'), + ('\u{10f28}', '\u{10f2f}'), + ('\u{10f5a}', '\u{10f6f}'), + ('\u{10f8a}', '\u{10faf}'), + ('\u{10fcc}', '\u{10fdf}'), + ('\u{10ff7}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), + ('\u{11076}', '\u{1107e}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110c3}', '\u{110cf}'), + ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), + ('\u{11135}', '\u{11135}'), + ('\u{11148}', '\u{1114f}'), + ('\u{11177}', '\u{1117f}'), + ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), + ('\u{11212}', '\u{11212}'), + ('\u{11242}', '\u{1127f}'), + ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), + ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), + ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), + ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), + ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), + ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), + ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133a}'), + ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), + ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), + ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), + ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), + ('\u{1145c}', '\u{1145c}'), + ('\u{11462}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), + ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), + ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), + ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), + ('\u{116ba}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), + ('\u{1171b}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), + ('\u{11747}', '\u{117ff}'), + ('\u{1183c}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), + ('\u{11907}', '\u{11908}'), + ('\u{1190a}', '\u{1190b}'), + ('\u{11914}', '\u{11914}'), + ('\u{11917}', '\u{11917}'), + ('\u{11936}', '\u{11936}'), + ('\u{11939}', '\u{1193a}'), + ('\u{11947}', '\u{1194f}'), + ('\u{1195a}', '\u{1199f}'), + ('\u{119a8}', '\u{119a9}'), + ('\u{119d8}', '\u{119d9}'), + ('\u{119e5}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), + ('\u{11aa3}', '\u{11aaf}'), + ('\u{11af9}', '\u{11aff}'), + ('\u{11b0a}', '\u{11bff}'), + ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), + ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), + ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), + ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), + ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), + ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), + ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11d5f}'), + ('\u{11d66}', '\u{11d66}'), + ('\u{11d69}', '\u{11d69}'), + ('\u{11d8f}', '\u{11d8f}'), + ('\u{11d92}', '\u{11d92}'), + ('\u{11d99}', '\u{11d9f}'), + ('\u{11daa}', '\u{11edf}'), + ('\u{11ef9}', '\u{11eff}'), + ('\u{11f11}', '\u{11f11}'), + ('\u{11f3b}', '\u{11f3d}'), + ('\u{11f5a}', '\u{11faf}'), + ('\u{11fb1}', '\u{11fbf}'), + ('\u{11ff2}', '\u{11ffe}'), + ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), + ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12f8f}'), + ('\u{12ff3}', '\u{12fff}'), + ('\u{13430}', '\u{1343f}'), + ('\u{13456}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), + ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), + ('\u{16a6a}', '\u{16a6d}'), + ('\u{16abf}', '\u{16abf}'), + ('\u{16aca}', '\u{16acf}'), + ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), + ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), + ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), + ('\u{16b90}', '\u{16e3f}'), + ('\u{16e9b}', '\u{16eff}'), + ('\u{16f4b}', '\u{16f4e}'), + ('\u{16f88}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), + ('\u{16fe5}', '\u{16fef}'), + ('\u{16ff2}', '\u{16fff}'), + ('\u{187f8}', '\u{187ff}'), + ('\u{18cd6}', '\u{18cff}'), + ('\u{18d09}', '\u{1afef}'), + ('\u{1aff4}', '\u{1aff4}'), + ('\u{1affc}', '\u{1affc}'), + ('\u{1afff}', '\u{1afff}'), + ('\u{1b123}', '\u{1b131}'), + ('\u{1b133}', '\u{1b14f}'), + ('\u{1b153}', '\u{1b154}'), + ('\u{1b156}', '\u{1b163}'), + ('\u{1b168}', '\u{1b16f}'), + ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), + ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), + ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca0}', '\u{1ceff}'), + ('\u{1cf2e}', '\u{1cf2f}'), + ('\u{1cf47}', '\u{1cf4f}'), + ('\u{1cfc4}', '\u{1cfff}'), + ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{1d1eb}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2bf}'), + ('\u{1d2d4}', '\u{1d2df}'), + ('\u{1d2f4}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), + ('\u{1d379}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1deff}'), + ('\u{1df1f}', '\u{1df24}'), + ('\u{1df2b}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e02f}'), + ('\u{1e06e}', '\u{1e08e}'), + ('\u{1e090}', '\u{1e0ff}'), + ('\u{1e12d}', '\u{1e12f}'), + ('\u{1e13e}', '\u{1e13f}'), + ('\u{1e14a}', '\u{1e14d}'), + ('\u{1e150}', '\u{1e28f}'), + ('\u{1e2af}', '\u{1e2bf}'), + ('\u{1e2fa}', '\u{1e2fe}'), + ('\u{1e300}', '\u{1e4cf}'), + ('\u{1e4fa}', '\u{1e7df}'), + ('\u{1e7e7}', '\u{1e7e7}'), + ('\u{1e7ec}', '\u{1e7ec}'), + ('\u{1e7ef}', '\u{1e7ef}'), + ('\u{1e7ff}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94c}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1ec70}'), + ('\u{1ecb5}', '\u{1ed00}'), + ('\u{1ed3e}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f1ae}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d8}', '\u{1f6db}'), + ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6fd}', '\u{1f6ff}'), + ('\u{1f777}', '\u{1f77a}'), + ('\u{1f7da}', '\u{1f7df}'), + ('\u{1f7ec}', '\u{1f7ef}'), + ('\u{1f7f1}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8af}'), + ('\u{1f8b2}', '\u{1f8ff}'), + ('\u{1fa54}', '\u{1fa5f}'), + ('\u{1fa6e}', '\u{1fa6f}'), + ('\u{1fa7d}', '\u{1fa7f}'), + ('\u{1fa89}', '\u{1fa8f}'), + ('\u{1fabe}', '\u{1fabe}'), + ('\u{1fac6}', '\u{1facd}'), + ('\u{1fadc}', '\u{1fadf}'), + ('\u{1fae9}', '\u{1faef}'), + ('\u{1faf9}', '\u{1faff}'), + ('\u{1fb93}', '\u{1fb93}'), + ('\u{1fbcb}', '\u{1fbef}'), + ('\u{1fbfa}', '\u{1ffff}'), + ('\u{2a6e0}', '\u{2a6ff}'), + ('\u{2b73a}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{2ffff}'), + ('\u{3134b}', '\u{3134f}'), + ('\u{323b0}', '\u{e00ff}'), + ('\u{e01f0}', '\u{10ffff}'), +]; + +pub const OTHER_LETTER: &'static [(char, char)] = &[ + ('ª', 'ª'), + ('º', 'º'), + ('ƻ', 'ƻ'), + ('ǀ', 'ǃ'), + ('ʔ', 'ʔ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ؿ'), + ('ف', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ࠀ', 'ࠕ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣈ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॲ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๅ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('ᄀ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛱ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡂ'), + ('ᡄ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱷ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ℵ', 'ℸ'), + ('ⴰ', 'ⵧ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('〆', '〆'), + ('〼', '〼'), + ('ぁ', 'ゖ'), + ('ゟ', 'ゟ'), + ('ァ', 'ヺ'), + ('ヿ', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꀔ'), + ('ꀖ', 'ꒌ'), + ('ꓐ', 'ꓷ'), + ('ꔀ', 'ꘋ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('ꙮ', 'ꙮ'), + ('ꚠ', 'ꛥ'), + ('ꞏ', 'ꞏ'), + ('ꟷ', 'ꟷ'), + ('ꟻ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧠ', 'ꧤ'), + ('ꧧ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩯ'), + ('ꩱ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫜ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫲ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꯀ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('ヲ', 'ッ'), + ('ア', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍀'), + ('𐍂', '𐍉'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐑐', '𐒝'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝼊', '𝼊'), + ('𞄀', '𞄬'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓪'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const OTHER_NUMBER: &'static [(char, char)] = &[ + ('²', '³'), + ('¹', '¹'), + ('¼', '¾'), + ('৴', '৹'), + ('୲', '୷'), + ('௰', '௲'), + ('౸', '౾'), + ('൘', '൞'), + ('൰', '൸'), + ('༪', '༳'), + ('፩', '፼'), + ('៰', '៹'), + ('᧚', '᧚'), + ('⁰', '⁰'), + ('⁴', '⁹'), + ('₀', '₉'), + ('⅐', '⅟'), + ('↉', '↉'), + ('①', '⒛'), + ('⓪', '⓿'), + ('❶', '➓'), + ('⳽', '⳽'), + ('㆒', '㆕'), + ('㈠', '㈩'), + ('㉈', '㉏'), + ('㉑', '㉟'), + ('㊀', '㊉'), + ('㊱', '㊿'), + ('꠰', '꠵'), + ('𐄇', '𐄳'), + ('𐅵', '𐅸'), + ('𐆊', '𐆋'), + ('𐋡', '𐋻'), + ('𐌠', '𐌣'), + ('𐡘', '𐡟'), + ('𐡹', '𐡿'), + ('𐢧', '𐢯'), + ('𐣻', '𐣿'), + ('𐤖', '𐤛'), + ('𐦼', '𐦽'), + ('𐧀', '𐧏'), + ('𐧒', '𐧿'), + ('𐩀', '𐩈'), + ('𐩽', '𐩾'), + ('𐪝', '𐪟'), + ('𐫫', '𐫯'), + ('𐭘', '𐭟'), + ('𐭸', '𐭿'), + ('𐮩', '𐮯'), + ('𐳺', '𐳿'), + ('𐹠', '𐹾'), + ('𐼝', '𐼦'), + ('𐽑', '𐽔'), + ('𐿅', '𐿋'), + ('𑁒', '𑁥'), + ('𑇡', '𑇴'), + ('𑜺', '𑜻'), + ('𑣪', '𑣲'), + ('𑱚', '𑱬'), + ('𑿀', '𑿔'), + ('𖭛', '𖭡'), + ('𖺀', '𖺖'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝍠', '𝍸'), + ('𞣇', '𞣏'), + ('𞱱', '𞲫'), + ('𞲭', '𞲯'), + ('𞲱', '𞲴'), + ('𞴁', '𞴭'), + ('𞴯', '𞴽'), + ('🄀', '🄌'), +]; + +pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), + ('%', '\''), + ('*', '*'), + (',', ','), + ('.', '/'), + (':', ';'), + ('?', '@'), + ('\\', '\\'), + ('¡', '¡'), + ('§', '§'), + ('¶', '·'), + ('¿', '¿'), + (';', ';'), + ('·', '·'), + ('՚', '՟'), + ('։', '։'), + ('׀', '׀'), + ('׃', '׃'), + ('׆', '׆'), + ('׳', '״'), + ('؉', '؊'), + ('،', '؍'), + ('؛', '؛'), + ('؝', '؟'), + ('٪', '٭'), + ('۔', '۔'), + ('܀', '܍'), + ('߷', '߹'), + ('࠰', '࠾'), + ('࡞', '࡞'), + ('।', '॥'), + ('॰', '॰'), + ('৽', '৽'), + ('੶', '੶'), + ('૰', '૰'), + ('౷', '౷'), + ('಄', '಄'), + ('෴', '෴'), + ('๏', '๏'), + ('๚', '๛'), + ('༄', '༒'), + ('༔', '༔'), + ('྅', '྅'), + ('࿐', '࿔'), + ('࿙', '࿚'), + ('၊', '၏'), + ('჻', '჻'), + ('፠', '፨'), + ('᙮', '᙮'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('។', '៖'), + ('៘', '៚'), + ('᠀', '᠅'), + ('᠇', '᠊'), + ('᥄', '᥅'), + ('᨞', '᨟'), + ('᪠', '᪦'), + ('᪨', '᪭'), + ('᭚', '᭠'), + ('᭽', '᭾'), + ('᯼', '᯿'), + ('᰻', '᰿'), + ('᱾', '᱿'), + ('᳀', '᳇'), + ('᳓', '᳓'), + ('‖', '‗'), + ('†', '‧'), + ('‰', '‸'), + ('※', '‾'), + ('⁁', '⁃'), + ('⁇', '⁑'), + ('⁓', '⁓'), + ('⁕', '⁞'), + ('⳹', '⳼'), + ('⳾', '⳿'), + ('⵰', '⵰'), + ('⸀', '⸁'), + ('⸆', '⸈'), + ('⸋', '⸋'), + ('⸎', '⸖'), + ('⸘', '⸙'), + ('⸛', '⸛'), + ('⸞', '⸟'), + ('⸪', '⸮'), + ('⸰', '⸹'), + ('⸼', '⸿'), + ('⹁', '⹁'), + ('⹃', '⹏'), + ('⹒', '⹔'), + ('、', '〃'), + ('〽', '〽'), + ('・', '・'), + ('꓾', '꓿'), + ('꘍', '꘏'), + ('꙳', '꙳'), + ('꙾', '꙾'), + ('꛲', '꛷'), + ('꡴', '꡷'), + ('꣎', '꣏'), + ('꣸', '꣺'), + ('꣼', '꣼'), + ('꤮', '꤯'), + ('꥟', '꥟'), + ('꧁', '꧍'), + ('꧞', '꧟'), + ('꩜', '꩟'), + ('꫞', '꫟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('︐', '︖'), + ('︙', '︙'), + ('︰', '︰'), + ('﹅', '﹆'), + ('﹉', '﹌'), + ('﹐', '﹒'), + ('﹔', '﹗'), + ('﹟', '﹡'), + ('﹨', '﹨'), + ('﹪', '﹫'), + ('!', '#'), + ('%', '''), + ('*', '*'), + (',', ','), + ('.', '/'), + (':', ';'), + ('?', '@'), + ('\', '\'), + ('。', '。'), + ('、', '・'), + ('𐄀', '𐄂'), + ('𐎟', '𐎟'), + ('𐏐', '𐏐'), + ('𐕯', '𐕯'), + ('𐡗', '𐡗'), + ('𐤟', '𐤟'), + ('𐤿', '𐤿'), + ('𐩐', '𐩘'), + ('𐩿', '𐩿'), + ('𐫰', '𐫶'), + ('𐬹', '𐬿'), + ('𐮙', '𐮜'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁍'), + ('𑂻', '𑂼'), + ('𑂾', '𑃁'), + ('𑅀', '𑅃'), + ('𑅴', '𑅵'), + ('𑇅', '𑇈'), + ('𑇍', '𑇍'), + ('𑇛', '𑇛'), + ('𑇝', '𑇟'), + ('𑈸', '𑈽'), + ('𑊩', '𑊩'), + ('𑑋', '𑑏'), + ('𑑚', '𑑛'), + ('𑑝', '𑑝'), + ('𑓆', '𑓆'), + ('𑗁', '𑗗'), + ('𑙁', '𑙃'), + ('𑙠', '𑙬'), + ('𑚹', '𑚹'), + ('𑜼', '𑜾'), + ('𑠻', '𑠻'), + ('𑥄', '𑥆'), + ('𑧢', '𑧢'), + ('𑨿', '𑩆'), + ('𑪚', '𑪜'), + ('𑪞', '𑪢'), + ('𑬀', '𑬉'), + ('𑱁', '𑱅'), + ('𑱰', '𑱱'), + ('𑻷', '𑻸'), + ('𑽃', '𑽏'), + ('𑿿', '𑿿'), + ('𒑰', '𒑴'), + ('𒿱', '𒿲'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬻'), + ('𖭄', '𖭄'), + ('𖺗', '𖺚'), + ('𖿢', '𖿢'), + ('𛲟', '𛲟'), + ('𝪇', '𝪋'), + ('𞥞', '𞥟'), +]; + +pub const OTHER_SYMBOL: &'static [(char, char)] = &[ + ('¦', '¦'), + ('©', '©'), + ('®', '®'), + ('°', '°'), + ('҂', '҂'), + ('֍', '֎'), + ('؎', '؏'), + ('۞', '۞'), + ('۩', '۩'), + ('۽', '۾'), + ('߶', '߶'), + ('৺', '৺'), + ('୰', '୰'), + ('௳', '௸'), + ('௺', '௺'), + ('౿', '౿'), + ('൏', '൏'), + ('൹', '൹'), + ('༁', '༃'), + ('༓', '༓'), + ('༕', '༗'), + ('༚', '༟'), + ('༴', '༴'), + ('༶', '༶'), + ('༸', '༸'), + ('྾', '࿅'), + ('࿇', '࿌'), + ('࿎', '࿏'), + ('࿕', '࿘'), + ('႞', '႟'), + ('᎐', '᎙'), + ('᙭', '᙭'), + ('᥀', '᥀'), + ('᧞', '᧿'), + ('᭡', '᭪'), + ('᭴', '᭼'), + ('℀', '℁'), + ('℃', '℆'), + ('℈', '℉'), + ('℔', '℔'), + ('№', '℗'), + ('℞', '℣'), + ('℥', '℥'), + ('℧', '℧'), + ('℩', '℩'), + ('℮', '℮'), + ('℺', '℻'), + ('⅊', '⅊'), + ('⅌', '⅍'), + ('⅏', '⅏'), + ('↊', '↋'), + ('↕', '↙'), + ('↜', '↟'), + ('↡', '↢'), + ('↤', '↥'), + ('↧', '↭'), + ('↯', '⇍'), + ('⇐', '⇑'), + ('⇓', '⇓'), + ('⇕', '⇳'), + ('⌀', '⌇'), + ('⌌', '⌟'), + ('⌢', '⌨'), + ('⌫', '⍻'), + ('⍽', '⎚'), + ('⎴', '⏛'), + ('⏢', '␦'), + ('⑀', '⑊'), + ('⒜', 'ⓩ'), + ('─', '▶'), + ('▸', '◀'), + ('◂', '◷'), + ('☀', '♮'), + ('♰', '❧'), + ('➔', '➿'), + ('⠀', '⣿'), + ('⬀', '⬯'), + ('⭅', '⭆'), + ('⭍', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⳥', '⳪'), + ('⹐', '⹑'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('〄', '〄'), + ('〒', '〓'), + ('〠', '〠'), + ('〶', '〷'), + ('〾', '〿'), + ('㆐', '㆑'), + ('㆖', '㆟'), + ('㇀', '㇣'), + ('㈀', '㈞'), + ('㈪', '㉇'), + ('㉐', '㉐'), + ('㉠', '㉿'), + ('㊊', '㊰'), + ('㋀', '㏿'), + ('䷀', '䷿'), + ('꒐', '꓆'), + ('꠨', '꠫'), + ('꠶', '꠷'), + ('꠹', '꠹'), + ('꩷', '꩹'), + ('﵀', '﵏'), + ('﷏', '﷏'), + ('﷽', '﷿'), + ('¦', '¦'), + ('│', '│'), + ('■', '○'), + ('', '�'), + ('𐄷', '𐄿'), + ('𐅹', '𐆉'), + ('𐆌', '𐆎'), + ('𐆐', '𐆜'), + ('𐆠', '𐆠'), + ('𐇐', '𐇼'), + ('𐡷', '𐡸'), + ('𐫈', '𐫈'), + ('𑜿', '𑜿'), + ('𑿕', '𑿜'), + ('𑿡', '𑿱'), + ('𖬼', '𖬿'), + ('𖭅', '𖭅'), + ('𛲜', '𛲜'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅘𝅥𝅲'), + ('𝅪', '𝅬'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝈀', '𝉁'), + ('𝉅', '𝉅'), + ('𝌀', '𝍖'), + ('𝠀', '𝧿'), + ('𝨷', '𝨺'), + ('𝩭', '𝩴'), + ('𝩶', '𝪃'), + ('𝪅', '𝪆'), + ('𞅏', '𞅏'), + ('𞲬', '𞲬'), + ('𞴮', '𞴮'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄍', '🆭'), + ('🇦', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🏺'), + ('🐀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), +]; + +pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] = + &[('\u{2029}', '\u{2029}')]; + +pub const PRIVATE_USE: &'static [(char, char)] = &[ + ('\u{e000}', '\u{f8ff}'), + ('\u{f0000}', '\u{ffffd}'), + ('\u{100000}', '\u{10fffd}'), +]; + +pub const PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), + ('%', '*'), + (',', '/'), + (':', ';'), + ('?', '@'), + ('[', ']'), + ('_', '_'), + ('{', '{'), + ('}', '}'), + ('¡', '¡'), + ('§', '§'), + ('«', '«'), + ('¶', '·'), + ('»', '»'), + ('¿', '¿'), + (';', ';'), + ('·', '·'), + ('՚', '՟'), + ('։', '֊'), + ('־', '־'), + ('׀', '׀'), + ('׃', '׃'), + ('׆', '׆'), + ('׳', '״'), + ('؉', '؊'), + ('،', '؍'), + ('؛', '؛'), + ('؝', '؟'), + ('٪', '٭'), + ('۔', '۔'), + ('܀', '܍'), + ('߷', '߹'), + ('࠰', '࠾'), + ('࡞', '࡞'), + ('।', '॥'), + ('॰', '॰'), + ('৽', '৽'), + ('੶', '੶'), + ('૰', '૰'), + ('౷', '౷'), + ('಄', '಄'), + ('෴', '෴'), + ('๏', '๏'), + ('๚', '๛'), + ('༄', '༒'), + ('༔', '༔'), + ('༺', '༽'), + ('྅', '྅'), + ('࿐', '࿔'), + ('࿙', '࿚'), + ('၊', '၏'), + ('჻', '჻'), + ('፠', '፨'), + ('᐀', '᐀'), + ('᙮', '᙮'), + ('᚛', '᚜'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('។', '៖'), + ('៘', '៚'), + ('᠀', '᠊'), + ('᥄', '᥅'), + ('᨞', '᨟'), + ('᪠', '᪦'), + ('᪨', '᪭'), + ('᭚', '᭠'), + ('᭽', '᭾'), + ('᯼', '᯿'), + ('᰻', '᰿'), + ('᱾', '᱿'), + ('᳀', '᳇'), + ('᳓', '᳓'), + ('‐', '‧'), + ('‰', '⁃'), + ('⁅', '⁑'), + ('⁓', '⁞'), + ('⁽', '⁾'), + ('₍', '₎'), + ('⌈', '⌋'), + ('〈', '〉'), + ('❨', '❵'), + ('⟅', '⟆'), + ('⟦', '⟯'), + ('⦃', '⦘'), + ('⧘', '⧛'), + ('⧼', '⧽'), + ('⳹', '⳼'), + ('⳾', '⳿'), + ('⵰', '⵰'), + ('⸀', '⸮'), + ('⸰', '⹏'), + ('⹒', '⹝'), + ('、', '〃'), + ('〈', '】'), + ('〔', '〟'), + ('〰', '〰'), + ('〽', '〽'), + ('゠', '゠'), + ('・', '・'), + ('꓾', '꓿'), + ('꘍', '꘏'), + ('꙳', '꙳'), + ('꙾', '꙾'), + ('꛲', '꛷'), + ('꡴', '꡷'), + ('꣎', '꣏'), + ('꣸', '꣺'), + ('꣼', '꣼'), + ('꤮', '꤯'), + ('꥟', '꥟'), + ('꧁', '꧍'), + ('꧞', '꧟'), + ('꩜', '꩟'), + ('꫞', '꫟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﴾', '﴿'), + ('︐', '︙'), + ('︰', '﹒'), + ('﹔', '﹡'), + ('﹣', '﹣'), + ('﹨', '﹨'), + ('﹪', '﹫'), + ('!', '#'), + ('%', '*'), + (',', '/'), + (':', ';'), + ('?', '@'), + ('[', ']'), + ('_', '_'), + ('{', '{'), + ('}', '}'), + ('⦅', '・'), + ('𐄀', '𐄂'), + ('𐎟', '𐎟'), + ('𐏐', '𐏐'), + ('𐕯', '𐕯'), + ('𐡗', '𐡗'), + ('𐤟', '𐤟'), + ('𐤿', '𐤿'), + ('𐩐', '𐩘'), + ('𐩿', '𐩿'), + ('𐫰', '𐫶'), + ('𐬹', '𐬿'), + ('𐮙', '𐮜'), + ('𐺭', '𐺭'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁍'), + ('𑂻', '𑂼'), + ('𑂾', '𑃁'), + ('𑅀', '𑅃'), + ('𑅴', '𑅵'), + ('𑇅', '𑇈'), + ('𑇍', '𑇍'), + ('𑇛', '𑇛'), + ('𑇝', '𑇟'), + ('𑈸', '𑈽'), + ('𑊩', '𑊩'), + ('𑑋', '𑑏'), + ('𑑚', '𑑛'), + ('𑑝', '𑑝'), + ('𑓆', '𑓆'), + ('𑗁', '𑗗'), + ('𑙁', '𑙃'), + ('𑙠', '𑙬'), + ('𑚹', '𑚹'), + ('𑜼', '𑜾'), + ('𑠻', '𑠻'), + ('𑥄', '𑥆'), + ('𑧢', '𑧢'), + ('𑨿', '𑩆'), + ('𑪚', '𑪜'), + ('𑪞', '𑪢'), + ('𑬀', '𑬉'), + ('𑱁', '𑱅'), + ('𑱰', '𑱱'), + ('𑻷', '𑻸'), + ('𑽃', '𑽏'), + ('𑿿', '𑿿'), + ('𒑰', '𒑴'), + ('𒿱', '𒿲'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬻'), + ('𖭄', '𖭄'), + ('𖺗', '𖺚'), + ('𖿢', '𖿢'), + ('𛲟', '𛲟'), + ('𝪇', '𝪋'), + ('𞥞', '𞥟'), +]; + +pub const SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACE_SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACING_MARK: &'static [(char, char)] = &[ + ('ः', 'ः'), + ('ऻ', 'ऻ'), + ('ा', 'ी'), + ('ॉ', 'ौ'), + ('ॎ', 'ॏ'), + ('ং', 'ঃ'), + ('\u{9be}', 'ী'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('\u{9d7}', '\u{9d7}'), + ('ਃ', 'ਃ'), + ('ਾ', 'ੀ'), + ('ઃ', 'ઃ'), + ('ા', 'ી'), + ('ૉ', 'ૉ'), + ('ો', 'ૌ'), + ('ଂ', 'ଃ'), + ('\u{b3e}', '\u{b3e}'), + ('ୀ', 'ୀ'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('\u{b57}', '\u{b57}'), + ('\u{bbe}', 'ி'), + ('ு', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('\u{bd7}', '\u{bd7}'), + ('ఁ', 'ః'), + ('ు', 'ౄ'), + ('ಂ', 'ಃ'), + ('ಾ', 'ಾ'), + ('ೀ', 'ೄ'), + ('ೇ', 'ೈ'), + ('ೊ', 'ೋ'), + ('\u{cd5}', '\u{cd6}'), + ('ೳ', 'ೳ'), + ('ം', 'ഃ'), + ('\u{d3e}', 'ീ'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('\u{d57}', '\u{d57}'), + ('ං', 'ඃ'), + ('\u{dcf}', 'ෑ'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('༾', '༿'), + ('ཿ', 'ཿ'), + ('ါ', 'ာ'), + ('ေ', 'ေ'), + ('း', 'း'), + ('ျ', 'ြ'), + ('ၖ', 'ၗ'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('ႃ', 'ႄ'), + ('ႇ', 'ႌ'), + ('ႏ', 'ႏ'), + ('ႚ', 'ႜ'), + ('᜕', '᜕'), + ('᜴', '᜴'), + ('ា', 'ា'), + ('ើ', 'ៅ'), + ('ះ', 'ៈ'), + ('ᤣ', 'ᤦ'), + ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), + ('ᤳ', 'ᤸ'), + ('ᨙ', 'ᨚ'), + ('ᩕ', 'ᩕ'), + ('ᩗ', 'ᩗ'), + ('ᩡ', 'ᩡ'), + ('ᩣ', 'ᩤ'), + ('ᩭ', 'ᩲ'), + ('ᬄ', 'ᬄ'), + ('\u{1b35}', '\u{1b35}'), + ('ᬻ', 'ᬻ'), + ('ᬽ', 'ᭁ'), + ('ᭃ', '᭄'), + ('ᮂ', 'ᮂ'), + ('ᮡ', 'ᮡ'), + ('ᮦ', 'ᮧ'), + ('᮪', '᮪'), + ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), + ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), + ('ᰤ', 'ᰫ'), + ('ᰴ', 'ᰵ'), + ('᳡', '᳡'), + ('᳷', '᳷'), + ('\u{302e}', '\u{302f}'), + ('ꠣ', 'ꠤ'), + ('ꠧ', 'ꠧ'), + ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), + ('ꥒ', '꥓'), + ('ꦃ', 'ꦃ'), + ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), + ('ꦾ', '꧀'), + ('ꨯ', 'ꨰ'), + ('ꨳ', 'ꨴ'), + ('ꩍ', 'ꩍ'), + ('ꩻ', 'ꩻ'), + ('ꩽ', 'ꩽ'), + ('ꫫ', 'ꫫ'), + ('ꫮ', 'ꫯ'), + ('ꫵ', 'ꫵ'), + ('ꯣ', 'ꯤ'), + ('ꯦ', 'ꯧ'), + ('ꯩ', 'ꯪ'), + ('꯬', '꯬'), + ('𑀀', '𑀀'), + ('𑀂', '𑀂'), + ('𑂂', '𑂂'), + ('𑂰', '𑂲'), + ('𑂷', '𑂸'), + ('𑄬', '𑄬'), + ('𑅅', '𑅆'), + ('𑆂', '𑆂'), + ('𑆳', '𑆵'), + ('𑆿', '𑇀'), + ('𑇎', '𑇎'), + ('𑈬', '𑈮'), + ('𑈲', '𑈳'), + ('𑈵', '𑈵'), + ('𑋠', '𑋢'), + ('𑌂', '𑌃'), + ('\u{1133e}', '𑌿'), + ('𑍁', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('𑐵', '𑐷'), + ('𑑀', '𑑁'), + ('𑑅', '𑑅'), + ('\u{114b0}', '𑒲'), + ('𑒹', '𑒹'), + ('𑒻', '𑒾'), + ('𑓁', '𑓁'), + ('\u{115af}', '𑖱'), + ('𑖸', '𑖻'), + ('𑖾', '𑖾'), + ('𑘰', '𑘲'), + ('𑘻', '𑘼'), + ('𑘾', '𑘾'), + ('𑚬', '𑚬'), + ('𑚮', '𑚯'), + ('𑚶', '𑚶'), + ('𑜠', '𑜡'), + ('𑜦', '𑜦'), + ('𑠬', '𑠮'), + ('𑠸', '𑠸'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('𑤽', '𑤽'), + ('𑥀', '𑥀'), + ('𑥂', '𑥂'), + ('𑧑', '𑧓'), + ('𑧜', '𑧟'), + ('𑧤', '𑧤'), + ('𑨹', '𑨹'), + ('𑩗', '𑩘'), + ('𑪗', '𑪗'), + ('𑰯', '𑰯'), + ('𑰾', '𑰾'), + ('𑲩', '𑲩'), + ('𑲱', '𑲱'), + ('𑲴', '𑲴'), + ('𑶊', '𑶎'), + ('𑶓', '𑶔'), + ('𑶖', '𑶖'), + ('𑻵', '𑻶'), + ('𑼃', '𑼃'), + ('𑼴', '𑼵'), + ('𑼾', '𑼿'), + ('𑽁', '𑽁'), + ('𖽑', '𖾇'), + ('𖿰', '𖿱'), + ('\u{1d165}', '𝅦'), + ('𝅭', '\u{1d172}'), +]; + +pub const SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('`', '`'), + ('|', '|'), + ('~', '~'), + ('¢', '¦'), + ('¨', '©'), + ('¬', '¬'), + ('®', '±'), + ('´', '´'), + ('¸', '¸'), + ('×', '×'), + ('÷', '÷'), + ('˂', '˅'), + ('˒', '˟'), + ('˥', '˫'), + ('˭', '˭'), + ('˯', '˿'), + ('͵', '͵'), + ('΄', '΅'), + ('϶', '϶'), + ('҂', '҂'), + ('֍', '֏'), + ('؆', '؈'), + ('؋', '؋'), + ('؎', '؏'), + ('۞', '۞'), + ('۩', '۩'), + ('۽', '۾'), + ('߶', '߶'), + ('߾', '߿'), + ('࢈', '࢈'), + ('৲', '৳'), + ('৺', '৻'), + ('૱', '૱'), + ('୰', '୰'), + ('௳', '௺'), + ('౿', '౿'), + ('൏', '൏'), + ('൹', '൹'), + ('฿', '฿'), + ('༁', '༃'), + ('༓', '༓'), + ('༕', '༗'), + ('༚', '༟'), + ('༴', '༴'), + ('༶', '༶'), + ('༸', '༸'), + ('྾', '࿅'), + ('࿇', '࿌'), + ('࿎', '࿏'), + ('࿕', '࿘'), + ('႞', '႟'), + ('᎐', '᎙'), + ('᙭', '᙭'), + ('៛', '៛'), + ('᥀', '᥀'), + ('᧞', '᧿'), + ('᭡', '᭪'), + ('᭴', '᭼'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('⁄', '⁄'), + ('⁒', '⁒'), + ('⁺', '⁼'), + ('₊', '₌'), + ('₠', '⃀'), + ('℀', '℁'), + ('℃', '℆'), + ('℈', '℉'), + ('℔', '℔'), + ('№', '℘'), + ('℞', '℣'), + ('℥', '℥'), + ('℧', '℧'), + ('℩', '℩'), + ('℮', '℮'), + ('℺', '℻'), + ('⅀', '⅄'), + ('⅊', '⅍'), + ('⅏', '⅏'), + ('↊', '↋'), + ('←', '⌇'), + ('⌌', '⌨'), + ('⌫', '␦'), + ('⑀', '⑊'), + ('⒜', 'ⓩ'), + ('─', '❧'), + ('➔', '⟄'), + ('⟇', '⟥'), + ('⟰', '⦂'), + ('⦙', '⧗'), + ('⧜', '⧻'), + ('⧾', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⳥', '⳪'), + ('⹐', '⹑'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('〄', '〄'), + ('〒', '〓'), + ('〠', '〠'), + ('〶', '〷'), + ('〾', '〿'), + ('゛', '゜'), + ('㆐', '㆑'), + ('㆖', '㆟'), + ('㇀', '㇣'), + ('㈀', '㈞'), + ('㈪', '㉇'), + ('㉐', '㉐'), + ('㉠', '㉿'), + ('㊊', '㊰'), + ('㋀', '㏿'), + ('䷀', '䷿'), + ('꒐', '꓆'), + ('꜀', '꜖'), + ('꜠', '꜡'), + ('꞉', '꞊'), + ('꠨', '꠫'), + ('꠶', '꠹'), + ('꩷', '꩹'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('﬩', '﬩'), + ('﮲', '﯂'), + ('﵀', '﵏'), + ('﷏', '﷏'), + ('﷼', '﷿'), + ('﹢', '﹢'), + ('﹤', '﹦'), + ('﹩', '﹩'), + ('$', '$'), + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('`', '`'), + ('|', '|'), + ('~', '~'), + ('¢', '₩'), + ('│', '○'), + ('', '�'), + ('𐄷', '𐄿'), + ('𐅹', '𐆉'), + ('𐆌', '𐆎'), + ('𐆐', '𐆜'), + ('𐆠', '𐆠'), + ('𐇐', '𐇼'), + ('𐡷', '𐡸'), + ('𐫈', '𐫈'), + ('𑜿', '𑜿'), + ('𑿕', '𑿱'), + ('𖬼', '𖬿'), + ('𖭅', '𖭅'), + ('𛲜', '𛲜'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅘𝅥𝅲'), + ('𝅪', '𝅬'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝈀', '𝉁'), + ('𝉅', '𝉅'), + ('𝌀', '𝍖'), + ('𝛁', '𝛁'), + ('𝛛', '𝛛'), + ('𝛻', '𝛻'), + ('𝜕', '𝜕'), + ('𝜵', '𝜵'), + ('𝝏', '𝝏'), + ('𝝯', '𝝯'), + ('𝞉', '𝞉'), + ('𝞩', '𝞩'), + ('𝟃', '𝟃'), + ('𝠀', '𝧿'), + ('𝨷', '𝨺'), + ('𝩭', '𝩴'), + ('𝩶', '𝪃'), + ('𝪅', '𝪆'), + ('𞅏', '𞅏'), + ('𞋿', '𞋿'), + ('𞲬', '𞲬'), + ('𞲰', '𞲰'), + ('𞴮', '𞴮'), + ('𞻰', '𞻱'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄍', '🆭'), + ('🇦', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), +]; + +pub const TITLECASE_LETTER: &'static [(char, char)] = &[ + ('Dž', 'Dž'), + ('Lj', 'Lj'), + ('Nj', 'Nj'), + ('Dz', 'Dz'), + ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), + ('ᾨ', 'ᾯ'), + ('ᾼ', 'ᾼ'), + ('ῌ', 'ῌ'), + ('ῼ', 'ῼ'), +]; + +pub const UNASSIGNED: &'static [(char, char)] = &[ + ('\u{378}', '\u{379}'), + ('\u{380}', '\u{383}'), + ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), + ('\u{3a2}', '\u{3a2}'), + ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), + ('\u{58b}', '\u{58c}'), + ('\u{590}', '\u{590}'), + ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ee}'), + ('\u{5f5}', '\u{5ff}'), + ('\u{70e}', '\u{70e}'), + ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7fc}'), + ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), + ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{86f}'), + ('\u{88f}', '\u{88f}'), + ('\u{892}', '\u{897}'), + ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), + ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), + ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), + ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), + ('\u{9e4}', '\u{9e5}'), + ('\u{9ff}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), + ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), + ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), + ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), + ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), + ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), + ('\u{a77}', '\u{a80}'), + ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), + ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), + ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), + ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), + ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), + ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), + ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), + ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b54}'), + ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), + ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), + ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), + ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), + ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), + ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), + ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), + ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), + ('\u{c3a}', '\u{c3b}'), + ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), + ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5c}'), + ('\u{c5e}', '\u{c5f}'), + ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c76}'), + ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), + ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), + ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdc}'), + ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), + ('\u{cf4}', '\u{cff}'), + ('\u{d0d}', '\u{d0d}'), + ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), + ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d80}'), + ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), + ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), + ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), + ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), + ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), + ('\u{e85}', '\u{e85}'), + ('\u{e8b}', '\u{e8b}'), + ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), + ('\u{ebe}', '\u{ebf}'), + ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), + ('\u{ecf}', '\u{ecf}'), + ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), + ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), + ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), + ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), + ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), + ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), + ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), + ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), + ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), + ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), + ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), + ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), + ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), + ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), + ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), + ('\u{1716}', '\u{171e}'), + ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), + ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), + ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), + ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), + ('\u{181a}', '\u{181f}'), + ('\u{1879}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), + ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), + ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), + ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), + ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), + ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), + ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), + ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), + ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), + ('\u{1acf}', '\u{1aff}'), + ('\u{1b4d}', '\u{1b4f}'), + ('\u{1b7f}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), + ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), + ('\u{1c89}', '\u{1c8f}'), + ('\u{1cbb}', '\u{1cbc}'), + ('\u{1cc8}', '\u{1ccf}'), + ('\u{1cfb}', '\u{1cff}'), + ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), + ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), + ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), + ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), + ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), + ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), + ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), + ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), + ('\u{2065}', '\u{2065}'), + ('\u{2072}', '\u{2073}'), + ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), + ('\u{20c1}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), + ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), + ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), + ('\u{2b96}', '\u{2b96}'), + ('\u{2cf4}', '\u{2cf8}'), + ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), + ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), + ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), + ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), + ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), + ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), + ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), + ('\u{2e5e}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), + ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), + ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), + ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), + ('\u{3130}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), + ('\u{31e4}', '\u{31ef}'), + ('\u{321f}', '\u{321f}'), + ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), + ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), + ('\u{a7cb}', '\u{a7cf}'), + ('\u{a7d2}', '\u{a7d2}'), + ('\u{a7d4}', '\u{a7d4}'), + ('\u{a7da}', '\u{a7f1}'), + ('\u{a82d}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), + ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), + ('\u{a8da}', '\u{a8df}'), + ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), + ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), + ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), + ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), + ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), + ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), + ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), + ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab6c}', '\u{ab6f}'), + ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), + ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), + ('\u{d7fc}', '\u{d7ff}'), + ('\u{fa6e}', '\u{fa6f}'), + ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), + ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), + ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), + ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), + ('\u{fbc3}', '\u{fbd2}'), + ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdce}'), + ('\u{fdd0}', '\u{fdef}'), + ('\u{fe1a}', '\u{fe1f}'), + ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), + ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), + ('\u{fefd}', '\u{fefe}'), + ('\u{ff00}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fff8}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), + ('\u{1019d}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), + ('\u{1057b}', '\u{1057b}'), + ('\u{1058b}', '\u{1058b}'), + ('\u{10593}', '\u{10593}'), + ('\u{10596}', '\u{10596}'), + ('\u{105a2}', '\u{105a2}'), + ('\u{105b2}', '\u{105b2}'), + ('\u{105ba}', '\u{105ba}'), + ('\u{105bd}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{1077f}'), + ('\u{10786}', '\u{10786}'), + ('\u{107b1}', '\u{107b1}'), + ('\u{107bb}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), + ('\u{10a36}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a49}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d28}', '\u{10d2f}'), + ('\u{10d3a}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10e7f}'), + ('\u{10eaa}', '\u{10eaa}'), + ('\u{10eae}', '\u{10eaf}'), + ('\u{10eb2}', '\u{10efc}'), + ('\u{10f28}', '\u{10f2f}'), + ('\u{10f5a}', '\u{10f6f}'), + ('\u{10f8a}', '\u{10faf}'), + ('\u{10fcc}', '\u{10fdf}'), + ('\u{10ff7}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), + ('\u{11076}', '\u{1107e}'), + ('\u{110c3}', '\u{110cc}'), + ('\u{110ce}', '\u{110cf}'), + ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), + ('\u{11135}', '\u{11135}'), + ('\u{11148}', '\u{1114f}'), + ('\u{11177}', '\u{1117f}'), + ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), + ('\u{11212}', '\u{11212}'), + ('\u{11242}', '\u{1127f}'), + ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), + ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), + ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), + ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), + ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), + ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), + ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133a}'), + ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), + ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), + ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), + ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), + ('\u{1145c}', '\u{1145c}'), + ('\u{11462}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), + ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), + ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), + ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), + ('\u{116ba}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), + ('\u{1171b}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), + ('\u{11747}', '\u{117ff}'), + ('\u{1183c}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), + ('\u{11907}', '\u{11908}'), + ('\u{1190a}', '\u{1190b}'), + ('\u{11914}', '\u{11914}'), + ('\u{11917}', '\u{11917}'), + ('\u{11936}', '\u{11936}'), + ('\u{11939}', '\u{1193a}'), + ('\u{11947}', '\u{1194f}'), + ('\u{1195a}', '\u{1199f}'), + ('\u{119a8}', '\u{119a9}'), + ('\u{119d8}', '\u{119d9}'), + ('\u{119e5}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), + ('\u{11aa3}', '\u{11aaf}'), + ('\u{11af9}', '\u{11aff}'), + ('\u{11b0a}', '\u{11bff}'), + ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), + ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), + ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), + ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), + ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), + ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), + ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11d5f}'), + ('\u{11d66}', '\u{11d66}'), + ('\u{11d69}', '\u{11d69}'), + ('\u{11d8f}', '\u{11d8f}'), + ('\u{11d92}', '\u{11d92}'), + ('\u{11d99}', '\u{11d9f}'), + ('\u{11daa}', '\u{11edf}'), + ('\u{11ef9}', '\u{11eff}'), + ('\u{11f11}', '\u{11f11}'), + ('\u{11f3b}', '\u{11f3d}'), + ('\u{11f5a}', '\u{11faf}'), + ('\u{11fb1}', '\u{11fbf}'), + ('\u{11ff2}', '\u{11ffe}'), + ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), + ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12f8f}'), + ('\u{12ff3}', '\u{12fff}'), + ('\u{13456}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), + ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), + ('\u{16a6a}', '\u{16a6d}'), + ('\u{16abf}', '\u{16abf}'), + ('\u{16aca}', '\u{16acf}'), + ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), + ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), + ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), + ('\u{16b90}', '\u{16e3f}'), + ('\u{16e9b}', '\u{16eff}'), + ('\u{16f4b}', '\u{16f4e}'), + ('\u{16f88}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), + ('\u{16fe5}', '\u{16fef}'), + ('\u{16ff2}', '\u{16fff}'), + ('\u{187f8}', '\u{187ff}'), + ('\u{18cd6}', '\u{18cff}'), + ('\u{18d09}', '\u{1afef}'), + ('\u{1aff4}', '\u{1aff4}'), + ('\u{1affc}', '\u{1affc}'), + ('\u{1afff}', '\u{1afff}'), + ('\u{1b123}', '\u{1b131}'), + ('\u{1b133}', '\u{1b14f}'), + ('\u{1b153}', '\u{1b154}'), + ('\u{1b156}', '\u{1b163}'), + ('\u{1b168}', '\u{1b16f}'), + ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), + ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), + ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca4}', '\u{1ceff}'), + ('\u{1cf2e}', '\u{1cf2f}'), + ('\u{1cf47}', '\u{1cf4f}'), + ('\u{1cfc4}', '\u{1cfff}'), + ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), + ('\u{1d1eb}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2bf}'), + ('\u{1d2d4}', '\u{1d2df}'), + ('\u{1d2f4}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), + ('\u{1d379}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1deff}'), + ('\u{1df1f}', '\u{1df24}'), + ('\u{1df2b}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e02f}'), + ('\u{1e06e}', '\u{1e08e}'), + ('\u{1e090}', '\u{1e0ff}'), + ('\u{1e12d}', '\u{1e12f}'), + ('\u{1e13e}', '\u{1e13f}'), + ('\u{1e14a}', '\u{1e14d}'), + ('\u{1e150}', '\u{1e28f}'), + ('\u{1e2af}', '\u{1e2bf}'), + ('\u{1e2fa}', '\u{1e2fe}'), + ('\u{1e300}', '\u{1e4cf}'), + ('\u{1e4fa}', '\u{1e7df}'), + ('\u{1e7e7}', '\u{1e7e7}'), + ('\u{1e7ec}', '\u{1e7ec}'), + ('\u{1e7ef}', '\u{1e7ef}'), + ('\u{1e7ff}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94c}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1ec70}'), + ('\u{1ecb5}', '\u{1ed00}'), + ('\u{1ed3e}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f1ae}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d8}', '\u{1f6db}'), + ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6fd}', '\u{1f6ff}'), + ('\u{1f777}', '\u{1f77a}'), + ('\u{1f7da}', '\u{1f7df}'), + ('\u{1f7ec}', '\u{1f7ef}'), + ('\u{1f7f1}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8af}'), + ('\u{1f8b2}', '\u{1f8ff}'), + ('\u{1fa54}', '\u{1fa5f}'), + ('\u{1fa6e}', '\u{1fa6f}'), + ('\u{1fa7d}', '\u{1fa7f}'), + ('\u{1fa89}', '\u{1fa8f}'), + ('\u{1fabe}', '\u{1fabe}'), + ('\u{1fac6}', '\u{1facd}'), + ('\u{1fadc}', '\u{1fadf}'), + ('\u{1fae9}', '\u{1faef}'), + ('\u{1faf9}', '\u{1faff}'), + ('\u{1fb93}', '\u{1fb93}'), + ('\u{1fbcb}', '\u{1fbef}'), + ('\u{1fbfa}', '\u{1ffff}'), + ('\u{2a6e0}', '\u{2a6ff}'), + ('\u{2b73a}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{2ffff}'), + ('\u{3134b}', '\u{3134f}'), + ('\u{323b0}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const UPPERCASE_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'DŽ'), + ('LJ', 'LJ'), + ('NJ', 'NJ'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℋ', 'ℍ'), + ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℰ', 'ℳ'), + ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), + ('Ↄ', 'Ↄ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𝐀', '𝐙'), + ('𝐴', '𝑍'), + ('𝑨', '𝒁'), + ('𝒜', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒵'), + ('𝓐', '𝓩'), + ('𝔄', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔸', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕬', '𝖅'), + ('𝖠', '𝖹'), + ('𝗔', '𝗭'), + ('𝘈', '𝘡'), + ('𝘼', '𝙕'), + ('𝙰', '𝚉'), + ('𝚨', '𝛀'), + ('𝛢', '𝛺'), + ('𝜜', '𝜴'), + ('𝝖', '𝝮'), + ('𝞐', '𝞨'), + ('𝟊', '𝟊'), + ('𞤀', '𞤡'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs b/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs new file mode 100644 index 0000000..294dfbd --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs @@ -0,0 +1,1416 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate grapheme-cluster-break ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("CR", CR), + ("Control", CONTROL), + ("Extend", EXTEND), + ("L", L), + ("LF", LF), + ("LV", LV), + ("LVT", LVT), + ("Prepend", PREPEND), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("SpacingMark", SPACINGMARK), + ("T", T), + ("V", V), + ("ZWJ", ZWJ), +]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const CONTROL: &'static [(char, char)] = &[ + ('\0', '\t'), + ('\u{b}', '\u{c}'), + ('\u{e}', '\u{1f}'), + ('\u{7f}', '\u{9f}'), + ('\u{ad}', '\u{ad}'), + ('\u{61c}', '\u{61c}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200b}'), + ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff0}', '\u{fffb}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0000}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9be}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11340}', '\u{11340}'), + ('\u{11357}', '\u{11357}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115af}', '\u{115af}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{11930}', '\u{11930}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('🏻', '🏿'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const L: &'static [(char, char)] = &[('ᄀ', 'ᅟ'), ('ꥠ', 'ꥼ')]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const LV: &'static [(char, char)] = &[ + ('가', '가'), + ('개', '개'), + ('갸', '갸'), + ('걔', '걔'), + ('거', '거'), + ('게', '게'), + ('겨', '겨'), + ('계', '계'), + ('고', '고'), + ('과', '과'), + ('괘', '괘'), + ('괴', '괴'), + ('교', '교'), + ('구', '구'), + ('궈', '궈'), + ('궤', '궤'), + ('귀', '귀'), + ('규', '규'), + ('그', '그'), + ('긔', '긔'), + ('기', '기'), + ('까', '까'), + ('깨', '깨'), + ('꺄', '꺄'), + ('꺠', '꺠'), + ('꺼', '꺼'), + ('께', '께'), + ('껴', '껴'), + ('꼐', '꼐'), + ('꼬', '꼬'), + ('꽈', '꽈'), + ('꽤', '꽤'), + ('꾀', '꾀'), + ('꾜', '꾜'), + ('꾸', '꾸'), + ('꿔', '꿔'), + ('꿰', '꿰'), + ('뀌', '뀌'), + ('뀨', '뀨'), + ('끄', '끄'), + ('끠', '끠'), + ('끼', '끼'), + ('나', '나'), + ('내', '내'), + ('냐', '냐'), + ('냬', '냬'), + ('너', '너'), + ('네', '네'), + ('녀', '녀'), + ('녜', '녜'), + ('노', '노'), + ('놔', '놔'), + ('놰', '놰'), + ('뇌', '뇌'), + ('뇨', '뇨'), + ('누', '누'), + ('눠', '눠'), + ('눼', '눼'), + ('뉘', '뉘'), + ('뉴', '뉴'), + ('느', '느'), + ('늬', '늬'), + ('니', '니'), + ('다', '다'), + ('대', '대'), + ('댜', '댜'), + ('댸', '댸'), + ('더', '더'), + ('데', '데'), + ('뎌', '뎌'), + ('뎨', '뎨'), + ('도', '도'), + ('돠', '돠'), + ('돼', '돼'), + ('되', '되'), + ('됴', '됴'), + ('두', '두'), + ('둬', '둬'), + ('뒈', '뒈'), + ('뒤', '뒤'), + ('듀', '듀'), + ('드', '드'), + ('듸', '듸'), + ('디', '디'), + ('따', '따'), + ('때', '때'), + ('땨', '땨'), + ('떄', '떄'), + ('떠', '떠'), + ('떼', '떼'), + ('뗘', '뗘'), + ('뗴', '뗴'), + ('또', '또'), + ('똬', '똬'), + ('뙈', '뙈'), + ('뙤', '뙤'), + ('뚀', '뚀'), + ('뚜', '뚜'), + ('뚸', '뚸'), + ('뛔', '뛔'), + ('뛰', '뛰'), + ('뜌', '뜌'), + ('뜨', '뜨'), + ('띄', '띄'), + ('띠', '띠'), + ('라', '라'), + ('래', '래'), + ('랴', '랴'), + ('럐', '럐'), + ('러', '러'), + ('레', '레'), + ('려', '려'), + ('례', '례'), + ('로', '로'), + ('롸', '롸'), + ('뢔', '뢔'), + ('뢰', '뢰'), + ('료', '료'), + ('루', '루'), + ('뤄', '뤄'), + ('뤠', '뤠'), + ('뤼', '뤼'), + ('류', '류'), + ('르', '르'), + ('릐', '릐'), + ('리', '리'), + ('마', '마'), + ('매', '매'), + ('먀', '먀'), + ('먜', '먜'), + ('머', '머'), + ('메', '메'), + ('며', '며'), + ('몌', '몌'), + ('모', '모'), + ('뫄', '뫄'), + ('뫠', '뫠'), + ('뫼', '뫼'), + ('묘', '묘'), + ('무', '무'), + ('뭐', '뭐'), + ('뭬', '뭬'), + ('뮈', '뮈'), + ('뮤', '뮤'), + ('므', '므'), + ('믜', '믜'), + ('미', '미'), + ('바', '바'), + ('배', '배'), + ('뱌', '뱌'), + ('뱨', '뱨'), + ('버', '버'), + ('베', '베'), + ('벼', '벼'), + ('볘', '볘'), + ('보', '보'), + ('봐', '봐'), + ('봬', '봬'), + ('뵈', '뵈'), + ('뵤', '뵤'), + ('부', '부'), + ('붜', '붜'), + ('붸', '붸'), + ('뷔', '뷔'), + ('뷰', '뷰'), + ('브', '브'), + ('븨', '븨'), + ('비', '비'), + ('빠', '빠'), + ('빼', '빼'), + ('뺘', '뺘'), + ('뺴', '뺴'), + ('뻐', '뻐'), + ('뻬', '뻬'), + ('뼈', '뼈'), + ('뼤', '뼤'), + ('뽀', '뽀'), + ('뽜', '뽜'), + ('뽸', '뽸'), + ('뾔', '뾔'), + ('뾰', '뾰'), + ('뿌', '뿌'), + ('뿨', '뿨'), + ('쀄', '쀄'), + ('쀠', '쀠'), + ('쀼', '쀼'), + ('쁘', '쁘'), + ('쁴', '쁴'), + ('삐', '삐'), + ('사', '사'), + ('새', '새'), + ('샤', '샤'), + ('섀', '섀'), + ('서', '서'), + ('세', '세'), + ('셔', '셔'), + ('셰', '셰'), + ('소', '소'), + ('솨', '솨'), + ('쇄', '쇄'), + ('쇠', '쇠'), + ('쇼', '쇼'), + ('수', '수'), + ('숴', '숴'), + ('쉐', '쉐'), + ('쉬', '쉬'), + ('슈', '슈'), + ('스', '스'), + ('싀', '싀'), + ('시', '시'), + ('싸', '싸'), + ('쌔', '쌔'), + ('쌰', '쌰'), + ('썌', '썌'), + ('써', '써'), + ('쎄', '쎄'), + ('쎠', '쎠'), + ('쎼', '쎼'), + ('쏘', '쏘'), + ('쏴', '쏴'), + ('쐐', '쐐'), + ('쐬', '쐬'), + ('쑈', '쑈'), + ('쑤', '쑤'), + ('쒀', '쒀'), + ('쒜', '쒜'), + ('쒸', '쒸'), + ('쓔', '쓔'), + ('쓰', '쓰'), + ('씌', '씌'), + ('씨', '씨'), + ('아', '아'), + ('애', '애'), + ('야', '야'), + ('얘', '얘'), + ('어', '어'), + ('에', '에'), + ('여', '여'), + ('예', '예'), + ('오', '오'), + ('와', '와'), + ('왜', '왜'), + ('외', '외'), + ('요', '요'), + ('우', '우'), + ('워', '워'), + ('웨', '웨'), + ('위', '위'), + ('유', '유'), + ('으', '으'), + ('의', '의'), + ('이', '이'), + ('자', '자'), + ('재', '재'), + ('쟈', '쟈'), + ('쟤', '쟤'), + ('저', '저'), + ('제', '제'), + ('져', '져'), + ('졔', '졔'), + ('조', '조'), + ('좌', '좌'), + ('좨', '좨'), + ('죄', '죄'), + ('죠', '죠'), + ('주', '주'), + ('줘', '줘'), + ('줴', '줴'), + ('쥐', '쥐'), + ('쥬', '쥬'), + ('즈', '즈'), + ('즤', '즤'), + ('지', '지'), + ('짜', '짜'), + ('째', '째'), + ('쨔', '쨔'), + ('쨰', '쨰'), + ('쩌', '쩌'), + ('쩨', '쩨'), + ('쪄', '쪄'), + ('쪠', '쪠'), + ('쪼', '쪼'), + ('쫘', '쫘'), + ('쫴', '쫴'), + ('쬐', '쬐'), + ('쬬', '쬬'), + ('쭈', '쭈'), + ('쭤', '쭤'), + ('쮀', '쮀'), + ('쮜', '쮜'), + ('쮸', '쮸'), + ('쯔', '쯔'), + ('쯰', '쯰'), + ('찌', '찌'), + ('차', '차'), + ('채', '채'), + ('챠', '챠'), + ('챼', '챼'), + ('처', '처'), + ('체', '체'), + ('쳐', '쳐'), + ('쳬', '쳬'), + ('초', '초'), + ('촤', '촤'), + ('쵀', '쵀'), + ('최', '최'), + ('쵸', '쵸'), + ('추', '추'), + ('춰', '춰'), + ('췌', '췌'), + ('취', '취'), + ('츄', '츄'), + ('츠', '츠'), + ('츼', '츼'), + ('치', '치'), + ('카', '카'), + ('캐', '캐'), + ('캬', '캬'), + ('컈', '컈'), + ('커', '커'), + ('케', '케'), + ('켜', '켜'), + ('켸', '켸'), + ('코', '코'), + ('콰', '콰'), + ('쾌', '쾌'), + ('쾨', '쾨'), + ('쿄', '쿄'), + ('쿠', '쿠'), + ('쿼', '쿼'), + ('퀘', '퀘'), + ('퀴', '퀴'), + ('큐', '큐'), + ('크', '크'), + ('킈', '킈'), + ('키', '키'), + ('타', '타'), + ('태', '태'), + ('탸', '탸'), + ('턔', '턔'), + ('터', '터'), + ('테', '테'), + ('텨', '텨'), + ('톄', '톄'), + ('토', '토'), + ('톼', '톼'), + ('퇘', '퇘'), + ('퇴', '퇴'), + ('툐', '툐'), + ('투', '투'), + ('퉈', '퉈'), + ('퉤', '퉤'), + ('튀', '튀'), + ('튜', '튜'), + ('트', '트'), + ('틔', '틔'), + ('티', '티'), + ('파', '파'), + ('패', '패'), + ('퍄', '퍄'), + ('퍠', '퍠'), + ('퍼', '퍼'), + ('페', '페'), + ('펴', '펴'), + ('폐', '폐'), + ('포', '포'), + ('퐈', '퐈'), + ('퐤', '퐤'), + ('푀', '푀'), + ('표', '표'), + ('푸', '푸'), + ('풔', '풔'), + ('풰', '풰'), + ('퓌', '퓌'), + ('퓨', '퓨'), + ('프', '프'), + ('픠', '픠'), + ('피', '피'), + ('하', '하'), + ('해', '해'), + ('햐', '햐'), + ('햬', '햬'), + ('허', '허'), + ('헤', '헤'), + ('혀', '혀'), + ('혜', '혜'), + ('호', '호'), + ('화', '화'), + ('홰', '홰'), + ('회', '회'), + ('효', '효'), + ('후', '후'), + ('훠', '훠'), + ('훼', '훼'), + ('휘', '휘'), + ('휴', '휴'), + ('흐', '흐'), + ('희', '희'), + ('히', '히'), +]; + +pub const LVT: &'static [(char, char)] = &[ + ('각', '갛'), + ('객', '갷'), + ('갹', '걓'), + ('걕', '걯'), + ('걱', '겋'), + ('겍', '겧'), + ('격', '곃'), + ('곅', '곟'), + ('곡', '곻'), + ('곽', '괗'), + ('괙', '괳'), + ('괵', '굏'), + ('굑', '굫'), + ('국', '궇'), + ('궉', '궣'), + ('궥', '궿'), + ('귁', '귛'), + ('귝', '귷'), + ('극', '긓'), + ('긕', '긯'), + ('긱', '깋'), + ('깍', '깧'), + ('깩', '꺃'), + ('꺅', '꺟'), + ('꺡', '꺻'), + ('꺽', '껗'), + ('껙', '껳'), + ('껵', '꼏'), + ('꼑', '꼫'), + ('꼭', '꽇'), + ('꽉', '꽣'), + ('꽥', '꽿'), + ('꾁', '꾛'), + ('꾝', '꾷'), + ('꾹', '꿓'), + ('꿕', '꿯'), + ('꿱', '뀋'), + ('뀍', '뀧'), + ('뀩', '끃'), + ('끅', '끟'), + ('끡', '끻'), + ('끽', '낗'), + ('낙', '낳'), + ('낵', '냏'), + ('냑', '냫'), + ('냭', '넇'), + ('넉', '넣'), + ('넥', '넿'), + ('녁', '녛'), + ('녝', '녷'), + ('녹', '놓'), + ('놕', '놯'), + ('놱', '뇋'), + ('뇍', '뇧'), + ('뇩', '눃'), + ('눅', '눟'), + ('눡', '눻'), + ('눽', '뉗'), + ('뉙', '뉳'), + ('뉵', '늏'), + ('늑', '늫'), + ('늭', '닇'), + ('닉', '닣'), + ('닥', '닿'), + ('댁', '댛'), + ('댝', '댷'), + ('댹', '덓'), + ('덕', '덯'), + ('덱', '뎋'), + ('뎍', '뎧'), + ('뎩', '돃'), + ('독', '돟'), + ('돡', '돻'), + ('돽', '됗'), + ('됙', '됳'), + ('됵', '둏'), + ('둑', '둫'), + ('둭', '뒇'), + ('뒉', '뒣'), + ('뒥', '뒿'), + ('듁', '듛'), + ('득', '듷'), + ('듹', '딓'), + ('딕', '딯'), + ('딱', '땋'), + ('땍', '땧'), + ('땩', '떃'), + ('떅', '떟'), + ('떡', '떻'), + ('떽', '뗗'), + ('뗙', '뗳'), + ('뗵', '똏'), + ('똑', '똫'), + ('똭', '뙇'), + ('뙉', '뙣'), + ('뙥', '뙿'), + ('뚁', '뚛'), + ('뚝', '뚷'), + ('뚹', '뛓'), + ('뛕', '뛯'), + ('뛱', '뜋'), + ('뜍', '뜧'), + ('뜩', '띃'), + ('띅', '띟'), + ('띡', '띻'), + ('락', '랗'), + ('랙', '랳'), + ('략', '럏'), + ('럑', '럫'), + ('럭', '렇'), + ('렉', '렣'), + ('력', '렿'), + ('롁', '롛'), + ('록', '롷'), + ('롹', '뢓'), + ('뢕', '뢯'), + ('뢱', '룋'), + ('룍', '룧'), + ('룩', '뤃'), + ('뤅', '뤟'), + ('뤡', '뤻'), + ('뤽', '륗'), + ('륙', '륳'), + ('륵', '릏'), + ('릑', '릫'), + ('릭', '맇'), + ('막', '맣'), + ('맥', '맿'), + ('먁', '먛'), + ('먝', '먷'), + ('먹', '멓'), + ('멕', '멯'), + ('멱', '몋'), + ('몍', '몧'), + ('목', '뫃'), + ('뫅', '뫟'), + ('뫡', '뫻'), + ('뫽', '묗'), + ('묙', '묳'), + ('묵', '뭏'), + ('뭑', '뭫'), + ('뭭', '뮇'), + ('뮉', '뮣'), + ('뮥', '뮿'), + ('믁', '믛'), + ('믝', '믷'), + ('믹', '밓'), + ('박', '밯'), + ('백', '뱋'), + ('뱍', '뱧'), + ('뱩', '벃'), + ('벅', '벟'), + ('벡', '벻'), + ('벽', '볗'), + ('볙', '볳'), + ('복', '봏'), + ('봑', '봫'), + ('봭', '뵇'), + ('뵉', '뵣'), + ('뵥', '뵿'), + ('북', '붛'), + ('붝', '붷'), + ('붹', '뷓'), + ('뷕', '뷯'), + ('뷱', '븋'), + ('븍', '븧'), + ('븩', '빃'), + ('빅', '빟'), + ('빡', '빻'), + ('빽', '뺗'), + ('뺙', '뺳'), + ('뺵', '뻏'), + ('뻑', '뻫'), + ('뻭', '뼇'), + ('뼉', '뼣'), + ('뼥', '뼿'), + ('뽁', '뽛'), + ('뽝', '뽷'), + ('뽹', '뾓'), + ('뾕', '뾯'), + ('뾱', '뿋'), + ('뿍', '뿧'), + ('뿩', '쀃'), + ('쀅', '쀟'), + ('쀡', '쀻'), + ('쀽', '쁗'), + ('쁙', '쁳'), + ('쁵', '삏'), + ('삑', '삫'), + ('삭', '샇'), + ('색', '샣'), + ('샥', '샿'), + ('섁', '섛'), + ('석', '섷'), + ('섹', '셓'), + ('셕', '셯'), + ('셱', '솋'), + ('속', '솧'), + ('솩', '쇃'), + ('쇅', '쇟'), + ('쇡', '쇻'), + ('쇽', '숗'), + ('숙', '숳'), + ('숵', '쉏'), + ('쉑', '쉫'), + ('쉭', '슇'), + ('슉', '슣'), + ('슥', '슿'), + ('싁', '싛'), + ('식', '싷'), + ('싹', '쌓'), + ('쌕', '쌯'), + ('쌱', '썋'), + ('썍', '썧'), + ('썩', '쎃'), + ('쎅', '쎟'), + ('쎡', '쎻'), + ('쎽', '쏗'), + ('쏙', '쏳'), + ('쏵', '쐏'), + ('쐑', '쐫'), + ('쐭', '쑇'), + ('쑉', '쑣'), + ('쑥', '쑿'), + ('쒁', '쒛'), + ('쒝', '쒷'), + ('쒹', '쓓'), + ('쓕', '쓯'), + ('쓱', '씋'), + ('씍', '씧'), + ('씩', '앃'), + ('악', '앟'), + ('액', '앻'), + ('약', '얗'), + ('얙', '얳'), + ('억', '엏'), + ('엑', '엫'), + ('역', '옇'), + ('옉', '옣'), + ('옥', '옿'), + ('왁', '왛'), + ('왝', '왷'), + ('왹', '욓'), + ('욕', '욯'), + ('욱', '웋'), + ('웍', '웧'), + ('웩', '윃'), + ('윅', '윟'), + ('육', '윻'), + ('윽', '읗'), + ('읙', '읳'), + ('익', '잏'), + ('작', '잫'), + ('잭', '쟇'), + ('쟉', '쟣'), + ('쟥', '쟿'), + ('적', '젛'), + ('젝', '젷'), + ('젹', '졓'), + ('졕', '졯'), + ('족', '좋'), + ('좍', '좧'), + ('좩', '죃'), + ('죅', '죟'), + ('죡', '죻'), + ('죽', '줗'), + ('줙', '줳'), + ('줵', '쥏'), + ('쥑', '쥫'), + ('쥭', '즇'), + ('즉', '즣'), + ('즥', '즿'), + ('직', '짛'), + ('짝', '짷'), + ('짹', '쨓'), + ('쨕', '쨯'), + ('쨱', '쩋'), + ('쩍', '쩧'), + ('쩩', '쪃'), + ('쪅', '쪟'), + ('쪡', '쪻'), + ('쪽', '쫗'), + ('쫙', '쫳'), + ('쫵', '쬏'), + ('쬑', '쬫'), + ('쬭', '쭇'), + ('쭉', '쭣'), + ('쭥', '쭿'), + ('쮁', '쮛'), + ('쮝', '쮷'), + ('쮹', '쯓'), + ('쯕', '쯯'), + ('쯱', '찋'), + ('찍', '찧'), + ('착', '챃'), + ('책', '챟'), + ('챡', '챻'), + ('챽', '첗'), + ('척', '첳'), + ('첵', '쳏'), + ('쳑', '쳫'), + ('쳭', '촇'), + ('촉', '촣'), + ('촥', '촿'), + ('쵁', '쵛'), + ('쵝', '쵷'), + ('쵹', '춓'), + ('축', '춯'), + ('춱', '췋'), + ('췍', '췧'), + ('췩', '츃'), + ('츅', '츟'), + ('측', '츻'), + ('츽', '칗'), + ('칙', '칳'), + ('칵', '캏'), + ('캑', '캫'), + ('캭', '컇'), + ('컉', '컣'), + ('컥', '컿'), + ('켁', '켛'), + ('켝', '켷'), + ('켹', '콓'), + ('콕', '콯'), + ('콱', '쾋'), + ('쾍', '쾧'), + ('쾩', '쿃'), + ('쿅', '쿟'), + ('쿡', '쿻'), + ('쿽', '퀗'), + ('퀙', '퀳'), + ('퀵', '큏'), + ('큑', '큫'), + ('큭', '킇'), + ('킉', '킣'), + ('킥', '킿'), + ('탁', '탛'), + ('택', '탷'), + ('탹', '턓'), + ('턕', '턯'), + ('턱', '텋'), + ('텍', '텧'), + ('텩', '톃'), + ('톅', '톟'), + ('톡', '톻'), + ('톽', '퇗'), + ('퇙', '퇳'), + ('퇵', '툏'), + ('툑', '툫'), + ('툭', '퉇'), + ('퉉', '퉣'), + ('퉥', '퉿'), + ('튁', '튛'), + ('튝', '튷'), + ('특', '틓'), + ('틕', '틯'), + ('틱', '팋'), + ('팍', '팧'), + ('팩', '퍃'), + ('퍅', '퍟'), + ('퍡', '퍻'), + ('퍽', '펗'), + ('펙', '펳'), + ('펵', '폏'), + ('폑', '폫'), + ('폭', '퐇'), + ('퐉', '퐣'), + ('퐥', '퐿'), + ('푁', '푛'), + ('푝', '푷'), + ('푹', '풓'), + ('풕', '풯'), + ('풱', '퓋'), + ('퓍', '퓧'), + ('퓩', '픃'), + ('픅', '픟'), + ('픡', '픻'), + ('픽', '핗'), + ('학', '핳'), + ('핵', '햏'), + ('햑', '햫'), + ('햭', '헇'), + ('헉', '헣'), + ('헥', '헿'), + ('혁', '혛'), + ('혝', '혷'), + ('혹', '홓'), + ('확', '홯'), + ('홱', '횋'), + ('획', '횧'), + ('횩', '훃'), + ('훅', '훟'), + ('훡', '훻'), + ('훽', '휗'), + ('휙', '휳'), + ('휵', '흏'), + ('흑', '흫'), + ('흭', '힇'), + ('힉', '힣'), +]; + +pub const PREPEND: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('ൎ', 'ൎ'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('𑇂', '𑇃'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑨺', '𑨺'), + ('𑪄', '𑪉'), + ('𑵆', '𑵆'), + ('𑼂', '𑼂'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; + +pub const SPACINGMARK: &'static [(char, char)] = &[ + ('ः', 'ः'), + ('ऻ', 'ऻ'), + ('ा', 'ी'), + ('ॉ', 'ौ'), + ('ॎ', 'ॏ'), + ('ং', 'ঃ'), + ('ি', 'ী'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('ਃ', 'ਃ'), + ('ਾ', 'ੀ'), + ('ઃ', 'ઃ'), + ('ા', 'ી'), + ('ૉ', 'ૉ'), + ('ો', 'ૌ'), + ('ଂ', 'ଃ'), + ('ୀ', 'ୀ'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('ி', 'ி'), + ('ு', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('ఁ', 'ః'), + ('ు', 'ౄ'), + ('ಂ', 'ಃ'), + ('ಾ', 'ಾ'), + ('ೀ', 'ು'), + ('ೃ', 'ೄ'), + ('ೇ', 'ೈ'), + ('ೊ', 'ೋ'), + ('ೳ', 'ೳ'), + ('ം', 'ഃ'), + ('ി', 'ീ'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('ං', 'ඃ'), + ('ැ', 'ෑ'), + ('ෘ', 'ෞ'), + ('ෲ', 'ෳ'), + ('ำ', 'ำ'), + ('ຳ', 'ຳ'), + ('༾', '༿'), + ('ཿ', 'ཿ'), + ('ေ', 'ေ'), + ('ျ', 'ြ'), + ('ၖ', 'ၗ'), + ('ႄ', 'ႄ'), + ('᜕', '᜕'), + ('᜴', '᜴'), + ('ា', 'ា'), + ('ើ', 'ៅ'), + ('ះ', 'ៈ'), + ('ᤣ', 'ᤦ'), + ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), + ('ᤳ', 'ᤸ'), + ('ᨙ', 'ᨚ'), + ('ᩕ', 'ᩕ'), + ('ᩗ', 'ᩗ'), + ('ᩭ', 'ᩲ'), + ('ᬄ', 'ᬄ'), + ('ᬻ', 'ᬻ'), + ('ᬽ', 'ᭁ'), + ('ᭃ', '᭄'), + ('ᮂ', 'ᮂ'), + ('ᮡ', 'ᮡ'), + ('ᮦ', 'ᮧ'), + ('᮪', '᮪'), + ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), + ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), + ('ᰤ', 'ᰫ'), + ('ᰴ', 'ᰵ'), + ('᳡', '᳡'), + ('᳷', '᳷'), + ('ꠣ', 'ꠤ'), + ('ꠧ', 'ꠧ'), + ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), + ('ꥒ', '꥓'), + ('ꦃ', 'ꦃ'), + ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), + ('ꦾ', '꧀'), + ('ꨯ', 'ꨰ'), + ('ꨳ', 'ꨴ'), + ('ꩍ', 'ꩍ'), + ('ꫫ', 'ꫫ'), + ('ꫮ', 'ꫯ'), + ('ꫵ', 'ꫵ'), + ('ꯣ', 'ꯤ'), + ('ꯦ', 'ꯧ'), + ('ꯩ', 'ꯪ'), + ('꯬', '꯬'), + ('𑀀', '𑀀'), + ('𑀂', '𑀂'), + ('𑂂', '𑂂'), + ('𑂰', '𑂲'), + ('𑂷', '𑂸'), + ('𑄬', '𑄬'), + ('𑅅', '𑅆'), + ('𑆂', '𑆂'), + ('𑆳', '𑆵'), + ('𑆿', '𑇀'), + ('𑇎', '𑇎'), + ('𑈬', '𑈮'), + ('𑈲', '𑈳'), + ('𑈵', '𑈵'), + ('𑋠', '𑋢'), + ('𑌂', '𑌃'), + ('𑌿', '𑌿'), + ('𑍁', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍢', '𑍣'), + ('𑐵', '𑐷'), + ('𑑀', '𑑁'), + ('𑑅', '𑑅'), + ('𑒱', '𑒲'), + ('𑒹', '𑒹'), + ('𑒻', '𑒼'), + ('𑒾', '𑒾'), + ('𑓁', '𑓁'), + ('𑖰', '𑖱'), + ('𑖸', '𑖻'), + ('𑖾', '𑖾'), + ('𑘰', '𑘲'), + ('𑘻', '𑘼'), + ('𑘾', '𑘾'), + ('𑚬', '𑚬'), + ('𑚮', '𑚯'), + ('𑚶', '𑚶'), + ('𑜦', '𑜦'), + ('𑠬', '𑠮'), + ('𑠸', '𑠸'), + ('𑤱', '𑤵'), + ('𑤷', '𑤸'), + ('𑤽', '𑤽'), + ('𑥀', '𑥀'), + ('𑥂', '𑥂'), + ('𑧑', '𑧓'), + ('𑧜', '𑧟'), + ('𑧤', '𑧤'), + ('𑨹', '𑨹'), + ('𑩗', '𑩘'), + ('𑪗', '𑪗'), + ('𑰯', '𑰯'), + ('𑰾', '𑰾'), + ('𑲩', '𑲩'), + ('𑲱', '𑲱'), + ('𑲴', '𑲴'), + ('𑶊', '𑶎'), + ('𑶓', '𑶔'), + ('𑶖', '𑶖'), + ('𑻵', '𑻶'), + ('𑼃', '𑼃'), + ('𑼴', '𑼵'), + ('𑼾', '𑼿'), + ('𑽁', '𑽁'), + ('𖽑', '𖾇'), + ('𖿰', '𖿱'), + ('𝅦', '𝅦'), + ('𝅭', '𝅭'), +]; + +pub const T: &'static [(char, char)] = &[('ᆨ', 'ᇿ'), ('ퟋ', 'ퟻ')]; + +pub const V: &'static [(char, char)] = &[('ᅠ', 'ᆧ'), ('ힰ', 'ퟆ')]; + +pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/mod.rs b/vendor/regex-syntax/src/unicode_tables/mod.rs new file mode 100644 index 0000000..20736c7 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/mod.rs @@ -0,0 +1,57 @@ +#[cfg(feature = "unicode-age")] +pub mod age; + +#[cfg(feature = "unicode-case")] +pub mod case_folding_simple; + +#[cfg(feature = "unicode-gencat")] +pub mod general_category; + +#[cfg(feature = "unicode-segment")] +pub mod grapheme_cluster_break; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] +#[allow(dead_code)] +pub mod perl_decimal; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] +#[allow(dead_code)] +pub mod perl_space; + +#[cfg(feature = "unicode-perl")] +pub mod perl_word; + +#[cfg(feature = "unicode-bool")] +pub mod property_bool; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] +pub mod property_names; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] +pub mod property_values; + +#[cfg(feature = "unicode-script")] +pub mod script; + +#[cfg(feature = "unicode-script")] +pub mod script_extension; + +#[cfg(feature = "unicode-segment")] +pub mod sentence_break; + +#[cfg(feature = "unicode-segment")] +pub mod word_break; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs b/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs new file mode 100644 index 0000000..4f4c08a --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs @@ -0,0 +1,77 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("Decimal_Number", DECIMAL_NUMBER)]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_space.rs b/vendor/regex-syntax/src/unicode_tables/perl_space.rs new file mode 100644 index 0000000..1741695 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_space.rs @@ -0,0 +1,23 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool ucd-15.0.0 --chars --include whitespace +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("White_Space", WHITE_SPACE)]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_word.rs b/vendor/regex-syntax/src/unicode_tables/perl_word.rs new file mode 100644 index 0000000..c1b66bd --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_word.rs @@ -0,0 +1,781 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('\u{200c}', '\u{200d}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_bool.rs b/vendor/regex-syntax/src/unicode_tables/property_bool.rs new file mode 100644 index 0000000..a3e84b5 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_bool.rs @@ -0,0 +1,11367 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ASCII_Hex_Digit", ASCII_HEX_DIGIT), + ("Alphabetic", ALPHABETIC), + ("Bidi_Control", BIDI_CONTROL), + ("Bidi_Mirrored", BIDI_MIRRORED), + ("Case_Ignorable", CASE_IGNORABLE), + ("Cased", CASED), + ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED), + ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED), + ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED), + ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED), + ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED), + ("Dash", DASH), + ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT), + ("Deprecated", DEPRECATED), + ("Diacritic", DIACRITIC), + ("Emoji", EMOJI), + ("Emoji_Component", EMOJI_COMPONENT), + ("Emoji_Modifier", EMOJI_MODIFIER), + ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE), + ("Emoji_Presentation", EMOJI_PRESENTATION), + ("Extended_Pictographic", EXTENDED_PICTOGRAPHIC), + ("Extender", EXTENDER), + ("Grapheme_Base", GRAPHEME_BASE), + ("Grapheme_Extend", GRAPHEME_EXTEND), + ("Grapheme_Link", GRAPHEME_LINK), + ("Hex_Digit", HEX_DIGIT), + ("Hyphen", HYPHEN), + ("IDS_Binary_Operator", IDS_BINARY_OPERATOR), + ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR), + ("ID_Continue", ID_CONTINUE), + ("ID_Start", ID_START), + ("Ideographic", IDEOGRAPHIC), + ("Join_Control", JOIN_CONTROL), + ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION), + ("Lowercase", LOWERCASE), + ("Math", MATH), + ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT), + ("Other_Alphabetic", OTHER_ALPHABETIC), + ("Other_Default_Ignorable_Code_Point", OTHER_DEFAULT_IGNORABLE_CODE_POINT), + ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND), + ("Other_ID_Continue", OTHER_ID_CONTINUE), + ("Other_ID_Start", OTHER_ID_START), + ("Other_Lowercase", OTHER_LOWERCASE), + ("Other_Math", OTHER_MATH), + ("Other_Uppercase", OTHER_UPPERCASE), + ("Pattern_Syntax", PATTERN_SYNTAX), + ("Pattern_White_Space", PATTERN_WHITE_SPACE), + ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK), + ("Quotation_Mark", QUOTATION_MARK), + ("Radical", RADICAL), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Sentence_Terminal", SENTENCE_TERMINAL), + ("Soft_Dotted", SOFT_DOTTED), + ("Terminal_Punctuation", TERMINAL_PUNCTUATION), + ("Unified_Ideograph", UNIFIED_IDEOGRAPH), + ("Uppercase", UPPERCASE), + ("Variation_Selector", VARIATION_SELECTOR), + ("White_Space", WHITE_SPACE), + ("XID_Continue", XID_CONTINUE), + ("XID_Start", XID_START), +]; + +pub const ASCII_HEX_DIGIT: &'static [(char, char)] = + &[('0', '9'), ('A', 'F'), ('a', 'f')]; + +pub const ALPHABETIC: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{5b0}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '\u{657}'), + ('\u{659}', '\u{65f}'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6e1}', '\u{6e8}'), + ('\u{6ed}', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{73f}'), + ('ݍ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', '\u{817}'), + ('ࠚ', '\u{82c}'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('\u{8d4}', '\u{8df}'), + ('\u{8e3}', '\u{8e9}'), + ('\u{8f0}', 'ऻ'), + ('ऽ', 'ौ'), + ('ॎ', 'ॐ'), + ('\u{955}', '\u{963}'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('ৎ', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4c}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('\u{a70}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', 'ૌ'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('ૹ', '\u{afc}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('\u{b56}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4c}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccc}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('ൎ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', 'ๆ'), + ('\u{e4d}', '\u{e4d}'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', '\u{eb9}'), + ('\u{ebb}', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ecd}', '\u{ecd}'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f83}'), + ('ྈ', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('က', '\u{1036}'), + ('း', 'း'), + ('ျ', 'ဿ'), + ('ၐ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '\u{1713}'), + ('ᜟ', '\u{1733}'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', 'ឳ'), + ('ា', 'ៈ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', 'ᤸ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('ᩡ', '\u{1a74}'), + ('ᪧ', 'ᪧ'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1acc}', '\u{1ace}'), + ('\u{1b00}', 'ᬳ'), + ('\u{1b35}', 'ᭃ'), + ('ᭅ', 'ᭌ'), + ('\u{1b80}', '\u{1ba9}'), + ('\u{1bac}', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᯧ', '\u{1bf1}'), + ('ᰀ', '\u{1c36}'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('\u{1de7}', '\u{1df4}'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('\u{a674}', '\u{a67b}'), + ('ꙿ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠅ'), + ('ꠇ', 'ꠧ'), + ('ꡀ', 'ꡳ'), + ('ꢀ', 'ꣃ'), + ('\u{a8c5}', '\u{a8c5}'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a8ff}'), + ('ꤊ', '\u{a92a}'), + ('ꤰ', 'ꥒ'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', 'ꦲ'), + ('ꦴ', 'ꦿ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', '\u{aabe}'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', 'ꫵ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11045}'), + ('𑁱', '𑁵'), + ('\u{11080}', '𑂸'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('\u{11100}', '\u{11132}'), + ('𑅄', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑆿'), + ('𑇁', '𑇄'), + ('𑇎', '\u{111cf}'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11234}'), + ('\u{11237}', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112e8}'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍌'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('𑐀', '𑑁'), + ('\u{11443}', '𑑅'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑓁'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '\u{115b5}'), + ('𑖸', '𑖾'), + ('𑗘', '\u{115dd}'), + ('𑘀', '𑘾'), + ('\u{11640}', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑚀', '\u{116b5}'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172a}'), + ('𑝀', '𑝆'), + ('𑠀', '𑠸'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193c}'), + ('𑤿', '𑥂'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧟'), + ('𑧡', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '𑨲'), + ('\u{11a35}', '\u{11a3e}'), + ('𑩐', '𑪗'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑰾'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d41}'), + ('\u{11d43}', '\u{11d43}'), + ('𑵆', '\u{11d47}'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶖'), + ('𑶘', '𑶘'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f40}'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9e}', '\u{1bc9e}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('\u{1e947}', '\u{1e947}'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const BIDI_CONTROL: &'static [(char, char)] = &[ + ('\u{61c}', '\u{61c}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2066}', '\u{2069}'), +]; + +pub const BIDI_MIRRORED: &'static [(char, char)] = &[ + ('(', ')'), + ('<', '<'), + ('>', '>'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('«', '«'), + ('»', '»'), + ('༺', '༽'), + ('᚛', '᚜'), + ('‹', '›'), + ('⁅', '⁆'), + ('⁽', '⁾'), + ('₍', '₎'), + ('⅀', '⅀'), + ('∁', '∄'), + ('∈', '∍'), + ('∑', '∑'), + ('∕', '∖'), + ('√', '∝'), + ('∟', '∢'), + ('∤', '∤'), + ('∦', '∦'), + ('∫', '∳'), + ('∹', '∹'), + ('∻', '≌'), + ('≒', '≕'), + ('≟', '≠'), + ('≢', '≢'), + ('≤', '≫'), + ('≮', '⊌'), + ('⊏', '⊒'), + ('⊘', '⊘'), + ('⊢', '⊣'), + ('⊦', '⊸'), + ('⊾', '⊿'), + ('⋉', '⋍'), + ('⋐', '⋑'), + ('⋖', '⋭'), + ('⋰', '⋿'), + ('⌈', '⌋'), + ('⌠', '⌡'), + ('〈', '〉'), + ('❨', '❵'), + ('⟀', '⟀'), + ('⟃', '⟆'), + ('⟈', '⟉'), + ('⟋', '⟍'), + ('⟓', '⟖'), + ('⟜', '⟞'), + ('⟢', '⟯'), + ('⦃', '⦘'), + ('⦛', '⦠'), + ('⦢', '⦯'), + ('⦸', '⦸'), + ('⧀', '⧅'), + ('⧉', '⧉'), + ('⧎', '⧒'), + ('⧔', '⧕'), + ('⧘', '⧜'), + ('⧡', '⧡'), + ('⧣', '⧥'), + ('⧨', '⧩'), + ('⧴', '⧹'), + ('⧼', '⧽'), + ('⨊', '⨜'), + ('⨞', '⨡'), + ('⨤', '⨤'), + ('⨦', '⨦'), + ('⨩', '⨩'), + ('⨫', '⨮'), + ('⨴', '⨵'), + ('⨼', '⨾'), + ('⩗', '⩘'), + ('⩤', '⩥'), + ('⩪', '⩭'), + ('⩯', '⩰'), + ('⩳', '⩴'), + ('⩹', '⪣'), + ('⪦', '⪭'), + ('⪯', '⫖'), + ('⫝̸', '⫝̸'), + ('⫞', '⫞'), + ('⫢', '⫦'), + ('⫬', '⫮'), + ('⫳', '⫳'), + ('⫷', '⫻'), + ('⫽', '⫽'), + ('⯾', '⯾'), + ('⸂', '⸅'), + ('⸉', '⸊'), + ('⸌', '⸍'), + ('⸜', '⸝'), + ('⸠', '⸩'), + ('⹕', '⹜'), + ('〈', '】'), + ('〔', '〛'), + ('﹙', '﹞'), + ('﹤', '﹥'), + ('(', ')'), + ('<', '<'), + ('>', '>'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('⦅', '⦆'), + ('「', '」'), + ('𝛛', '𝛛'), + ('𝜕', '𝜕'), + ('𝝏', '𝝏'), + ('𝞉', '𝞉'), + ('𝟃', '𝟃'), +]; + +pub const CASE_IGNORABLE: &'static [(char, char)] = &[ + ('\'', '\''), + ('.', '.'), + (':', ':'), + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('\u{ad}', '\u{ad}'), + ('¯', '¯'), + ('´', '´'), + ('·', '¸'), + ('ʰ', '\u{36f}'), + ('ʹ', '͵'), + ('ͺ', 'ͺ'), + ('΄', '΅'), + ('·', '·'), + ('\u{483}', '\u{489}'), + ('ՙ', 'ՙ'), + ('՟', '՟'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('״', '״'), + ('\u{600}', '\u{605}'), + ('\u{610}', '\u{61a}'), + ('\u{61c}', '\u{61c}'), + ('ـ', 'ـ'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dd}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{70f}', '\u{70f}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('࢈', '࢈'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{89f}'), + ('ࣉ', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('ॱ', 'ॱ'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3f}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b56}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('ๆ', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('ჼ', 'ჼ'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180f}'), + ('ᡃ', 'ᡃ'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b34}'), + ('\u{1b36}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('ᱸ', 'ᱽ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('ᴬ', 'ᵪ'), + ('ᵸ', 'ᵸ'), + ('ᶛ', '\u{1dff}'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('\u{200b}', '\u{200f}'), + ('‘', '’'), + ('․', '․'), + ('‧', '‧'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ⱼ', 'ⱽ'), + ('\u{2cef}', '\u{2cf1}'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '々'), + ('\u{302a}', '\u{302d}'), + ('〱', '〵'), + ('〻', '〻'), + ('\u{3099}', 'ゞ'), + ('ー', 'ヾ'), + ('ꀕ', 'ꀕ'), + ('ꓸ', 'ꓽ'), + ('ꘌ', 'ꘌ'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', 'ꙿ'), + ('ꚜ', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('꜀', '꜡'), + ('ꝰ', 'ꝰ'), + ('ꞈ', '꞊'), + ('ꟲ', 'ꟴ'), + ('ꟸ', 'ꟹ'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('ꧏ', 'ꧏ'), + ('\u{a9e5}', 'ꧦ'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('ꩰ', 'ꩰ'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫝ', 'ꫝ'), + ('\u{aaec}', '\u{aaed}'), + ('ꫳ', 'ꫴ'), + ('\u{aaf6}', '\u{aaf6}'), + ('꭛', 'ꭟ'), + ('ꭩ', '꭫'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('﮲', '﯂'), + ('\u{fe00}', '\u{fe0f}'), + ('︓', '︓'), + ('\u{fe20}', '\u{fe2f}'), + ('﹒', '﹒'), + ('﹕', '﹕'), + ('\u{feff}', '\u{feff}'), + (''', '''), + ('.', '.'), + (':', ':'), + ('^', '^'), + ('`', '`'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + (' ̄', ' ̄'), + ('\u{fff9}', '\u{fffb}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{11340}', '\u{11340}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13430}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('𖭀', '𖭃'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d173}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '𞄽'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('𞓫', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '𞥋'), + ('🏻', '🏿'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const CASED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ƺ'), + ('Ƽ', 'ƿ'), + ('DŽ', 'ʓ'), + ('ʕ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՠ', 'ֈ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ჿ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅿ'), + ('Ↄ', 'ↄ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚝ'), + ('Ꜣ', 'ꞇ'), + ('Ꞌ', 'ꞎ'), + ('Ꞑ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟶ'), + ('ꟸ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('a', 'z'), + ('𐐀', '𐑏'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𑢠', '𑣟'), + ('𖹀', '𖹿'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞤀', '𞥃'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('µ', 'µ'), + ('À', 'Ö'), + ('Ø', 'ß'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('ʼn', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('ſ', 'ſ'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'Dž'), + ('LJ', 'Lj'), + ('NJ', 'Nj'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('ς', 'ς'), + ('Ϗ', 'ϑ'), + ('ϕ', 'ϖ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϰ', 'ϱ'), + ('ϴ', 'ϵ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('և', 'և'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẚ', 'ẛ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('ᾀ', 'ᾯ'), + ('ᾲ', 'ᾴ'), + ('ᾷ', 'ᾼ'), + ('ῂ', 'ῄ'), + ('ῇ', 'ῌ'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῷ', 'ῼ'), + ('Ω', 'Ω'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('µ', 'µ'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ķ'), + ('Ĺ', 'ƌ'), + ('Ǝ', 'ƚ'), + ('Ɯ', 'Ʃ'), + ('Ƭ', 'ƹ'), + ('Ƽ', 'ƽ'), + ('ƿ', 'ƿ'), + ('DŽ', 'Ƞ'), + ('Ȣ', 'ȳ'), + ('Ⱥ', 'ɔ'), + ('ɖ', 'ɗ'), + ('ə', 'ə'), + ('ɛ', 'ɜ'), + ('ɠ', 'ɡ'), + ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), + ('ɯ', 'ɯ'), + ('ɱ', 'ɲ'), + ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), + ('ʂ', 'ʃ'), + ('ʇ', 'ʌ'), + ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϑ'), + ('ϕ', 'ϵ'), + ('Ϸ', 'ϻ'), + ('Ͻ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ա', 'և'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), + ('ᶎ', 'ᶎ'), + ('Ḁ', 'ẛ'), + ('ẞ', 'ẞ'), + ('Ạ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('Ω', 'Ω'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅿ'), + ('Ↄ', 'ↄ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'Ɒ'), + ('Ⱳ', 'ⱳ'), + ('Ⱶ', 'ⱶ'), + ('Ȿ', 'ⳣ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚛ'), + ('Ꜣ', 'ꜯ'), + ('Ꜳ', 'ꝯ'), + ('Ꝺ', 'ꞇ'), + ('Ꞌ', 'Ɥ'), + ('Ꞑ', 'ꞔ'), + ('Ꞗ', 'Ɪ'), + ('Ʞ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('Ꟗ', 'ꟙ'), + ('Ꟶ', 'ꟶ'), + ('ꭓ', 'ꭓ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('a', 'z'), + ('𐐀', '𐑏'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𑢠', '𑣟'), + ('𖹀', '𖹿'), + ('𞤀', '𞥃'), +]; + +pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'Dž'), + ('LJ', 'Lj'), + ('NJ', 'Nj'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), + ('ᾨ', 'ᾯ'), + ('Ᾰ', 'ᾼ'), + ('Ὲ', 'ῌ'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'ῼ'), + ('Ω', 'Ω'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[ + ('a', 'z'), + ('µ', 'µ'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ķ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƚ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƹ'), + ('ƽ', 'ƽ'), + ('ƿ', 'ƿ'), + ('DŽ', 'DŽ'), + ('dž', 'LJ'), + ('lj', 'NJ'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'DZ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȳ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ɔ'), + ('ɖ', 'ɗ'), + ('ə', 'ə'), + ('ɛ', 'ɜ'), + ('ɠ', 'ɡ'), + ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), + ('ɯ', 'ɯ'), + ('ɱ', 'ɲ'), + ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), + ('ʂ', 'ʃ'), + ('ʇ', 'ʌ'), + ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), + ('\u{345}', '\u{345}'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϻ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ա', 'և'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), + ('ᶎ', 'ᶎ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẛ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱳ', 'ⱳ'), + ('ⱶ', 'ⱶ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳣ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜯ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝯ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞔ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟶ', 'ꟶ'), + ('ꭓ', 'ꭓ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𞤢', '𞥃'), +]; + +pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[ + ('a', 'z'), + ('µ', 'µ'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ķ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƚ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƹ'), + ('ƽ', 'ƽ'), + ('ƿ', 'ƿ'), + ('Dž', 'dž'), + ('Lj', 'lj'), + ('Nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('Dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȳ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ɔ'), + ('ɖ', 'ɗ'), + ('ə', 'ə'), + ('ɛ', 'ɜ'), + ('ɠ', 'ɡ'), + ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), + ('ɯ', 'ɯ'), + ('ɱ', 'ɲ'), + ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), + ('ʂ', 'ʃ'), + ('ʇ', 'ʌ'), + ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), + ('\u{345}', '\u{345}'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϻ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ա', 'և'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), + ('ᶎ', 'ᶎ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẛ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ᾼ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῌ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ῼ', 'ῼ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱳ', 'ⱳ'), + ('ⱶ', 'ⱶ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳣ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜯ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝯ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞔ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟶ', 'ꟶ'), + ('ꭓ', 'ꭓ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𞤢', '𞥃'), +]; + +pub const DASH: &'static [(char, char)] = &[ + ('-', '-'), + ('֊', '֊'), + ('־', '־'), + ('᐀', '᐀'), + ('᠆', '᠆'), + ('‐', '―'), + ('⁓', '⁓'), + ('⁻', '⁻'), + ('₋', '₋'), + ('−', '−'), + ('⸗', '⸗'), + ('⸚', '⸚'), + ('⸺', '⸻'), + ('⹀', '⹀'), + ('⹝', '⹝'), + ('〜', '〜'), + ('〰', '〰'), + ('゠', '゠'), + ('︱', '︲'), + ('﹘', '﹘'), + ('﹣', '﹣'), + ('-', '-'), + ('𐺭', '𐺭'), +]; + +pub const DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{34f}', '\u{34f}'), + ('\u{61c}', '\u{61c}'), + ('ᅟ', 'ᅠ'), + ('\u{17b4}', '\u{17b5}'), + ('\u{180b}', '\u{180f}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('ㅤ', 'ㅤ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{feff}', '\u{feff}'), + ('ᅠ', 'ᅠ'), + ('\u{fff0}', '\u{fff8}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0000}', '\u{e0fff}'), +]; + +pub const DEPRECATED: &'static [(char, char)] = &[ + ('ʼn', 'ʼn'), + ('ٳ', 'ٳ'), + ('\u{f77}', '\u{f77}'), + ('\u{f79}', '\u{f79}'), + ('ឣ', 'ឤ'), + ('\u{206a}', '\u{206f}'), + ('〈', '〉'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const DIACRITIC: &'static [(char, char)] = &[ + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('¯', '¯'), + ('´', '´'), + ('·', '¸'), + ('ʰ', '\u{34e}'), + ('\u{350}', '\u{357}'), + ('\u{35d}', '\u{362}'), + ('ʹ', '͵'), + ('ͺ', 'ͺ'), + ('΄', '΅'), + ('\u{483}', '\u{487}'), + ('ՙ', 'ՙ'), + ('\u{591}', '\u{5a1}'), + ('\u{5a3}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c4}'), + ('\u{64b}', '\u{652}'), + ('\u{657}', '\u{658}'), + ('\u{6df}', '\u{6e0}'), + ('ۥ', 'ۦ'), + ('\u{6ea}', '\u{6ec}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', 'ߵ'), + ('\u{818}', '\u{819}'), + ('\u{898}', '\u{89f}'), + ('ࣉ', '\u{8d2}'), + ('\u{8e3}', '\u{8fe}'), + ('\u{93c}', '\u{93c}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{954}'), + ('ॱ', 'ॱ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a4d}', '\u{a4d}'), + ('\u{abc}', '\u{abc}'), + ('\u{acd}', '\u{acd}'), + ('\u{afd}', '\u{aff}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b55}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c4d}', '\u{c4d}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{ccd}', '\u{ccd}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{dca}', '\u{dca}'), + ('\u{e47}', '\u{e4c}'), + ('\u{e4e}', '\u{e4e}'), + ('\u{eba}', '\u{eba}'), + ('\u{ec8}', '\u{ecc}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f82}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{1037}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('ၣ', 'ၤ'), + ('ၩ', 'ၭ'), + ('ႇ', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', 'ႛ'), + ('\u{135d}', '\u{135f}'), + ('\u{1714}', '᜕'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a75}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1abe}'), + ('\u{1ac1}', '\u{1acb}'), + ('\u{1b34}', '\u{1b34}'), + ('᭄', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('᮪', '\u{1bab}'), + ('\u{1c36}', '\u{1c37}'), + ('ᱸ', 'ᱽ'), + ('\u{1cd0}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('ᴬ', 'ᵪ'), + ('\u{1dc4}', '\u{1dcf}'), + ('\u{1df5}', '\u{1dff}'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('\u{2cef}', '\u{2cf1}'), + ('ⸯ', 'ⸯ'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '゜'), + ('ー', 'ー'), + ('\u{a66f}', '\u{a66f}'), + ('\u{a67c}', '\u{a67d}'), + ('ꙿ', 'ꙿ'), + ('ꚜ', 'ꚝ'), + ('\u{a6f0}', '\u{a6f1}'), + ('꜀', '꜡'), + ('ꞈ', '꞊'), + ('ꟸ', 'ꟹ'), + ('\u{a8c4}', '\u{a8c4}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a92b}', '꤮'), + ('꥓', '꥓'), + ('\u{a9b3}', '\u{a9b3}'), + ('꧀', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('ꩻ', 'ꩽ'), + ('\u{aabf}', 'ꫂ'), + ('\u{aaf6}', '\u{aaf6}'), + ('꭛', 'ꭟ'), + ('ꭩ', '꭫'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe20}', '\u{fe2f}'), + ('^', '^'), + ('`', '`'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + (' ̄', ' ̄'), + ('\u{102e0}', '\u{102e0}'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('\u{10ae5}', '\u{10ae6}'), + ('𐴢', '\u{10d27}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11046}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{11133}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('𑇀', '𑇀'), + ('\u{111ca}', '\u{111cc}'), + ('𑈵', '\u{11236}'), + ('\u{112e9}', '\u{112ea}'), + ('\u{1133c}', '\u{1133c}'), + ('𑍍', '𑍍'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11442}', '\u{11442}'), + ('\u{11446}', '\u{11446}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{1163f}', '\u{1163f}'), + ('𑚶', '\u{116b7}'), + ('\u{1172b}', '\u{1172b}'), + ('\u{11839}', '\u{1183a}'), + ('𑤽', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a34}', '\u{11a34}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a99}', '\u{11a99}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11d42}', '\u{11d42}'), + ('\u{11d44}', '\u{11d45}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f8f}', '𖾟'), + ('𖿰', '𖿱'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('𞀰', '𞁭'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e946}'), + ('\u{1e948}', '\u{1e94a}'), +]; + +pub const EMOJI: &'static [(char, char)] = &[ + ('#', '#'), + ('*', '*'), + ('0', '9'), + ('©', '©'), + ('®', '®'), + ('‼', '‼'), + ('⁉', '⁉'), + ('™', '™'), + ('ℹ', 'ℹ'), + ('↔', '↙'), + ('↩', '↪'), + ('⌚', '⌛'), + ('⌨', '⌨'), + ('⏏', '⏏'), + ('⏩', '⏳'), + ('⏸', '⏺'), + ('Ⓜ', 'Ⓜ'), + ('▪', '▫'), + ('▶', '▶'), + ('◀', '◀'), + ('◻', '◾'), + ('☀', '☄'), + ('☎', '☎'), + ('☑', '☑'), + ('☔', '☕'), + ('☘', '☘'), + ('☝', '☝'), + ('☠', '☠'), + ('☢', '☣'), + ('☦', '☦'), + ('☪', '☪'), + ('☮', '☯'), + ('☸', '☺'), + ('♀', '♀'), + ('♂', '♂'), + ('♈', '♓'), + ('♟', '♠'), + ('♣', '♣'), + ('♥', '♦'), + ('♨', '♨'), + ('♻', '♻'), + ('♾', '♿'), + ('⚒', '⚗'), + ('⚙', '⚙'), + ('⚛', '⚜'), + ('⚠', '⚡'), + ('⚧', '⚧'), + ('⚪', '⚫'), + ('⚰', '⚱'), + ('⚽', '⚾'), + ('⛄', '⛅'), + ('⛈', '⛈'), + ('⛎', '⛏'), + ('⛑', '⛑'), + ('⛓', '⛔'), + ('⛩', '⛪'), + ('⛰', '⛵'), + ('⛷', '⛺'), + ('⛽', '⛽'), + ('✂', '✂'), + ('✅', '✅'), + ('✈', '✍'), + ('✏', '✏'), + ('✒', '✒'), + ('✔', '✔'), + ('✖', '✖'), + ('✝', '✝'), + ('✡', '✡'), + ('✨', '✨'), + ('✳', '✴'), + ('❄', '❄'), + ('❇', '❇'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❗', '❗'), + ('❣', '❤'), + ('➕', '➗'), + ('➡', '➡'), + ('➰', '➰'), + ('➿', '➿'), + ('⤴', '⤵'), + ('⬅', '⬇'), + ('⬛', '⬜'), + ('⭐', '⭐'), + ('⭕', '⭕'), + ('〰', '〰'), + ('〽', '〽'), + ('㊗', '㊗'), + ('㊙', '㊙'), + ('🀄', '🀄'), + ('🃏', '🃏'), + ('🅰', '🅱'), + ('🅾', '🅿'), + ('🆎', '🆎'), + ('🆑', '🆚'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈚', '🈚'), + ('🈯', '🈯'), + ('🈲', '🈺'), + ('🉐', '🉑'), + ('🌀', '🌡'), + ('🌤', '🎓'), + ('🎖', '🎗'), + ('🎙', '🎛'), + ('🎞', '🏰'), + ('🏳', '🏵'), + ('🏷', '📽'), + ('📿', '🔽'), + ('🕉', '🕎'), + ('🕐', '🕧'), + ('🕯', '🕰'), + ('🕳', '🕺'), + ('🖇', '🖇'), + ('🖊', '🖍'), + ('🖐', '🖐'), + ('🖕', '🖖'), + ('🖤', '🖥'), + ('🖨', '🖨'), + ('🖱', '🖲'), + ('🖼', '🖼'), + ('🗂', '🗄'), + ('🗑', '🗓'), + ('🗜', '🗞'), + ('🗡', '🗡'), + ('🗣', '🗣'), + ('🗨', '🗨'), + ('🗯', '🗯'), + ('🗳', '🗳'), + ('🗺', '🙏'), + ('🚀', '🛅'), + ('🛋', '🛒'), + ('🛕', '🛗'), + ('🛜', '🛥'), + ('🛩', '🛩'), + ('🛫', '🛬'), + ('🛰', '🛰'), + ('🛳', '🛼'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🤌', '🤺'), + ('🤼', '🥅'), + ('🥇', '🧿'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), +]; + +pub const EMOJI_COMPONENT: &'static [(char, char)] = &[ + ('#', '#'), + ('*', '*'), + ('0', '9'), + ('\u{200d}', '\u{200d}'), + ('\u{20e3}', '\u{20e3}'), + ('\u{fe0f}', '\u{fe0f}'), + ('🇦', '🇿'), + ('🏻', '🏿'), + ('🦰', '🦳'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const EMOJI_MODIFIER: &'static [(char, char)] = &[('🏻', '🏿')]; + +pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[ + ('☝', '☝'), + ('⛹', '⛹'), + ('✊', '✍'), + ('🎅', '🎅'), + ('🏂', '🏄'), + ('🏇', '🏇'), + ('🏊', '🏌'), + ('👂', '👃'), + ('👆', '👐'), + ('👦', '👸'), + ('👼', '👼'), + ('💁', '💃'), + ('💅', '💇'), + ('💏', '💏'), + ('💑', '💑'), + ('💪', '💪'), + ('🕴', '🕵'), + ('🕺', '🕺'), + ('🖐', '🖐'), + ('🖕', '🖖'), + ('🙅', '🙇'), + ('🙋', '🙏'), + ('🚣', '🚣'), + ('🚴', '🚶'), + ('🛀', '🛀'), + ('🛌', '🛌'), + ('🤌', '🤌'), + ('🤏', '🤏'), + ('🤘', '🤟'), + ('🤦', '🤦'), + ('🤰', '🤹'), + ('🤼', '🤾'), + ('🥷', '🥷'), + ('🦵', '🦶'), + ('🦸', '🦹'), + ('🦻', '🦻'), + ('🧍', '🧏'), + ('🧑', '🧝'), + ('🫃', '🫅'), + ('🫰', '🫸'), +]; + +pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[ + ('⌚', '⌛'), + ('⏩', '⏬'), + ('⏰', '⏰'), + ('⏳', '⏳'), + ('◽', '◾'), + ('☔', '☕'), + ('♈', '♓'), + ('♿', '♿'), + ('⚓', '⚓'), + ('⚡', '⚡'), + ('⚪', '⚫'), + ('⚽', '⚾'), + ('⛄', '⛅'), + ('⛎', '⛎'), + ('⛔', '⛔'), + ('⛪', '⛪'), + ('⛲', '⛳'), + ('⛵', '⛵'), + ('⛺', '⛺'), + ('⛽', '⛽'), + ('✅', '✅'), + ('✊', '✋'), + ('✨', '✨'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❗', '❗'), + ('➕', '➗'), + ('➰', '➰'), + ('➿', '➿'), + ('⬛', '⬜'), + ('⭐', '⭐'), + ('⭕', '⭕'), + ('🀄', '🀄'), + ('🃏', '🃏'), + ('🆎', '🆎'), + ('🆑', '🆚'), + ('🇦', '🇿'), + ('🈁', '🈁'), + ('🈚', '🈚'), + ('🈯', '🈯'), + ('🈲', '🈶'), + ('🈸', '🈺'), + ('🉐', '🉑'), + ('🌀', '🌠'), + ('🌭', '🌵'), + ('🌷', '🍼'), + ('🍾', '🎓'), + ('🎠', '🏊'), + ('🏏', '🏓'), + ('🏠', '🏰'), + ('🏴', '🏴'), + ('🏸', '🐾'), + ('👀', '👀'), + ('👂', '📼'), + ('📿', '🔽'), + ('🕋', '🕎'), + ('🕐', '🕧'), + ('🕺', '🕺'), + ('🖕', '🖖'), + ('🖤', '🖤'), + ('🗻', '🙏'), + ('🚀', '🛅'), + ('🛌', '🛌'), + ('🛐', '🛒'), + ('🛕', '🛗'), + ('🛜', '🛟'), + ('🛫', '🛬'), + ('🛴', '🛼'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🤌', '🤺'), + ('🤼', '🥅'), + ('🥇', '🧿'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), +]; + +pub const EXTENDED_PICTOGRAPHIC: &'static [(char, char)] = &[ + ('©', '©'), + ('®', '®'), + ('‼', '‼'), + ('⁉', '⁉'), + ('™', '™'), + ('ℹ', 'ℹ'), + ('↔', '↙'), + ('↩', '↪'), + ('⌚', '⌛'), + ('⌨', '⌨'), + ('⎈', '⎈'), + ('⏏', '⏏'), + ('⏩', '⏳'), + ('⏸', '⏺'), + ('Ⓜ', 'Ⓜ'), + ('▪', '▫'), + ('▶', '▶'), + ('◀', '◀'), + ('◻', '◾'), + ('☀', '★'), + ('☇', '☒'), + ('☔', '⚅'), + ('⚐', '✅'), + ('✈', '✒'), + ('✔', '✔'), + ('✖', '✖'), + ('✝', '✝'), + ('✡', '✡'), + ('✨', '✨'), + ('✳', '✴'), + ('❄', '❄'), + ('❇', '❇'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❗', '❗'), + ('❣', '❧'), + ('➕', '➗'), + ('➡', '➡'), + ('➰', '➰'), + ('➿', '➿'), + ('⤴', '⤵'), + ('⬅', '⬇'), + ('⬛', '⬜'), + ('⭐', '⭐'), + ('⭕', '⭕'), + ('〰', '〰'), + ('〽', '〽'), + ('㊗', '㊗'), + ('㊙', '㊙'), + ('🀀', '\u{1f0ff}'), + ('🄍', '🄏'), + ('🄯', '🄯'), + ('🅬', '🅱'), + ('🅾', '🅿'), + ('🆎', '🆎'), + ('🆑', '🆚'), + ('🆭', '\u{1f1e5}'), + ('🈁', '\u{1f20f}'), + ('🈚', '🈚'), + ('🈯', '🈯'), + ('🈲', '🈺'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '🏺'), + ('🐀', '🔽'), + ('🕆', '🙏'), + ('🚀', '\u{1f6ff}'), + ('🝴', '🝿'), + ('🟕', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8ff}'), + ('🤌', '🤺'), + ('🤼', '🥅'), + ('🥇', '\u{1faff}'), + ('\u{1fc00}', '\u{1fffd}'), +]; + +pub const EXTENDER: &'static [(char, char)] = &[ + ('·', '·'), + ('ː', 'ˑ'), + ('ـ', 'ـ'), + ('ߺ', 'ߺ'), + ('\u{b55}', '\u{b55}'), + ('ๆ', 'ๆ'), + ('ໆ', 'ໆ'), + ('᠊', '᠊'), + ('ᡃ', 'ᡃ'), + ('ᪧ', 'ᪧ'), + ('\u{1c36}', '\u{1c36}'), + ('ᱻ', 'ᱻ'), + ('々', '々'), + ('〱', '〵'), + ('ゝ', 'ゞ'), + ('ー', 'ヾ'), + ('ꀕ', 'ꀕ'), + ('ꘌ', 'ꘌ'), + ('ꧏ', 'ꧏ'), + ('ꧦ', 'ꧦ'), + ('ꩰ', 'ꩰ'), + ('ꫝ', 'ꫝ'), + ('ꫳ', 'ꫴ'), + ('ー', 'ー'), + ('𐞁', '𐞂'), + ('𑍝', '𑍝'), + ('𑗆', '𑗈'), + ('\u{11a98}', '\u{11a98}'), + ('𖭂', '𖭃'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𞄼', '𞄽'), + ('\u{1e944}', '\u{1e946}'), +]; + +pub const GRAPHEME_BASE: &'static [(char, char)] = &[ + (' ', '~'), + ('\u{a0}', '¬'), + ('®', '˿'), + ('Ͱ', 'ͷ'), + ('ͺ', 'Ϳ'), + ('΄', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', '҂'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', '֊'), + ('֍', '֏'), + ('־', '־'), + ('׀', '׀'), + ('׃', '׃'), + ('׆', '׆'), + ('א', 'ת'), + ('ׯ', '״'), + ('؆', '؏'), + ('؛', '؛'), + ('؝', 'ي'), + ('٠', 'ٯ'), + ('ٱ', 'ە'), + ('۞', '۞'), + ('ۥ', 'ۦ'), + ('۩', '۩'), + ('ۮ', '܍'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('߀', 'ߪ'), + ('ߴ', 'ߺ'), + ('߾', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('࠰', '࠾'), + ('ࡀ', 'ࡘ'), + ('࡞', '࡞'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ः', 'ह'), + ('ऻ', 'ऻ'), + ('ऽ', 'ी'), + ('ॉ', 'ौ'), + ('ॎ', 'ॐ'), + ('क़', 'ॡ'), + ('।', 'ঀ'), + ('ং', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ি', 'ী'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('০', '৽'), + ('ਃ', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਾ', 'ੀ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '੯'), + ('ੲ', 'ੴ'), + ('੶', '੶'), + ('ઃ', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ી'), + ('ૉ', 'ૉ'), + ('ો', 'ૌ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('૦', '૱'), + ('ૹ', 'ૹ'), + ('ଂ', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ୀ', 'ୀ'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('୦', '୷'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ி', 'ி'), + ('ு', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('ௐ', 'ௐ'), + ('௦', '௺'), + ('ఁ', 'ః'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ు', 'ౄ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('౦', '౯'), + ('౷', 'ಀ'), + ('ಂ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಾ'), + ('ೀ', 'ು'), + ('ೃ', 'ೄ'), + ('ೇ', 'ೈ'), + ('ೊ', 'ೋ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('ം', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ി', 'ീ'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('ൎ', '൏'), + ('ൔ', 'ൖ'), + ('൘', 'ൡ'), + ('൦', 'ൿ'), + ('ං', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ැ', 'ෑ'), + ('ෘ', 'ෞ'), + ('෦', '෯'), + ('ෲ', '෴'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('฿', 'ๆ'), + ('๏', '๛'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', '༗'), + ('༚', '༴'), + ('༶', '༶'), + ('༸', '༸'), + ('༺', 'ཇ'), + ('ཉ', 'ཬ'), + ('ཿ', 'ཿ'), + ('྅', '྅'), + ('ྈ', 'ྌ'), + ('྾', '࿅'), + ('࿇', '࿌'), + ('࿎', '࿚'), + ('က', 'ာ'), + ('ေ', 'ေ'), + ('း', 'း'), + ('ျ', 'ြ'), + ('ဿ', 'ၗ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႃ', 'ႄ'), + ('ႇ', 'ႌ'), + ('ႎ', 'ႜ'), + ('႞', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('፠', '፼'), + ('ᎀ', '᎙'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('᐀', '᚜'), + ('ᚠ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('᜕', '᜕'), + ('ᜟ', 'ᜱ'), + ('᜴', '᜶'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ា', 'ា'), + ('ើ', 'ៅ'), + ('ះ', 'ៈ'), + ('។', 'ៜ'), + ('០', '៩'), + ('៰', '៹'), + ('᠀', '᠊'), + ('᠐', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᤣ', 'ᤦ'), + ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), + ('ᤳ', 'ᤸ'), + ('᥀', '᥀'), + ('᥄', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧚'), + ('᧞', 'ᨖ'), + ('ᨙ', 'ᨚ'), + ('᨞', 'ᩕ'), + ('ᩗ', 'ᩗ'), + ('ᩡ', 'ᩡ'), + ('ᩣ', 'ᩤ'), + ('ᩭ', 'ᩲ'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), + ('ᬄ', 'ᬳ'), + ('ᬻ', 'ᬻ'), + ('ᬽ', 'ᭁ'), + ('ᭃ', 'ᭌ'), + ('᭐', '᭪'), + ('᭴', '᭾'), + ('ᮂ', 'ᮡ'), + ('ᮦ', 'ᮧ'), + ('᮪', '᮪'), + ('ᮮ', 'ᯥ'), + ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), + ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), + ('᯼', 'ᰫ'), + ('ᰴ', 'ᰵ'), + ('᰻', '᱉'), + ('ᱍ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', '᳇'), + ('᳓', '᳓'), + ('᳡', '᳡'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', '᳷'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('\u{2000}', '\u{200a}'), + ('‐', '‧'), + ('\u{202f}', '\u{205f}'), + ('⁰', 'ⁱ'), + ('⁴', '₎'), + ('ₐ', 'ₜ'), + ('₠', '⃀'), + ('℀', '↋'), + ('←', '␦'), + ('⑀', '⑊'), + ('①', '⭳'), + ('⭶', '⮕'), + ('⮗', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('⳹', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', '⵰'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('⸀', '⹝'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('\u{3000}', '〩'), + ('〰', '〿'), + ('ぁ', 'ゖ'), + ('゛', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('㆐', '㇣'), + ('ㇰ', '㈞'), + ('㈠', 'ꒌ'), + ('꒐', '꓆'), + ('ꓐ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('꙳', '꙳'), + ('꙾', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('꛲', '꛷'), + ('꜀', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠤ'), + ('ꠧ', '꠫'), + ('꠰', '꠹'), + ('ꡀ', '꡷'), + ('ꢀ', 'ꣃ'), + ('꣎', '꣙'), + ('ꣲ', 'ꣾ'), + ('꤀', 'ꤥ'), + ('꤮', 'ꥆ'), + ('ꥒ', '꥓'), + ('꥟', 'ꥼ'), + ('ꦃ', 'ꦲ'), + ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), + ('ꦾ', '꧍'), + ('ꧏ', '꧙'), + ('꧞', 'ꧤ'), + ('ꧦ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꨯ', 'ꨰ'), + ('ꨳ', 'ꨴ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩍ', 'ꩍ'), + ('꩐', '꩙'), + ('꩜', 'ꩻ'), + ('ꩽ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫫ'), + ('ꫮ', 'ꫵ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', '꭫'), + ('ꭰ', 'ꯤ'), + ('ꯦ', 'ꯧ'), + ('ꯩ', '꯬'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', '﯂'), + ('ﯓ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('﷏', '﷏'), + ('ﷰ', '﷿'), + ('︐', '︙'), + ('︰', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('!', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('¢', '₩'), + ('│', '○'), + ('', '�'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐆎'), + ('𐆐', '𐆜'), + ('𐆠', '𐆠'), + ('𐇐', '𐇼'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐋡', '𐋻'), + ('𐌀', '𐌣'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎟', '𐏃'), + ('𐏈', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕯', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡗', '𐢞'), + ('𐢧', '𐢯'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐣻', '𐤛'), + ('𐤟', '𐤹'), + ('𐤿', '𐤿'), + ('𐦀', '𐦷'), + ('𐦼', '𐧏'), + ('𐧒', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩀', '𐩈'), + ('𐩐', '𐩘'), + ('𐩠', '𐪟'), + ('𐫀', '𐫤'), + ('𐫫', '𐫶'), + ('𐬀', '𐬵'), + ('𐬹', '𐭕'), + ('𐭘', '𐭲'), + ('𐭸', '𐮑'), + ('𐮙', '𐮜'), + ('𐮩', '𐮯'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐳺', '𐴣'), + ('𐴰', '𐴹'), + ('𐹠', '𐹾'), + ('𐺀', '𐺩'), + ('𐺭', '𐺭'), + ('𐺰', '𐺱'), + ('𐼀', '𐼧'), + ('𐼰', '𐽅'), + ('𐽑', '𐽙'), + ('𐽰', '𐾁'), + ('𐾆', '𐾉'), + ('𐾰', '𐿋'), + ('𐿠', '𐿶'), + ('𑀀', '𑀀'), + ('𑀂', '𑀷'), + ('𑁇', '𑁍'), + ('𑁒', '𑁯'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂂', '𑂲'), + ('𑂷', '𑂸'), + ('𑂻', '𑂼'), + ('𑂾', '𑃁'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('𑄃', '𑄦'), + ('𑄬', '𑄬'), + ('𑄶', '𑅇'), + ('𑅐', '𑅲'), + ('𑅴', '𑅶'), + ('𑆂', '𑆵'), + ('𑆿', '𑇈'), + ('𑇍', '𑇎'), + ('𑇐', '𑇟'), + ('𑇡', '𑇴'), + ('𑈀', '𑈑'), + ('𑈓', '𑈮'), + ('𑈲', '𑈳'), + ('𑈵', '𑈵'), + ('𑈸', '𑈽'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊩'), + ('𑊰', '𑋞'), + ('𑋠', '𑋢'), + ('𑋰', '𑋹'), + ('𑌂', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑌿', '𑌿'), + ('𑍁', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('𑍝', '𑍣'), + ('𑐀', '𑐷'), + ('𑑀', '𑑁'), + ('𑑅', '𑑅'), + ('𑑇', '𑑛'), + ('𑑝', '𑑝'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑒱', '𑒲'), + ('𑒹', '𑒹'), + ('𑒻', '𑒼'), + ('𑒾', '𑒾'), + ('𑓁', '𑓁'), + ('𑓄', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '𑖮'), + ('𑖰', '𑖱'), + ('𑖸', '𑖻'), + ('𑖾', '𑖾'), + ('𑗁', '𑗛'), + ('𑘀', '𑘲'), + ('𑘻', '𑘼'), + ('𑘾', '𑘾'), + ('𑙁', '𑙄'), + ('𑙐', '𑙙'), + ('𑙠', '𑙬'), + ('𑚀', '𑚪'), + ('𑚬', '𑚬'), + ('𑚮', '𑚯'), + ('𑚶', '𑚶'), + ('𑚸', '𑚹'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('𑜠', '𑜡'), + ('𑜦', '𑜦'), + ('𑜰', '𑝆'), + ('𑠀', '𑠮'), + ('𑠸', '𑠸'), + ('𑠻', '𑠻'), + ('𑢠', '𑣲'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤱', '𑤵'), + ('𑤷', '𑤸'), + ('𑤽', '𑤽'), + ('𑤿', '𑥂'), + ('𑥄', '𑥆'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '𑧓'), + ('𑧜', '𑧟'), + ('𑧡', '𑧤'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨹', '𑨺'), + ('𑨿', '𑩆'), + ('𑩐', '𑩐'), + ('𑩗', '𑩘'), + ('𑩜', '𑪉'), + ('𑪗', '𑪗'), + ('𑪚', '𑪢'), + ('𑪰', '𑫸'), + ('𑬀', '𑬉'), + ('𑰀', '𑰈'), + ('𑰊', '𑰯'), + ('𑰾', '𑰾'), + ('𑱀', '𑱅'), + ('𑱐', '𑱬'), + ('𑱰', '𑲏'), + ('𑲩', '𑲩'), + ('𑲱', '𑲱'), + ('𑲴', '𑲴'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('𑶓', '𑶔'), + ('𑶖', '𑶖'), + ('𑶘', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻲'), + ('𑻵', '𑻸'), + ('𑼂', '𑼐'), + ('𑼒', '𑼵'), + ('𑼾', '𑼿'), + ('𑽁', '𑽁'), + ('𑽃', '𑽙'), + ('𑾰', '𑾰'), + ('𑿀', '𑿱'), + ('𑿿', '𒎙'), + ('𒐀', '𒑮'), + ('𒑰', '𒑴'), + ('𒒀', '𒕃'), + ('𒾐', '𒿲'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩮', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('𖫵', '𖫵'), + ('𖬀', '𖬯'), + ('𖬷', '𖭅'), + ('𖭐', '𖭙'), + ('𖭛', '𖭡'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖺚'), + ('𖼀', '𖽊'), + ('𖽐', '𖾇'), + ('𖾓', '𖾟'), + ('𖿠', '𖿣'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𛲜', '𛲜'), + ('𛲟', '𛲟'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅘𝅥𝅲'), + ('𝅦', '𝅦'), + ('𝅪', '𝅭'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝈀', '𝉁'), + ('𝉅', '𝉅'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝌀', '𝍖'), + ('𝍠', '𝍸'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝧿'), + ('𝨷', '𝨺'), + ('𝩭', '𝩴'), + ('𝩶', '𝪃'), + ('𝪅', '𝪋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅏'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞋰', '𞋹'), + ('𞋿', '𞋿'), + ('𞓐', '𞓫'), + ('𞓰', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞣇', '𞣏'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞥐', '𞥙'), + ('𞥞', '𞥟'), + ('𞱱', '𞲴'), + ('𞴁', '𞴽'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄀', '🆭'), + ('🇦', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9be}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11340}', '\u{11340}'), + ('\u{11357}', '\u{11357}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115af}', '\u{115af}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{11930}', '\u{11930}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const GRAPHEME_LINK: &'static [(char, char)] = &[ + ('\u{94d}', '\u{94d}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{a4d}', '\u{a4d}'), + ('\u{acd}', '\u{acd}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c4d}', '\u{c4d}'), + ('\u{ccd}', '\u{ccd}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{dca}', '\u{dca}'), + ('\u{e3a}', '\u{e3a}'), + ('\u{eba}', '\u{eba}'), + ('\u{f84}', '\u{f84}'), + ('\u{1039}', '\u{103a}'), + ('\u{1714}', '᜕'), + ('᜴', '᜴'), + ('\u{17d2}', '\u{17d2}'), + ('\u{1a60}', '\u{1a60}'), + ('᭄', '᭄'), + ('᮪', '\u{1bab}'), + ('᯲', '᯳'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{a806}', '\u{a806}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c4}'), + ('꥓', '꥓'), + ('꧀', '꧀'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abed}', '\u{abed}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{11046}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{1107f}', '\u{1107f}'), + ('\u{110b9}', '\u{110b9}'), + ('\u{11133}', '\u{11134}'), + ('𑇀', '𑇀'), + ('𑈵', '𑈵'), + ('\u{112ea}', '\u{112ea}'), + ('𑍍', '𑍍'), + ('\u{11442}', '\u{11442}'), + ('\u{114c2}', '\u{114c2}'), + ('\u{115bf}', '\u{115bf}'), + ('\u{1163f}', '\u{1163f}'), + ('𑚶', '𑚶'), + ('\u{1172b}', '\u{1172b}'), + ('\u{11839}', '\u{11839}'), + ('𑤽', '\u{1193e}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a34}', '\u{11a34}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a99}', '\u{11a99}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11d44}', '\u{11d45}'), + ('\u{11d97}', '\u{11d97}'), + ('𑽁', '\u{11f42}'), +]; + +pub const HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'F'), + ('a', 'f'), + ('0', '9'), + ('A', 'F'), + ('a', 'f'), +]; + +pub const HYPHEN: &'static [(char, char)] = &[ + ('-', '-'), + ('\u{ad}', '\u{ad}'), + ('֊', '֊'), + ('᠆', '᠆'), + ('‐', '‑'), + ('⸗', '⸗'), + ('・', '・'), + ('﹣', '﹣'), + ('-', '-'), + ('・', '・'), +]; + +pub const IDS_BINARY_OPERATOR: &'static [(char, char)] = + &[('⿰', '⿱'), ('⿴', '⿻')]; + +pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[('⿲', '⿳')]; + +pub const ID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('·', '·'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', '\u{487}'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('፩', '፱'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧚'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const ID_START: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('゛', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const IDEOGRAPHIC: &'static [(char, char)] = &[ + ('〆', '〇'), + ('〡', '〩'), + ('〸', '〺'), + ('㐀', '䶿'), + ('一', '鿿'), + ('豈', '舘'), + ('並', '龎'), + ('\u{16fe4}', '\u{16fe4}'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𛅰', '𛋻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const JOIN_CONTROL: &'static [(char, char)] = &[('\u{200c}', '\u{200d}')]; + +pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[ + ('เ', 'ไ'), + ('ເ', 'ໄ'), + ('ᦵ', 'ᦷ'), + ('ᦺ', 'ᦺ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪹ'), + ('ꪻ', 'ꪼ'), +]; + +pub const LOWERCASE: &'static [(char, char)] = &[ + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ĸ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), + ('ƽ', 'ƿ'), + ('dž', 'dž'), + ('lj', 'lj'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȹ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ʓ'), + ('ʕ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϼ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ՠ', 'ֈ'), + ('ა', 'ჺ'), + ('ჼ', 'ჿ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᴀ', 'ᶿ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℊ', 'ℊ'), + ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), + ('ℴ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), + ('ⱳ', 'ⱴ'), + ('ⱶ', 'ⱽ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚝ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝸ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞯ', 'ꞯ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟕ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟲ', 'ꟴ'), + ('ꟶ', 'ꟶ'), + ('ꟸ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𝐚', '𝐳'), + ('𝑎', '𝑔'), + ('𝑖', '𝑧'), + ('𝒂', '𝒛'), + ('𝒶', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝓏'), + ('𝓪', '𝔃'), + ('𝔞', '𝔷'), + ('𝕒', '𝕫'), + ('𝖆', '𝖟'), + ('𝖺', '𝗓'), + ('𝗮', '𝘇'), + ('𝘢', '𝘻'), + ('𝙖', '𝙯'), + ('𝚊', '𝚥'), + ('𝛂', '𝛚'), + ('𝛜', '𝛡'), + ('𝛼', '𝜔'), + ('𝜖', '𝜛'), + ('𝜶', '𝝎'), + ('𝝐', '𝝕'), + ('𝝰', '𝞈'), + ('𝞊', '𝞏'), + ('𝞪', '𝟂'), + ('𝟄', '𝟉'), + ('𝟋', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞤢', '𞥃'), +]; + +pub const MATH: &'static [(char, char)] = &[ + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('±', '±'), + ('×', '×'), + ('÷', '÷'), + ('ϐ', 'ϒ'), + ('ϕ', 'ϕ'), + ('ϰ', 'ϱ'), + ('ϴ', '϶'), + ('؆', '؈'), + ('‖', '‖'), + ('′', '‴'), + ('⁀', '⁀'), + ('⁄', '⁄'), + ('⁒', '⁒'), + ('\u{2061}', '\u{2064}'), + ('⁺', '⁾'), + ('₊', '₎'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20e6}'), + ('\u{20eb}', '\u{20ef}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('ℨ', '℩'), + ('ℬ', 'ℭ'), + ('ℯ', 'ℱ'), + ('ℳ', 'ℸ'), + ('ℼ', 'ⅉ'), + ('⅋', '⅋'), + ('←', '↧'), + ('↩', '↮'), + ('↰', '↱'), + ('↶', '↷'), + ('↼', '⇛'), + ('⇝', '⇝'), + ('⇤', '⇥'), + ('⇴', '⋿'), + ('⌈', '⌋'), + ('⌠', '⌡'), + ('⍼', '⍼'), + ('⎛', '⎵'), + ('⎷', '⎷'), + ('⏐', '⏐'), + ('⏜', '⏢'), + ('■', '□'), + ('▮', '▷'), + ('▼', '◁'), + ('◆', '◇'), + ('◊', '○'), + ('●', '◓'), + ('◢', '◢'), + ('◤', '◤'), + ('◧', '◬'), + ('◸', '◿'), + ('★', '☆'), + ('♀', '♀'), + ('♂', '♂'), + ('♠', '♣'), + ('♭', '♯'), + ('⟀', '⟿'), + ('⤀', '⫿'), + ('⬰', '⭄'), + ('⭇', '⭌'), + ('﬩', '﬩'), + ('﹡', '﹦'), + ('﹨', '﹨'), + ('+', '+'), + ('<', '>'), + ('\', '\'), + ('^', '^'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('←', '↓'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝟿'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), +]; + +pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[ + ('\u{fdd0}', '\u{fdef}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1fffe}', '\u{1ffff}'), + ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), + ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), + ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), + ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), + ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), + ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), + ('\u{efffe}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[ + ('\u{345}', '\u{345}'), + ('\u{5b0}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{657}'), + ('\u{659}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6e1}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ed}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{73f}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{816}', '\u{817}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82c}'), + ('\u{8d4}', '\u{8df}'), + ('\u{8e3}', '\u{8e9}'), + ('\u{8f0}', 'ः'), + ('\u{93a}', 'ऻ'), + ('ा', 'ौ'), + ('ॎ', 'ॏ'), + ('\u{955}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{a01}', 'ਃ'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4c}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', 'ૌ'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{afc}'), + ('\u{b01}', 'ଃ'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('\u{b56}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4c}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccc}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e4d}', '\u{e4d}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{eb9}'), + ('\u{ebb}', '\u{ebc}'), + ('\u{ecd}', '\u{ecd}'), + ('\u{f71}', '\u{f83}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('ါ', '\u{1036}'), + ('း', 'း'), + ('ျ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{1712}', '\u{1713}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('ា', 'ៈ'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', 'ᤸ'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('ᩡ', '\u{1a74}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1acc}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b35}', 'ᭃ'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1ba9}'), + ('\u{1bac}', '\u{1bad}'), + ('ᯧ', '\u{1bf1}'), + ('ᰤ', '\u{1c36}'), + ('\u{1de7}', '\u{1df4}'), + ('Ⓐ', 'ⓩ'), + ('\u{2de0}', '\u{2dff}'), + ('\u{a674}', '\u{a67b}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a802}', '\u{a802}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), + ('\u{a8c5}', '\u{a8c5}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92a}'), + ('\u{a947}', 'ꥒ'), + ('\u{a980}', 'ꦃ'), + ('ꦴ', 'ꦿ'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabe}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', 'ꫵ'), + ('ꯣ', 'ꯪ'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11045}'), + ('\u{11073}', '\u{11074}'), + ('\u{11080}', '𑂂'), + ('𑂰', '𑂸'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11132}'), + ('𑅅', '𑅆'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑆿'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11234}'), + ('\u{11237}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112e8}'), + ('\u{11300}', '𑌃'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍌'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('𑐵', '𑑁'), + ('\u{11443}', '𑑅'), + ('\u{114b0}', '𑓁'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '𑖾'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '𑘾'), + ('\u{11640}', '\u{11640}'), + ('\u{116ab}', '\u{116b5}'), + ('\u{1171d}', '\u{1172a}'), + ('𑠬', '𑠸'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193c}'), + ('𑥀', '𑥀'), + ('𑥂', '𑥂'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '𑧟'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a35}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '𑪗'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '𑰾'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d41}'), + ('\u{11d43}', '\u{11d43}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶖'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f40}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('𖿰', '𖿱'), + ('\u{1bc9e}', '\u{1bc9e}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e947}', '\u{1e947}'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{34f}', '\u{34f}'), + ('ᅟ', 'ᅠ'), + ('\u{17b4}', '\u{17b5}'), + ('\u{2065}', '\u{2065}'), + ('ㅤ', 'ㅤ'), + ('ᅠ', 'ᅠ'), + ('\u{fff0}', '\u{fff8}'), + ('\u{e0000}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('\u{9be}', '\u{9be}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{b3e}', '\u{b3e}'), + ('\u{b57}', '\u{b57}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d57}', '\u{d57}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{1b35}', '\u{1b35}'), + ('\u{200c}', '\u{200c}'), + ('\u{302e}', '\u{302f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11357}', '\u{11357}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{115af}', '\u{115af}'), + ('\u{11930}', '\u{11930}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const OTHER_ID_CONTINUE: &'static [(char, char)] = + &[('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚')]; + +pub const OTHER_ID_START: &'static [(char, char)] = + &[('\u{1885}', '\u{1886}'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜')]; + +pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ + ('ª', 'ª'), + ('º', 'º'), + ('ʰ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('ͺ', 'ͺ'), + ('ჼ', 'ჼ'), + ('ᴬ', 'ᵪ'), + ('ᵸ', 'ᵸ'), + ('ᶛ', 'ᶿ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ⅰ', 'ⅿ'), + ('ⓐ', 'ⓩ'), + ('ⱼ', 'ⱽ'), + ('ꚜ', 'ꚝ'), + ('ꝰ', 'ꝰ'), + ('ꟲ', 'ꟴ'), + ('ꟸ', 'ꟹ'), + ('ꭜ', 'ꭟ'), + ('ꭩ', 'ꭩ'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𞀰', '𞁭'), +]; + +pub const OTHER_MATH: &'static [(char, char)] = &[ + ('^', '^'), + ('ϐ', 'ϒ'), + ('ϕ', 'ϕ'), + ('ϰ', 'ϱ'), + ('ϴ', 'ϵ'), + ('‖', '‖'), + ('′', '‴'), + ('⁀', '⁀'), + ('\u{2061}', '\u{2064}'), + ('⁽', '⁾'), + ('₍', '₎'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20e6}'), + ('\u{20eb}', '\u{20ef}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('ℨ', '℩'), + ('ℬ', 'ℭ'), + ('ℯ', 'ℱ'), + ('ℳ', 'ℸ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('↕', '↙'), + ('↜', '↟'), + ('↡', '↢'), + ('↤', '↥'), + ('↧', '↧'), + ('↩', '↭'), + ('↰', '↱'), + ('↶', '↷'), + ('↼', '⇍'), + ('⇐', '⇑'), + ('⇓', '⇓'), + ('⇕', '⇛'), + ('⇝', '⇝'), + ('⇤', '⇥'), + ('⌈', '⌋'), + ('⎴', '⎵'), + ('⎷', '⎷'), + ('⏐', '⏐'), + ('⏢', '⏢'), + ('■', '□'), + ('▮', '▶'), + ('▼', '◀'), + ('◆', '◇'), + ('◊', '○'), + ('●', '◓'), + ('◢', '◢'), + ('◤', '◤'), + ('◧', '◬'), + ('★', '☆'), + ('♀', '♀'), + ('♂', '♂'), + ('♠', '♣'), + ('♭', '♮'), + ('⟅', '⟆'), + ('⟦', '⟯'), + ('⦃', '⦘'), + ('⧘', '⧛'), + ('⧼', '⧽'), + ('﹡', '﹡'), + ('﹣', '﹣'), + ('﹨', '﹨'), + ('\', '\'), + ('^', '^'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), +]; + +pub const OTHER_UPPERCASE: &'static [(char, char)] = + &[('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉')]; + +pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ + ('!', '/'), + (':', '@'), + ('[', '^'), + ('`', '`'), + ('{', '~'), + ('¡', '§'), + ('©', '©'), + ('«', '¬'), + ('®', '®'), + ('°', '±'), + ('¶', '¶'), + ('»', '»'), + ('¿', '¿'), + ('×', '×'), + ('÷', '÷'), + ('‐', '‧'), + ('‰', '‾'), + ('⁁', '⁓'), + ('⁕', '⁞'), + ('←', '\u{245f}'), + ('─', '❵'), + ('➔', '⯿'), + ('⸀', '\u{2e7f}'), + ('、', '〃'), + ('〈', '〠'), + ('〰', '〰'), + ('﴾', '﴿'), + ('﹅', '﹆'), +]; + +pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{2029}'), +]; + +pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), +]; + +pub const QUOTATION_MARK: &'static [(char, char)] = &[ + ('"', '"'), + ('\'', '\''), + ('«', '«'), + ('»', '»'), + ('‘', '‟'), + ('‹', '›'), + ('⹂', '⹂'), + ('「', '』'), + ('〝', '〟'), + ('﹁', '﹄'), + ('"', '"'), + (''', '''), + ('「', '」'), +]; + +pub const RADICAL: &'static [(char, char)] = + &[('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕')]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; + +pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[ + ('!', '!'), + ('.', '.'), + ('?', '?'), + ('։', '։'), + ('؝', '؟'), + ('۔', '۔'), + ('܀', '܂'), + ('߹', '߹'), + ('࠷', '࠷'), + ('࠹', '࠹'), + ('࠽', '࠾'), + ('।', '॥'), + ('၊', '။'), + ('።', '።'), + ('፧', '፨'), + ('᙮', '᙮'), + ('᜵', '᜶'), + ('᠃', '᠃'), + ('᠉', '᠉'), + ('᥄', '᥅'), + ('᪨', '᪫'), + ('᭚', '᭛'), + ('᭞', '᭟'), + ('᭽', '᭾'), + ('᰻', '᰼'), + ('᱾', '᱿'), + ('‼', '‽'), + ('⁇', '⁉'), + ('⸮', '⸮'), + ('⸼', '⸼'), + ('⹓', '⹔'), + ('。', '。'), + ('꓿', '꓿'), + ('꘎', '꘏'), + ('꛳', '꛳'), + ('꛷', '꛷'), + ('꡶', '꡷'), + ('꣎', '꣏'), + ('꤯', '꤯'), + ('꧈', '꧉'), + ('꩝', '꩟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﹒', '﹒'), + ('﹖', '﹗'), + ('!', '!'), + ('.', '.'), + ('?', '?'), + ('。', '。'), + ('𐩖', '𐩗'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁈'), + ('𑂾', '𑃁'), + ('𑅁', '𑅃'), + ('𑇅', '𑇆'), + ('𑇍', '𑇍'), + ('𑇞', '𑇟'), + ('𑈸', '𑈹'), + ('𑈻', '𑈼'), + ('𑊩', '𑊩'), + ('𑑋', '𑑌'), + ('𑗂', '𑗃'), + ('𑗉', '𑗗'), + ('𑙁', '𑙂'), + ('𑜼', '𑜾'), + ('𑥄', '𑥄'), + ('𑥆', '𑥆'), + ('𑩂', '𑩃'), + ('𑪛', '𑪜'), + ('𑱁', '𑱂'), + ('𑻷', '𑻸'), + ('𑽃', '𑽄'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬸'), + ('𖭄', '𖭄'), + ('𖺘', '𖺘'), + ('𛲟', '𛲟'), + ('𝪈', '𝪈'), +]; + +pub const SOFT_DOTTED: &'static [(char, char)] = &[ + ('i', 'j'), + ('į', 'į'), + ('ɉ', 'ɉ'), + ('ɨ', 'ɨ'), + ('ʝ', 'ʝ'), + ('ʲ', 'ʲ'), + ('ϳ', 'ϳ'), + ('і', 'і'), + ('ј', 'ј'), + ('ᵢ', 'ᵢ'), + ('ᶖ', 'ᶖ'), + ('ᶤ', 'ᶤ'), + ('ᶨ', 'ᶨ'), + ('ḭ', 'ḭ'), + ('ị', 'ị'), + ('ⁱ', 'ⁱ'), + ('ⅈ', 'ⅉ'), + ('ⱼ', 'ⱼ'), + ('𝐢', '𝐣'), + ('𝑖', '𝑗'), + ('𝒊', '𝒋'), + ('𝒾', '𝒿'), + ('𝓲', '𝓳'), + ('𝔦', '𝔧'), + ('𝕚', '𝕛'), + ('𝖎', '𝖏'), + ('𝗂', '𝗃'), + ('𝗶', '𝗷'), + ('𝘪', '𝘫'), + ('𝙞', '𝙟'), + ('𝚒', '𝚓'), + ('𝼚', '𝼚'), + ('𞁌', '𞁍'), + ('𞁨', '𞁨'), +]; + +pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '!'), + (',', ','), + ('.', '.'), + (':', ';'), + ('?', '?'), + (';', ';'), + ('·', '·'), + ('։', '։'), + ('׃', '׃'), + ('،', '،'), + ('؛', '؛'), + ('؝', '؟'), + ('۔', '۔'), + ('܀', '܊'), + ('܌', '܌'), + ('߸', '߹'), + ('࠰', '࠾'), + ('࡞', '࡞'), + ('।', '॥'), + ('๚', '๛'), + ('༈', '༈'), + ('།', '༒'), + ('၊', '။'), + ('፡', '፨'), + ('᙮', '᙮'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('។', '៖'), + ('៚', '៚'), + ('᠂', '᠅'), + ('᠈', '᠉'), + ('᥄', '᥅'), + ('᪨', '᪫'), + ('᭚', '᭛'), + ('᭝', '᭟'), + ('᭽', '᭾'), + ('᰻', '᰿'), + ('᱾', '᱿'), + ('‼', '‽'), + ('⁇', '⁉'), + ('⸮', '⸮'), + ('⸼', '⸼'), + ('⹁', '⹁'), + ('⹌', '⹌'), + ('⹎', '⹏'), + ('⹓', '⹔'), + ('、', '。'), + ('꓾', '꓿'), + ('꘍', '꘏'), + ('꛳', '꛷'), + ('꡶', '꡷'), + ('꣎', '꣏'), + ('꤯', '꤯'), + ('꧇', '꧉'), + ('꩝', '꩟'), + ('꫟', '꫟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﹐', '﹒'), + ('﹔', '﹗'), + ('!', '!'), + (',', ','), + ('.', '.'), + (':', ';'), + ('?', '?'), + ('。', '。'), + ('、', '、'), + ('𐎟', '𐎟'), + ('𐏐', '𐏐'), + ('𐡗', '𐡗'), + ('𐤟', '𐤟'), + ('𐩖', '𐩗'), + ('𐫰', '𐫵'), + ('𐬺', '𐬿'), + ('𐮙', '𐮜'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁍'), + ('𑂾', '𑃁'), + ('𑅁', '𑅃'), + ('𑇅', '𑇆'), + ('𑇍', '𑇍'), + ('𑇞', '𑇟'), + ('𑈸', '𑈼'), + ('𑊩', '𑊩'), + ('𑑋', '𑑍'), + ('𑑚', '𑑛'), + ('𑗂', '𑗅'), + ('𑗉', '𑗗'), + ('𑙁', '𑙂'), + ('𑜼', '𑜾'), + ('𑥄', '𑥄'), + ('𑥆', '𑥆'), + ('𑩂', '𑩃'), + ('𑪛', '𑪜'), + ('𑪡', '𑪢'), + ('𑱁', '𑱃'), + ('𑱱', '𑱱'), + ('𑻷', '𑻸'), + ('𑽃', '𑽄'), + ('𒑰', '𒑴'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬹'), + ('𖭄', '𖭄'), + ('𖺗', '𖺘'), + ('𛲟', '𛲟'), + ('𝪇', '𝪊'), +]; + +pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[ + ('㐀', '䶿'), + ('一', '鿿'), + ('﨎', '﨏'), + ('﨑', '﨑'), + ('﨓', '﨔'), + ('﨟', '﨟'), + ('﨡', '﨡'), + ('﨣', '﨤'), + ('﨧', '﨩'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const UPPERCASE: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'DŽ'), + ('LJ', 'LJ'), + ('NJ', 'NJ'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℋ', 'ℍ'), + ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℰ', 'ℳ'), + ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𝐀', '𝐙'), + ('𝐴', '𝑍'), + ('𝑨', '𝒁'), + ('𝒜', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒵'), + ('𝓐', '𝓩'), + ('𝔄', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔸', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕬', '𝖅'), + ('𝖠', '𝖹'), + ('𝗔', '𝗭'), + ('𝘈', '𝘡'), + ('𝘼', '𝙕'), + ('𝙰', '𝚉'), + ('𝚨', '𝛀'), + ('𝛢', '𝛺'), + ('𝜜', '𝜴'), + ('𝝖', '𝝮'), + ('𝞐', '𝞨'), + ('𝟊', '𝟊'), + ('𞤀', '𞤡'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const VARIATION_SELECTOR: &'static [(char, char)] = &[ + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const XID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('·', '·'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', '\u{487}'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('፩', '፱'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧚'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﱝ'), + ('ﱤ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷹ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹱ', 'ﹱ'), + ('ﹳ', 'ﹳ'), + ('ﹷ', 'ﹷ'), + ('ﹹ', 'ﹹ'), + ('ﹻ', 'ﹻ'), + ('ﹽ', 'ﹽ'), + ('ﹿ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const XID_START: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'า'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'າ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﱝ'), + ('ﱤ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷹ'), + ('ﹱ', 'ﹱ'), + ('ﹳ', 'ﹳ'), + ('ﹷ', 'ﹷ'), + ('ﹹ', 'ﹹ'), + ('ﹻ', 'ﹻ'), + ('ﹽ', 'ﹽ'), + ('ﹿ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_names.rs b/vendor/regex-syntax/src/unicode_tables/property_names.rs new file mode 100644 index 0000000..599a123 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_names.rs @@ -0,0 +1,264 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-names ucd-15.0.0 +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[ + ("age", "Age"), + ("ahex", "ASCII_Hex_Digit"), + ("alpha", "Alphabetic"), + ("alphabetic", "Alphabetic"), + ("asciihexdigit", "ASCII_Hex_Digit"), + ("bc", "Bidi_Class"), + ("bidic", "Bidi_Control"), + ("bidiclass", "Bidi_Class"), + ("bidicontrol", "Bidi_Control"), + ("bidim", "Bidi_Mirrored"), + ("bidimirrored", "Bidi_Mirrored"), + ("bidimirroringglyph", "Bidi_Mirroring_Glyph"), + ("bidipairedbracket", "Bidi_Paired_Bracket"), + ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), + ("blk", "Block"), + ("block", "Block"), + ("bmg", "Bidi_Mirroring_Glyph"), + ("bpb", "Bidi_Paired_Bracket"), + ("bpt", "Bidi_Paired_Bracket_Type"), + ("canonicalcombiningclass", "Canonical_Combining_Class"), + ("cased", "Cased"), + ("casefolding", "Case_Folding"), + ("caseignorable", "Case_Ignorable"), + ("ccc", "Canonical_Combining_Class"), + ("ce", "Composition_Exclusion"), + ("cf", "Case_Folding"), + ("changeswhencasefolded", "Changes_When_Casefolded"), + ("changeswhencasemapped", "Changes_When_Casemapped"), + ("changeswhenlowercased", "Changes_When_Lowercased"), + ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"), + ("changeswhentitlecased", "Changes_When_Titlecased"), + ("changeswhenuppercased", "Changes_When_Uppercased"), + ("ci", "Case_Ignorable"), + ("cjkaccountingnumeric", "kAccountingNumeric"), + ("cjkcompatibilityvariant", "kCompatibilityVariant"), + ("cjkiicore", "kIICore"), + ("cjkirggsource", "kIRG_GSource"), + ("cjkirghsource", "kIRG_HSource"), + ("cjkirgjsource", "kIRG_JSource"), + ("cjkirgkpsource", "kIRG_KPSource"), + ("cjkirgksource", "kIRG_KSource"), + ("cjkirgmsource", "kIRG_MSource"), + ("cjkirgssource", "kIRG_SSource"), + ("cjkirgtsource", "kIRG_TSource"), + ("cjkirguksource", "kIRG_UKSource"), + ("cjkirgusource", "kIRG_USource"), + ("cjkirgvsource", "kIRG_VSource"), + ("cjkothernumeric", "kOtherNumeric"), + ("cjkprimarynumeric", "kPrimaryNumeric"), + ("cjkrsunicode", "kRSUnicode"), + ("compex", "Full_Composition_Exclusion"), + ("compositionexclusion", "Composition_Exclusion"), + ("cwcf", "Changes_When_Casefolded"), + ("cwcm", "Changes_When_Casemapped"), + ("cwkcf", "Changes_When_NFKC_Casefolded"), + ("cwl", "Changes_When_Lowercased"), + ("cwt", "Changes_When_Titlecased"), + ("cwu", "Changes_When_Uppercased"), + ("dash", "Dash"), + ("decompositionmapping", "Decomposition_Mapping"), + ("decompositiontype", "Decomposition_Type"), + ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"), + ("dep", "Deprecated"), + ("deprecated", "Deprecated"), + ("di", "Default_Ignorable_Code_Point"), + ("dia", "Diacritic"), + ("diacritic", "Diacritic"), + ("dm", "Decomposition_Mapping"), + ("dt", "Decomposition_Type"), + ("ea", "East_Asian_Width"), + ("eastasianwidth", "East_Asian_Width"), + ("ebase", "Emoji_Modifier_Base"), + ("ecomp", "Emoji_Component"), + ("emod", "Emoji_Modifier"), + ("emoji", "Emoji"), + ("emojicomponent", "Emoji_Component"), + ("emojimodifier", "Emoji_Modifier"), + ("emojimodifierbase", "Emoji_Modifier_Base"), + ("emojipresentation", "Emoji_Presentation"), + ("epres", "Emoji_Presentation"), + ("equideo", "Equivalent_Unified_Ideograph"), + ("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"), + ("expandsonnfc", "Expands_On_NFC"), + ("expandsonnfd", "Expands_On_NFD"), + ("expandsonnfkc", "Expands_On_NFKC"), + ("expandsonnfkd", "Expands_On_NFKD"), + ("ext", "Extender"), + ("extendedpictographic", "Extended_Pictographic"), + ("extender", "Extender"), + ("extpict", "Extended_Pictographic"), + ("fcnfkc", "FC_NFKC_Closure"), + ("fcnfkcclosure", "FC_NFKC_Closure"), + ("fullcompositionexclusion", "Full_Composition_Exclusion"), + ("gc", "General_Category"), + ("gcb", "Grapheme_Cluster_Break"), + ("generalcategory", "General_Category"), + ("graphemebase", "Grapheme_Base"), + ("graphemeclusterbreak", "Grapheme_Cluster_Break"), + ("graphemeextend", "Grapheme_Extend"), + ("graphemelink", "Grapheme_Link"), + ("grbase", "Grapheme_Base"), + ("grext", "Grapheme_Extend"), + ("grlink", "Grapheme_Link"), + ("hangulsyllabletype", "Hangul_Syllable_Type"), + ("hex", "Hex_Digit"), + ("hexdigit", "Hex_Digit"), + ("hst", "Hangul_Syllable_Type"), + ("hyphen", "Hyphen"), + ("idc", "ID_Continue"), + ("idcontinue", "ID_Continue"), + ("ideo", "Ideographic"), + ("ideographic", "Ideographic"), + ("ids", "ID_Start"), + ("idsb", "IDS_Binary_Operator"), + ("idsbinaryoperator", "IDS_Binary_Operator"), + ("idst", "IDS_Trinary_Operator"), + ("idstart", "ID_Start"), + ("idstrinaryoperator", "IDS_Trinary_Operator"), + ("indicpositionalcategory", "Indic_Positional_Category"), + ("indicsyllabiccategory", "Indic_Syllabic_Category"), + ("inpc", "Indic_Positional_Category"), + ("insc", "Indic_Syllabic_Category"), + ("isc", "ISO_Comment"), + ("jamoshortname", "Jamo_Short_Name"), + ("jg", "Joining_Group"), + ("joinc", "Join_Control"), + ("joincontrol", "Join_Control"), + ("joininggroup", "Joining_Group"), + ("joiningtype", "Joining_Type"), + ("jsn", "Jamo_Short_Name"), + ("jt", "Joining_Type"), + ("kaccountingnumeric", "kAccountingNumeric"), + ("kcompatibilityvariant", "kCompatibilityVariant"), + ("kiicore", "kIICore"), + ("kirggsource", "kIRG_GSource"), + ("kirghsource", "kIRG_HSource"), + ("kirgjsource", "kIRG_JSource"), + ("kirgkpsource", "kIRG_KPSource"), + ("kirgksource", "kIRG_KSource"), + ("kirgmsource", "kIRG_MSource"), + ("kirgssource", "kIRG_SSource"), + ("kirgtsource", "kIRG_TSource"), + ("kirguksource", "kIRG_UKSource"), + ("kirgusource", "kIRG_USource"), + ("kirgvsource", "kIRG_VSource"), + ("kothernumeric", "kOtherNumeric"), + ("kprimarynumeric", "kPrimaryNumeric"), + ("krsunicode", "kRSUnicode"), + ("lb", "Line_Break"), + ("lc", "Lowercase_Mapping"), + ("linebreak", "Line_Break"), + ("loe", "Logical_Order_Exception"), + ("logicalorderexception", "Logical_Order_Exception"), + ("lower", "Lowercase"), + ("lowercase", "Lowercase"), + ("lowercasemapping", "Lowercase_Mapping"), + ("math", "Math"), + ("na", "Name"), + ("na1", "Unicode_1_Name"), + ("name", "Name"), + ("namealias", "Name_Alias"), + ("nchar", "Noncharacter_Code_Point"), + ("nfcqc", "NFC_Quick_Check"), + ("nfcquickcheck", "NFC_Quick_Check"), + ("nfdqc", "NFD_Quick_Check"), + ("nfdquickcheck", "NFD_Quick_Check"), + ("nfkccasefold", "NFKC_Casefold"), + ("nfkccf", "NFKC_Casefold"), + ("nfkcqc", "NFKC_Quick_Check"), + ("nfkcquickcheck", "NFKC_Quick_Check"), + ("nfkdqc", "NFKD_Quick_Check"), + ("nfkdquickcheck", "NFKD_Quick_Check"), + ("noncharactercodepoint", "Noncharacter_Code_Point"), + ("nt", "Numeric_Type"), + ("numerictype", "Numeric_Type"), + ("numericvalue", "Numeric_Value"), + ("nv", "Numeric_Value"), + ("oalpha", "Other_Alphabetic"), + ("ocomment", "ISO_Comment"), + ("odi", "Other_Default_Ignorable_Code_Point"), + ("ogrext", "Other_Grapheme_Extend"), + ("oidc", "Other_ID_Continue"), + ("oids", "Other_ID_Start"), + ("olower", "Other_Lowercase"), + ("omath", "Other_Math"), + ("otheralphabetic", "Other_Alphabetic"), + ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"), + ("othergraphemeextend", "Other_Grapheme_Extend"), + ("otheridcontinue", "Other_ID_Continue"), + ("otheridstart", "Other_ID_Start"), + ("otherlowercase", "Other_Lowercase"), + ("othermath", "Other_Math"), + ("otheruppercase", "Other_Uppercase"), + ("oupper", "Other_Uppercase"), + ("patsyn", "Pattern_Syntax"), + ("patternsyntax", "Pattern_Syntax"), + ("patternwhitespace", "Pattern_White_Space"), + ("patws", "Pattern_White_Space"), + ("pcm", "Prepended_Concatenation_Mark"), + ("prependedconcatenationmark", "Prepended_Concatenation_Mark"), + ("qmark", "Quotation_Mark"), + ("quotationmark", "Quotation_Mark"), + ("radical", "Radical"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("sb", "Sentence_Break"), + ("sc", "Script"), + ("scf", "Simple_Case_Folding"), + ("script", "Script"), + ("scriptextensions", "Script_Extensions"), + ("scx", "Script_Extensions"), + ("sd", "Soft_Dotted"), + ("sentencebreak", "Sentence_Break"), + ("sentenceterminal", "Sentence_Terminal"), + ("sfc", "Simple_Case_Folding"), + ("simplecasefolding", "Simple_Case_Folding"), + ("simplelowercasemapping", "Simple_Lowercase_Mapping"), + ("simpletitlecasemapping", "Simple_Titlecase_Mapping"), + ("simpleuppercasemapping", "Simple_Uppercase_Mapping"), + ("slc", "Simple_Lowercase_Mapping"), + ("softdotted", "Soft_Dotted"), + ("space", "White_Space"), + ("stc", "Simple_Titlecase_Mapping"), + ("sterm", "Sentence_Terminal"), + ("suc", "Simple_Uppercase_Mapping"), + ("tc", "Titlecase_Mapping"), + ("term", "Terminal_Punctuation"), + ("terminalpunctuation", "Terminal_Punctuation"), + ("titlecasemapping", "Titlecase_Mapping"), + ("uc", "Uppercase_Mapping"), + ("uideo", "Unified_Ideograph"), + ("unicode1name", "Unicode_1_Name"), + ("unicoderadicalstroke", "kRSUnicode"), + ("unifiedideograph", "Unified_Ideograph"), + ("upper", "Uppercase"), + ("uppercase", "Uppercase"), + ("uppercasemapping", "Uppercase_Mapping"), + ("urs", "kRSUnicode"), + ("variationselector", "Variation_Selector"), + ("verticalorientation", "Vertical_Orientation"), + ("vo", "Vertical_Orientation"), + ("vs", "Variation_Selector"), + ("wb", "Word_Break"), + ("whitespace", "White_Space"), + ("wordbreak", "Word_Break"), + ("wspace", "White_Space"), + ("xidc", "XID_Continue"), + ("xidcontinue", "XID_Continue"), + ("xids", "XID_Start"), + ("xidstart", "XID_Start"), + ("xonfc", "Expands_On_NFC"), + ("xonfd", "Expands_On_NFD"), + ("xonfkc", "Expands_On_NFKC"), + ("xonfkd", "Expands_On_NFKD"), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_values.rs b/vendor/regex-syntax/src/unicode_tables/property_values.rs new file mode 100644 index 0000000..cb2d32f --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_values.rs @@ -0,0 +1,924 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-values ucd-15.0.0 --include gc,script,scx,age,gcb,wb,sb +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const PROPERTY_VALUES: &'static [( + &'static str, + &'static [(&'static str, &'static str)], +)] = &[ + ( + "Age", + &[ + ("1.1", "V1_1"), + ("10.0", "V10_0"), + ("11.0", "V11_0"), + ("12.0", "V12_0"), + ("12.1", "V12_1"), + ("13.0", "V13_0"), + ("14.0", "V14_0"), + ("15.0", "V15_0"), + ("2.0", "V2_0"), + ("2.1", "V2_1"), + ("3.0", "V3_0"), + ("3.1", "V3_1"), + ("3.2", "V3_2"), + ("4.0", "V4_0"), + ("4.1", "V4_1"), + ("5.0", "V5_0"), + ("5.1", "V5_1"), + ("5.2", "V5_2"), + ("6.0", "V6_0"), + ("6.1", "V6_1"), + ("6.2", "V6_2"), + ("6.3", "V6_3"), + ("7.0", "V7_0"), + ("8.0", "V8_0"), + ("9.0", "V9_0"), + ("na", "Unassigned"), + ("unassigned", "Unassigned"), + ("v100", "V10_0"), + ("v11", "V1_1"), + ("v110", "V11_0"), + ("v120", "V12_0"), + ("v121", "V12_1"), + ("v130", "V13_0"), + ("v140", "V14_0"), + ("v150", "V15_0"), + ("v20", "V2_0"), + ("v21", "V2_1"), + ("v30", "V3_0"), + ("v31", "V3_1"), + ("v32", "V3_2"), + ("v40", "V4_0"), + ("v41", "V4_1"), + ("v50", "V5_0"), + ("v51", "V5_1"), + ("v52", "V5_2"), + ("v60", "V6_0"), + ("v61", "V6_1"), + ("v62", "V6_2"), + ("v63", "V6_3"), + ("v70", "V7_0"), + ("v80", "V8_0"), + ("v90", "V9_0"), + ], + ), + ( + "General_Category", + &[ + ("c", "Other"), + ("casedletter", "Cased_Letter"), + ("cc", "Control"), + ("cf", "Format"), + ("closepunctuation", "Close_Punctuation"), + ("cn", "Unassigned"), + ("cntrl", "Control"), + ("co", "Private_Use"), + ("combiningmark", "Mark"), + ("connectorpunctuation", "Connector_Punctuation"), + ("control", "Control"), + ("cs", "Surrogate"), + ("currencysymbol", "Currency_Symbol"), + ("dashpunctuation", "Dash_Punctuation"), + ("decimalnumber", "Decimal_Number"), + ("digit", "Decimal_Number"), + ("enclosingmark", "Enclosing_Mark"), + ("finalpunctuation", "Final_Punctuation"), + ("format", "Format"), + ("initialpunctuation", "Initial_Punctuation"), + ("l", "Letter"), + ("lc", "Cased_Letter"), + ("letter", "Letter"), + ("letternumber", "Letter_Number"), + ("lineseparator", "Line_Separator"), + ("ll", "Lowercase_Letter"), + ("lm", "Modifier_Letter"), + ("lo", "Other_Letter"), + ("lowercaseletter", "Lowercase_Letter"), + ("lt", "Titlecase_Letter"), + ("lu", "Uppercase_Letter"), + ("m", "Mark"), + ("mark", "Mark"), + ("mathsymbol", "Math_Symbol"), + ("mc", "Spacing_Mark"), + ("me", "Enclosing_Mark"), + ("mn", "Nonspacing_Mark"), + ("modifierletter", "Modifier_Letter"), + ("modifiersymbol", "Modifier_Symbol"), + ("n", "Number"), + ("nd", "Decimal_Number"), + ("nl", "Letter_Number"), + ("no", "Other_Number"), + ("nonspacingmark", "Nonspacing_Mark"), + ("number", "Number"), + ("openpunctuation", "Open_Punctuation"), + ("other", "Other"), + ("otherletter", "Other_Letter"), + ("othernumber", "Other_Number"), + ("otherpunctuation", "Other_Punctuation"), + ("othersymbol", "Other_Symbol"), + ("p", "Punctuation"), + ("paragraphseparator", "Paragraph_Separator"), + ("pc", "Connector_Punctuation"), + ("pd", "Dash_Punctuation"), + ("pe", "Close_Punctuation"), + ("pf", "Final_Punctuation"), + ("pi", "Initial_Punctuation"), + ("po", "Other_Punctuation"), + ("privateuse", "Private_Use"), + ("ps", "Open_Punctuation"), + ("punct", "Punctuation"), + ("punctuation", "Punctuation"), + ("s", "Symbol"), + ("sc", "Currency_Symbol"), + ("separator", "Separator"), + ("sk", "Modifier_Symbol"), + ("sm", "Math_Symbol"), + ("so", "Other_Symbol"), + ("spaceseparator", "Space_Separator"), + ("spacingmark", "Spacing_Mark"), + ("surrogate", "Surrogate"), + ("symbol", "Symbol"), + ("titlecaseletter", "Titlecase_Letter"), + ("unassigned", "Unassigned"), + ("uppercaseletter", "Uppercase_Letter"), + ("z", "Separator"), + ("zl", "Line_Separator"), + ("zp", "Paragraph_Separator"), + ("zs", "Space_Separator"), + ], + ), + ( + "Grapheme_Cluster_Break", + &[ + ("cn", "Control"), + ("control", "Control"), + ("cr", "CR"), + ("eb", "E_Base"), + ("ebase", "E_Base"), + ("ebasegaz", "E_Base_GAZ"), + ("ebg", "E_Base_GAZ"), + ("em", "E_Modifier"), + ("emodifier", "E_Modifier"), + ("ex", "Extend"), + ("extend", "Extend"), + ("gaz", "Glue_After_Zwj"), + ("glueafterzwj", "Glue_After_Zwj"), + ("l", "L"), + ("lf", "LF"), + ("lv", "LV"), + ("lvt", "LVT"), + ("other", "Other"), + ("pp", "Prepend"), + ("prepend", "Prepend"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("sm", "SpacingMark"), + ("spacingmark", "SpacingMark"), + ("t", "T"), + ("v", "V"), + ("xx", "Other"), + ("zwj", "ZWJ"), + ], + ), + ( + "Script", + &[ + ("adlam", "Adlam"), + ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), + ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), + ("arab", "Arabic"), + ("arabic", "Arabic"), + ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), + ("armn", "Armenian"), + ("avestan", "Avestan"), + ("avst", "Avestan"), + ("bali", "Balinese"), + ("balinese", "Balinese"), + ("bamu", "Bamum"), + ("bamum", "Bamum"), + ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), + ("batak", "Batak"), + ("batk", "Batak"), + ("beng", "Bengali"), + ("bengali", "Bengali"), + ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), + ("bopo", "Bopomofo"), + ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), + ("brahmi", "Brahmi"), + ("brai", "Braille"), + ("braille", "Braille"), + ("bugi", "Buginese"), + ("buginese", "Buginese"), + ("buhd", "Buhid"), + ("buhid", "Buhid"), + ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), + ("cari", "Carian"), + ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), + ("chakma", "Chakma"), + ("cham", "Cham"), + ("cher", "Cherokee"), + ("cherokee", "Cherokee"), + ("chorasmian", "Chorasmian"), + ("chrs", "Chorasmian"), + ("common", "Common"), + ("copt", "Coptic"), + ("coptic", "Coptic"), + ("cpmn", "Cypro_Minoan"), + ("cprt", "Cypriot"), + ("cuneiform", "Cuneiform"), + ("cypriot", "Cypriot"), + ("cyprominoan", "Cypro_Minoan"), + ("cyrillic", "Cyrillic"), + ("cyrl", "Cyrillic"), + ("deseret", "Deseret"), + ("deva", "Devanagari"), + ("devanagari", "Devanagari"), + ("diak", "Dives_Akuru"), + ("divesakuru", "Dives_Akuru"), + ("dogr", "Dogra"), + ("dogra", "Dogra"), + ("dsrt", "Deseret"), + ("dupl", "Duployan"), + ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), + ("elba", "Elbasan"), + ("elbasan", "Elbasan"), + ("elym", "Elymaic"), + ("elymaic", "Elymaic"), + ("ethi", "Ethiopic"), + ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), + ("georgian", "Georgian"), + ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), + ("gong", "Gunjala_Gondi"), + ("gonm", "Masaram_Gondi"), + ("goth", "Gothic"), + ("gothic", "Gothic"), + ("gran", "Grantha"), + ("grantha", "Grantha"), + ("greek", "Greek"), + ("grek", "Greek"), + ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), + ("gunjalagondi", "Gunjala_Gondi"), + ("gurmukhi", "Gurmukhi"), + ("guru", "Gurmukhi"), + ("han", "Han"), + ("hang", "Hangul"), + ("hangul", "Hangul"), + ("hani", "Han"), + ("hanifirohingya", "Hanifi_Rohingya"), + ("hano", "Hanunoo"), + ("hanunoo", "Hanunoo"), + ("hatr", "Hatran"), + ("hatran", "Hatran"), + ("hebr", "Hebrew"), + ("hebrew", "Hebrew"), + ("hira", "Hiragana"), + ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), + ("hmng", "Pahawh_Hmong"), + ("hmnp", "Nyiakeng_Puachue_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), + ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), + ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), + ("ital", "Old_Italic"), + ("java", "Javanese"), + ("javanese", "Javanese"), + ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), + ("kana", "Katakana"), + ("kannada", "Kannada"), + ("katakana", "Katakana"), + ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kawi", "Kawi"), + ("kayahli", "Kayah_Li"), + ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), + ("khitansmallscript", "Khitan_Small_Script"), + ("khmer", "Khmer"), + ("khmr", "Khmer"), + ("khoj", "Khojki"), + ("khojki", "Khojki"), + ("khudawadi", "Khudawadi"), + ("kits", "Khitan_Small_Script"), + ("knda", "Kannada"), + ("kthi", "Kaithi"), + ("lana", "Tai_Tham"), + ("lao", "Lao"), + ("laoo", "Lao"), + ("latin", "Latin"), + ("latn", "Latin"), + ("lepc", "Lepcha"), + ("lepcha", "Lepcha"), + ("limb", "Limbu"), + ("limbu", "Limbu"), + ("lina", "Linear_A"), + ("linb", "Linear_B"), + ("lineara", "Linear_A"), + ("linearb", "Linear_B"), + ("lisu", "Lisu"), + ("lyci", "Lycian"), + ("lycian", "Lycian"), + ("lydi", "Lydian"), + ("lydian", "Lydian"), + ("mahajani", "Mahajani"), + ("mahj", "Mahajani"), + ("maka", "Makasar"), + ("makasar", "Makasar"), + ("malayalam", "Malayalam"), + ("mand", "Mandaic"), + ("mandaic", "Mandaic"), + ("mani", "Manichaean"), + ("manichaean", "Manichaean"), + ("marc", "Marchen"), + ("marchen", "Marchen"), + ("masaramgondi", "Masaram_Gondi"), + ("medefaidrin", "Medefaidrin"), + ("medf", "Medefaidrin"), + ("meeteimayek", "Meetei_Mayek"), + ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), + ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), + ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), + ("miao", "Miao"), + ("mlym", "Malayalam"), + ("modi", "Modi"), + ("mong", "Mongolian"), + ("mongolian", "Mongolian"), + ("mro", "Mro"), + ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), + ("mult", "Multani"), + ("multani", "Multani"), + ("myanmar", "Myanmar"), + ("mymr", "Myanmar"), + ("nabataean", "Nabataean"), + ("nagm", "Nag_Mundari"), + ("nagmundari", "Nag_Mundari"), + ("nand", "Nandinagari"), + ("nandinagari", "Nandinagari"), + ("narb", "Old_North_Arabian"), + ("nbat", "Nabataean"), + ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), + ("nko", "Nko"), + ("nkoo", "Nko"), + ("nshu", "Nushu"), + ("nushu", "Nushu"), + ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), + ("ogam", "Ogham"), + ("ogham", "Ogham"), + ("olchiki", "Ol_Chiki"), + ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), + ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), + ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), + ("oldsogdian", "Old_Sogdian"), + ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), + ("olduyghur", "Old_Uyghur"), + ("oriya", "Oriya"), + ("orkh", "Old_Turkic"), + ("orya", "Oriya"), + ("osage", "Osage"), + ("osge", "Osage"), + ("osma", "Osmanya"), + ("osmanya", "Osmanya"), + ("ougr", "Old_Uyghur"), + ("pahawhhmong", "Pahawh_Hmong"), + ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), + ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), + ("perm", "Old_Permic"), + ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), + ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), + ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), + ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), + ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), + ("qaai", "Inherited"), + ("rejang", "Rejang"), + ("rjng", "Rejang"), + ("rohg", "Hanifi_Rohingya"), + ("runic", "Runic"), + ("runr", "Runic"), + ("samaritan", "Samaritan"), + ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), + ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), + ("sgnw", "SignWriting"), + ("sharada", "Sharada"), + ("shavian", "Shavian"), + ("shaw", "Shavian"), + ("shrd", "Sharada"), + ("sidd", "Siddham"), + ("siddham", "Siddham"), + ("signwriting", "SignWriting"), + ("sind", "Khudawadi"), + ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), + ("sogd", "Sogdian"), + ("sogdian", "Sogdian"), + ("sogo", "Old_Sogdian"), + ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), + ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), + ("sund", "Sundanese"), + ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), + ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), + ("syriac", "Syriac"), + ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), + ("tagbanwa", "Tagbanwa"), + ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), + ("taiviet", "Tai_Viet"), + ("takr", "Takri"), + ("takri", "Takri"), + ("tale", "Tai_Le"), + ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), + ("taml", "Tamil"), + ("tang", "Tangut"), + ("tangsa", "Tangsa"), + ("tangut", "Tangut"), + ("tavt", "Tai_Viet"), + ("telu", "Telugu"), + ("telugu", "Telugu"), + ("tfng", "Tifinagh"), + ("tglg", "Tagalog"), + ("thaa", "Thaana"), + ("thaana", "Thaana"), + ("thai", "Thai"), + ("tibetan", "Tibetan"), + ("tibt", "Tibetan"), + ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), + ("tirhuta", "Tirhuta"), + ("tnsa", "Tangsa"), + ("toto", "Toto"), + ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), + ("unknown", "Unknown"), + ("vai", "Vai"), + ("vaii", "Vai"), + ("vith", "Vithkuqi"), + ("vithkuqi", "Vithkuqi"), + ("wancho", "Wancho"), + ("wara", "Warang_Citi"), + ("warangciti", "Warang_Citi"), + ("wcho", "Wancho"), + ("xpeo", "Old_Persian"), + ("xsux", "Cuneiform"), + ("yezi", "Yezidi"), + ("yezidi", "Yezidi"), + ("yi", "Yi"), + ("yiii", "Yi"), + ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), + ("zinh", "Inherited"), + ("zyyy", "Common"), + ("zzzz", "Unknown"), + ], + ), + ( + "Script_Extensions", + &[ + ("adlam", "Adlam"), + ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), + ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), + ("arab", "Arabic"), + ("arabic", "Arabic"), + ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), + ("armn", "Armenian"), + ("avestan", "Avestan"), + ("avst", "Avestan"), + ("bali", "Balinese"), + ("balinese", "Balinese"), + ("bamu", "Bamum"), + ("bamum", "Bamum"), + ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), + ("batak", "Batak"), + ("batk", "Batak"), + ("beng", "Bengali"), + ("bengali", "Bengali"), + ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), + ("bopo", "Bopomofo"), + ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), + ("brahmi", "Brahmi"), + ("brai", "Braille"), + ("braille", "Braille"), + ("bugi", "Buginese"), + ("buginese", "Buginese"), + ("buhd", "Buhid"), + ("buhid", "Buhid"), + ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), + ("cari", "Carian"), + ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), + ("chakma", "Chakma"), + ("cham", "Cham"), + ("cher", "Cherokee"), + ("cherokee", "Cherokee"), + ("chorasmian", "Chorasmian"), + ("chrs", "Chorasmian"), + ("common", "Common"), + ("copt", "Coptic"), + ("coptic", "Coptic"), + ("cpmn", "Cypro_Minoan"), + ("cprt", "Cypriot"), + ("cuneiform", "Cuneiform"), + ("cypriot", "Cypriot"), + ("cyprominoan", "Cypro_Minoan"), + ("cyrillic", "Cyrillic"), + ("cyrl", "Cyrillic"), + ("deseret", "Deseret"), + ("deva", "Devanagari"), + ("devanagari", "Devanagari"), + ("diak", "Dives_Akuru"), + ("divesakuru", "Dives_Akuru"), + ("dogr", "Dogra"), + ("dogra", "Dogra"), + ("dsrt", "Deseret"), + ("dupl", "Duployan"), + ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), + ("elba", "Elbasan"), + ("elbasan", "Elbasan"), + ("elym", "Elymaic"), + ("elymaic", "Elymaic"), + ("ethi", "Ethiopic"), + ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), + ("georgian", "Georgian"), + ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), + ("gong", "Gunjala_Gondi"), + ("gonm", "Masaram_Gondi"), + ("goth", "Gothic"), + ("gothic", "Gothic"), + ("gran", "Grantha"), + ("grantha", "Grantha"), + ("greek", "Greek"), + ("grek", "Greek"), + ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), + ("gunjalagondi", "Gunjala_Gondi"), + ("gurmukhi", "Gurmukhi"), + ("guru", "Gurmukhi"), + ("han", "Han"), + ("hang", "Hangul"), + ("hangul", "Hangul"), + ("hani", "Han"), + ("hanifirohingya", "Hanifi_Rohingya"), + ("hano", "Hanunoo"), + ("hanunoo", "Hanunoo"), + ("hatr", "Hatran"), + ("hatran", "Hatran"), + ("hebr", "Hebrew"), + ("hebrew", "Hebrew"), + ("hira", "Hiragana"), + ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), + ("hmng", "Pahawh_Hmong"), + ("hmnp", "Nyiakeng_Puachue_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), + ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), + ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), + ("ital", "Old_Italic"), + ("java", "Javanese"), + ("javanese", "Javanese"), + ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), + ("kana", "Katakana"), + ("kannada", "Kannada"), + ("katakana", "Katakana"), + ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kawi", "Kawi"), + ("kayahli", "Kayah_Li"), + ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), + ("khitansmallscript", "Khitan_Small_Script"), + ("khmer", "Khmer"), + ("khmr", "Khmer"), + ("khoj", "Khojki"), + ("khojki", "Khojki"), + ("khudawadi", "Khudawadi"), + ("kits", "Khitan_Small_Script"), + ("knda", "Kannada"), + ("kthi", "Kaithi"), + ("lana", "Tai_Tham"), + ("lao", "Lao"), + ("laoo", "Lao"), + ("latin", "Latin"), + ("latn", "Latin"), + ("lepc", "Lepcha"), + ("lepcha", "Lepcha"), + ("limb", "Limbu"), + ("limbu", "Limbu"), + ("lina", "Linear_A"), + ("linb", "Linear_B"), + ("lineara", "Linear_A"), + ("linearb", "Linear_B"), + ("lisu", "Lisu"), + ("lyci", "Lycian"), + ("lycian", "Lycian"), + ("lydi", "Lydian"), + ("lydian", "Lydian"), + ("mahajani", "Mahajani"), + ("mahj", "Mahajani"), + ("maka", "Makasar"), + ("makasar", "Makasar"), + ("malayalam", "Malayalam"), + ("mand", "Mandaic"), + ("mandaic", "Mandaic"), + ("mani", "Manichaean"), + ("manichaean", "Manichaean"), + ("marc", "Marchen"), + ("marchen", "Marchen"), + ("masaramgondi", "Masaram_Gondi"), + ("medefaidrin", "Medefaidrin"), + ("medf", "Medefaidrin"), + ("meeteimayek", "Meetei_Mayek"), + ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), + ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), + ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), + ("miao", "Miao"), + ("mlym", "Malayalam"), + ("modi", "Modi"), + ("mong", "Mongolian"), + ("mongolian", "Mongolian"), + ("mro", "Mro"), + ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), + ("mult", "Multani"), + ("multani", "Multani"), + ("myanmar", "Myanmar"), + ("mymr", "Myanmar"), + ("nabataean", "Nabataean"), + ("nagm", "Nag_Mundari"), + ("nagmundari", "Nag_Mundari"), + ("nand", "Nandinagari"), + ("nandinagari", "Nandinagari"), + ("narb", "Old_North_Arabian"), + ("nbat", "Nabataean"), + ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), + ("nko", "Nko"), + ("nkoo", "Nko"), + ("nshu", "Nushu"), + ("nushu", "Nushu"), + ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), + ("ogam", "Ogham"), + ("ogham", "Ogham"), + ("olchiki", "Ol_Chiki"), + ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), + ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), + ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), + ("oldsogdian", "Old_Sogdian"), + ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), + ("olduyghur", "Old_Uyghur"), + ("oriya", "Oriya"), + ("orkh", "Old_Turkic"), + ("orya", "Oriya"), + ("osage", "Osage"), + ("osge", "Osage"), + ("osma", "Osmanya"), + ("osmanya", "Osmanya"), + ("ougr", "Old_Uyghur"), + ("pahawhhmong", "Pahawh_Hmong"), + ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), + ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), + ("perm", "Old_Permic"), + ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), + ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), + ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), + ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), + ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), + ("qaai", "Inherited"), + ("rejang", "Rejang"), + ("rjng", "Rejang"), + ("rohg", "Hanifi_Rohingya"), + ("runic", "Runic"), + ("runr", "Runic"), + ("samaritan", "Samaritan"), + ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), + ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), + ("sgnw", "SignWriting"), + ("sharada", "Sharada"), + ("shavian", "Shavian"), + ("shaw", "Shavian"), + ("shrd", "Sharada"), + ("sidd", "Siddham"), + ("siddham", "Siddham"), + ("signwriting", "SignWriting"), + ("sind", "Khudawadi"), + ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), + ("sogd", "Sogdian"), + ("sogdian", "Sogdian"), + ("sogo", "Old_Sogdian"), + ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), + ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), + ("sund", "Sundanese"), + ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), + ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), + ("syriac", "Syriac"), + ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), + ("tagbanwa", "Tagbanwa"), + ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), + ("taiviet", "Tai_Viet"), + ("takr", "Takri"), + ("takri", "Takri"), + ("tale", "Tai_Le"), + ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), + ("taml", "Tamil"), + ("tang", "Tangut"), + ("tangsa", "Tangsa"), + ("tangut", "Tangut"), + ("tavt", "Tai_Viet"), + ("telu", "Telugu"), + ("telugu", "Telugu"), + ("tfng", "Tifinagh"), + ("tglg", "Tagalog"), + ("thaa", "Thaana"), + ("thaana", "Thaana"), + ("thai", "Thai"), + ("tibetan", "Tibetan"), + ("tibt", "Tibetan"), + ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), + ("tirhuta", "Tirhuta"), + ("tnsa", "Tangsa"), + ("toto", "Toto"), + ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), + ("unknown", "Unknown"), + ("vai", "Vai"), + ("vaii", "Vai"), + ("vith", "Vithkuqi"), + ("vithkuqi", "Vithkuqi"), + ("wancho", "Wancho"), + ("wara", "Warang_Citi"), + ("warangciti", "Warang_Citi"), + ("wcho", "Wancho"), + ("xpeo", "Old_Persian"), + ("xsux", "Cuneiform"), + ("yezi", "Yezidi"), + ("yezidi", "Yezidi"), + ("yi", "Yi"), + ("yiii", "Yi"), + ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), + ("zinh", "Inherited"), + ("zyyy", "Common"), + ("zzzz", "Unknown"), + ], + ), + ( + "Sentence_Break", + &[ + ("at", "ATerm"), + ("aterm", "ATerm"), + ("cl", "Close"), + ("close", "Close"), + ("cr", "CR"), + ("ex", "Extend"), + ("extend", "Extend"), + ("fo", "Format"), + ("format", "Format"), + ("le", "OLetter"), + ("lf", "LF"), + ("lo", "Lower"), + ("lower", "Lower"), + ("nu", "Numeric"), + ("numeric", "Numeric"), + ("oletter", "OLetter"), + ("other", "Other"), + ("sc", "SContinue"), + ("scontinue", "SContinue"), + ("se", "Sep"), + ("sep", "Sep"), + ("sp", "Sp"), + ("st", "STerm"), + ("sterm", "STerm"), + ("up", "Upper"), + ("upper", "Upper"), + ("xx", "Other"), + ], + ), + ( + "Word_Break", + &[ + ("aletter", "ALetter"), + ("cr", "CR"), + ("doublequote", "Double_Quote"), + ("dq", "Double_Quote"), + ("eb", "E_Base"), + ("ebase", "E_Base"), + ("ebasegaz", "E_Base_GAZ"), + ("ebg", "E_Base_GAZ"), + ("em", "E_Modifier"), + ("emodifier", "E_Modifier"), + ("ex", "ExtendNumLet"), + ("extend", "Extend"), + ("extendnumlet", "ExtendNumLet"), + ("fo", "Format"), + ("format", "Format"), + ("gaz", "Glue_After_Zwj"), + ("glueafterzwj", "Glue_After_Zwj"), + ("hebrewletter", "Hebrew_Letter"), + ("hl", "Hebrew_Letter"), + ("ka", "Katakana"), + ("katakana", "Katakana"), + ("le", "ALetter"), + ("lf", "LF"), + ("mb", "MidNumLet"), + ("midletter", "MidLetter"), + ("midnum", "MidNum"), + ("midnumlet", "MidNumLet"), + ("ml", "MidLetter"), + ("mn", "MidNum"), + ("newline", "Newline"), + ("nl", "Newline"), + ("nu", "Numeric"), + ("numeric", "Numeric"), + ("other", "Other"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("singlequote", "Single_Quote"), + ("sq", "Single_Quote"), + ("wsegspace", "WSegSpace"), + ("xx", "Other"), + ("zwj", "ZWJ"), + ], + ), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/script.rs b/vendor/regex-syntax/src/unicode_tables/script.rs new file mode 100644 index 0000000..cc5c400 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/script.rs @@ -0,0 +1,1263 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cypro_Minoan", CYPRO_MINOAN), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kawi", KAWI), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nag_Mundari", NAG_MUNDARI), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Old_Uyghur", OLD_UYGHUR), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangsa", TANGSA), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Toto", TOTO), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Vithkuqi", VITHKUQI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = + &[('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')]; + +pub const AHOM: &'static [(char, char)] = + &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), + ('؆', '؋'), + ('؍', '\u{61a}'), + ('\u{61c}', '؞'), + ('ؠ', 'ؿ'), + ('ف', 'ي'), + ('\u{656}', 'ٯ'), + ('ٱ', '\u{6dc}'), + ('۞', 'ۿ'), + ('ݐ', 'ݿ'), + ('ࡰ', 'ࢎ'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{8ff}'), + ('ﭐ', '﯂'), + ('ﯓ', 'ﴽ'), + ('﵀', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('﷏', '﷏'), + ('ﷰ', '﷿'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('𐹠', '𐹾'), + ('\u{10efd}', '\u{10eff}'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = + &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; + +pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; + +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')]; + +pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; + +pub const BASSA_VAH: &'static [(char, char)] = + &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')]; + +pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('ঀ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', '\u{9fe}'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = + &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; + +pub const BOPOMOFO: &'static [(char, char)] = + &[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆿ')]; + +pub const BRAHMI: &'static [(char, char)] = + &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')]; + +pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; + +pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')]; + +pub const BUHID: &'static [(char, char)] = &[('ᝀ', '\u{1753}')]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = + &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')]; + +pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = + &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; + +pub const CHAKMA: &'static [(char, char)] = + &[('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')]; + +pub const CHAM: &'static [(char, char)] = + &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; + +pub const CHEROKEE: &'static [(char, char)] = + &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')]; + +pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\0', '@'), + ('[', '`'), + ('{', '©'), + ('«', '¹'), + ('»', '¿'), + ('×', '×'), + ('÷', '÷'), + ('ʹ', '˟'), + ('˥', '˩'), + ('ˬ', '˿'), + ('ʹ', 'ʹ'), + (';', ';'), + ('΅', '΅'), + ('·', '·'), + ('\u{605}', '\u{605}'), + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('ـ', 'ـ'), + ('\u{6dd}', '\u{6dd}'), + ('\u{8e2}', '\u{8e2}'), + ('।', '॥'), + ('฿', '฿'), + ('࿕', '࿘'), + ('჻', '჻'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('᠂', '᠃'), + ('᠅', '᠅'), + ('᳓', '᳓'), + ('᳡', '᳡'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', '᳷'), + ('ᳺ', 'ᳺ'), + ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{2064}'), + ('\u{2066}', '⁰'), + ('⁴', '⁾'), + ('₀', '₎'), + ('₠', '⃀'), + ('℀', '℥'), + ('℧', '℩'), + ('ℬ', 'ℱ'), + ('ℳ', '⅍'), + ('⅏', '⅟'), + ('↉', '↋'), + ('←', '␦'), + ('⑀', '⑊'), + ('①', '⟿'), + ('⤀', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⸀', '⹝'), + ('⿰', '⿻'), + ('\u{3000}', '〄'), + ('〆', '〆'), + ('〈', '〠'), + ('〰', '〷'), + ('〼', '〿'), + ('゛', '゜'), + ('゠', '゠'), + ('・', 'ー'), + ('㆐', '㆟'), + ('㇀', '㇣'), + ('㈠', '㉟'), + ('㉿', '㋏'), + ('㋿', '㋿'), + ('㍘', '㏿'), + ('䷀', '䷿'), + ('꜀', '꜡'), + ('ꞈ', '꞊'), + ('꠰', '꠹'), + ('꤮', '꤮'), + ('ꧏ', 'ꧏ'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('﴾', '﴿'), + ('︐', '︙'), + ('︰', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('\u{feff}', '\u{feff}'), + ('!', '@'), + ('[', '`'), + ('{', '・'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + ('¢', '₩'), + ('│', '○'), + ('\u{fff9}', '�'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), + ('𐆐', '𐆜'), + ('𐇐', '𐇼'), + ('𐋡', '𐋻'), + ('\u{1bca0}', '\u{1bca3}'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅦'), + ('𝅪', '\u{1d17a}'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝌀', '𝍖'), + ('𝍠', '𝍸'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝟿'), + ('𞱱', '𞲴'), + ('𞴁', '𞴽'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄀', '🆭'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = + &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿')]; + +pub const CUNEIFORM: &'static [(char, char)] = + &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; + +pub const CYPRIOT: &'static [(char, char)] = + &[('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿')]; + +pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𒾐', '𒿲')]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', '\u{484}'), + ('\u{487}', 'ԯ'), + ('ᲀ', 'ᲈ'), + ('ᴫ', 'ᴫ'), + ('ᵸ', 'ᵸ'), + ('\u{2de0}', '\u{2dff}'), + ('Ꙁ', '\u{a69f}'), + ('\u{fe2e}', '\u{fe2f}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), +]; + +pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('\u{900}', 'ॐ'), + ('\u{955}', '\u{963}'), + ('०', 'ॿ'), + ('\u{a8e0}', '\u{a8ff}'), + ('𑬀', '𑬉'), +]; + +pub const DIVES_AKURU: &'static [(char, char)] = &[ + ('𑤀', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '𑥆'), + ('𑥐', '𑥙'), +]; + +pub const DOGRA: &'static [(char, char)] = &[('𑠀', '𑠻')]; + +pub const DUPLOYAN: &'static [(char, char)] = + &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲟')]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = + &[('𓀀', '\u{13455}')]; + +pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')]; + +pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '፼'), + ('ᎀ', '᎙'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ჿ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('Ⰰ', 'ⱟ'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133c}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), + ('͵', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('΄', '΄'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϡ'), + ('ϰ', 'Ͽ'), + ('ᴦ', 'ᴪ'), + ('ᵝ', 'ᵡ'), + ('ᵦ', 'ᵪ'), + ('ᶿ', 'ᶿ'), + ('ἀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('Ω', 'Ω'), + ('ꭥ', 'ꭥ'), + ('𐅀', '𐆎'), + ('𐆠', '𐆠'), + ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૱'), + ('ૹ', '\u{aff}'), +]; + +pub const GUNJALA_GONDI: &'static [(char, char)] = &[ + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '੶'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('々', '々'), + ('〇', '〇'), + ('〡', '〩'), + ('〸', '〻'), + ('㐀', '䶿'), + ('一', '鿿'), + ('豈', '舘'), + ('並', '龎'), + ('𖿢', '𖿣'), + ('𖿰', '𖿱'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), + ('\u{302e}', '\u{302f}'), + ('ㄱ', 'ㆎ'), + ('㈀', '㈞'), + ('㉠', '㉾'), + ('ꥠ', 'ꥼ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), +]; + +pub const HANIFI_ROHINGYA: &'static [(char, char)] = + &[('𐴀', '\u{10d27}'), ('𐴰', '𐴹')]; + +pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜴')]; + +pub const HATRAN: &'static [(char, char)] = + &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('\u{591}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', '״'), + ('יִ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('𛀁', '𛄟'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = + &[('𐡀', '𐡕'), ('𐡗', '𐡟')]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{485}', '\u{486}'), + ('\u{64b}', '\u{655}'), + ('\u{670}', '\u{670}'), + ('\u{951}', '\u{954}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{302a}', '\u{302d}'), + ('\u{3099}', '\u{309a}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2d}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{1133b}', '\u{1133b}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = + &[('𐭠', '𐭲'), ('𐭸', '𐭿')]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = + &[('𐭀', '𐭕'), ('𐭘', '𐭟')]; + +pub const JAVANESE: &'static [(char, char)] = + &[('\u{a980}', '꧍'), ('꧐', '꧙'), ('꧞', '꧟')]; + +pub const KAITHI: &'static [(char, char)] = + &[('\u{11080}', '\u{110c2}'), ('\u{110cd}', '\u{110cd}')]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('ಀ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('ァ', 'ヺ'), + ('ヽ', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㋐', '㋾'), + ('㌀', '㍗'), + ('ヲ', 'ッ'), + ('ア', 'ン'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛀀'), + ('𛄠', '𛄢'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), +]; + +pub const KAWI: &'static [(char, char)] = + &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')]; + +pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '\u{a92d}'), ('꤯', '꤯')]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '𐩈'), + ('𐩐', '𐩘'), +]; + +pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = + &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')]; + +pub const KHMER: &'static [(char, char)] = + &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')]; + +pub const KHOJKI: &'static [(char, char)] = &[('𑈀', '𑈑'), ('𑈓', '\u{11241}')]; + +pub const KHUDAWADI: &'static [(char, char)] = + &[('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ʸ'), + ('ˠ', 'ˤ'), + ('ᴀ', 'ᴥ'), + ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶾ'), + ('Ḁ', 'ỿ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⱡ', 'Ɀ'), + ('Ꜣ', 'ꞇ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟿ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭤ'), + ('ꭦ', 'ꭩ'), + ('ff', 'st'), + ('A', 'Z'), + ('a', 'z'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), +]; + +pub const LEPCHA: &'static [(char, char)] = + &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥀', '᥀'), + ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = + &[('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), +]; + +pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')]; + +pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; + +pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; + +pub const MAHAJANI: &'static [(char, char)] = &[('𑅐', '𑅶')]; + +pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '൏'), + ('ൔ', '\u{d63}'), + ('൦', 'ൿ'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[('ࡀ', '\u{85b}'), ('࡞', '࡞')]; + +pub const MANICHAEAN: &'static [(char, char)] = + &[('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; + +pub const MARCHEN: &'static [(char, char)] = + &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), +]; + +pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = + &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = + &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = + &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; + +pub const MIAO: &'static [(char, char)] = + &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')]; + +pub const MODI: &'static [(char, char)] = &[('𑘀', '𑙄'), ('𑙐', '𑙙')]; + +pub const MONGOLIAN: &'static [(char, char)] = + &[('᠀', '᠁'), ('᠄', '᠄'), ('᠆', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('𑙠', '𑙬')]; + +pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; + +pub const MULTANI: &'static [(char, char)] = + &[('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; + +pub const MYANMAR: &'static [(char, char)] = + &[('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; + +pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; + +pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')]; + +pub const NANDINAGARI: &'static [(char, char)] = + &[('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤')]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = + &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')]; + +pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')]; + +pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')]; + +pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; + +pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = + &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')]; + +pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')]; + +pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = + &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[('𐍐', '\u{1037a}')]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; + +pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')]; + +pub const OLD_UYGHUR: &'static [(char, char)] = &[('𐽰', '𐾉')]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୷'), +]; + +pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; + +pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; + +pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; + +pub const PHAGS_PA: &'static [(char, char)] = &[('ꡀ', '꡷')]; + +pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = + &[('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; + +pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; + +pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; + +pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; + +pub const SAURASHTRA: &'static [(char, char)] = + &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; + +pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', '𑇟')]; + +pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; + +pub const SIDDHAM: &'static [(char, char)] = + &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; + +pub const SIGNWRITING: &'static [(char, char)] = + &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', '෴'), + ('𑇡', '𑇴'), +]; + +pub const SOGDIAN: &'static [(char, char)] = &[('𐼰', '𐽙')]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; + +pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; + +pub const SUNDANESE: &'static [(char, char)] = + &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '\u{a82c}')]; + +pub const SYRIAC: &'static [(char, char)] = + &[('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ')]; + +pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ')]; + +pub const TAGBANWA: &'static [(char, char)] = + &[('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; + +pub const TAI_LE: &'static [(char, char)] = &[('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; + +pub const TAKRI: &'static [(char, char)] = &[('𑚀', '𑚹'), ('𑛀', '𑛉')]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௺'), + ('𑿀', '𑿱'), + ('𑿿', '𑿿'), +]; + +pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')]; + +pub const TANGUT: &'static [(char, char)] = + &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('౷', '౿'), +]; + +pub const THAANA: &'static [(char, char)] = &[('ހ', 'ޱ')]; + +pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('྾', '࿌'), + ('࿎', '࿔'), + ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = + &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; + +pub const TIRHUTA: &'static [(char, char)] = &[('𑒀', '𑓇'), ('𑓐', '𑓙')]; + +pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')]; + +pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; + +pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; + +pub const VITHKUQI: &'static [(char, char)] = &[ + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), +]; + +pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')]; + +pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; + +pub const YEZIDI: &'static [(char, char)] = + &[('𐺀', '𐺩'), ('\u{10eab}', '𐺭'), ('𐺰', '𐺱')]; + +pub const YI: &'static [(char, char)] = &[('ꀀ', 'ꒌ'), ('꒐', '꓆')]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/script_extension.rs b/vendor/regex-syntax/src/unicode_tables/script_extension.rs new file mode 100644 index 0000000..42625e2 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/script_extension.rs @@ -0,0 +1,1457 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script-extension ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cypro_Minoan", CYPRO_MINOAN), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kawi", KAWI), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nag_Mundari", NAG_MUNDARI), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Old_Uyghur", OLD_UYGHUR), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangsa", TANGSA), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Toto", TOTO), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Vithkuqi", VITHKUQI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = + &[('؟', '؟'), ('ـ', 'ـ'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')]; + +pub const AHOM: &'static [(char, char)] = + &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), + ('؆', '\u{6dc}'), + ('۞', 'ۿ'), + ('ݐ', 'ݿ'), + ('ࡰ', 'ࢎ'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{8ff}'), + ('ﭐ', '﯂'), + ('ﯓ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('﷏', '﷏'), + ('ﷰ', '﷿'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('\u{102e0}', '𐋻'), + ('𐹠', '𐹾'), + ('\u{10efd}', '\u{10eff}'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = + &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; + +pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; + +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')]; + +pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; + +pub const BASSA_VAH: &'static [(char, char)] = + &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')]; + +pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('ঀ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', '\u{9fe}'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '\u{1cd2}'), + ('\u{1cd5}', '\u{1cd6}'), + ('\u{1cd8}', '\u{1cd8}'), + ('᳡', '᳡'), + ('ᳪ', 'ᳪ'), + ('\u{1ced}', '\u{1ced}'), + ('ᳲ', 'ᳲ'), + ('ᳵ', '᳷'), + ('\u{a8f1}', '\u{a8f1}'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = + &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('˪', '˫'), + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('\u{302a}', '\u{302d}'), + ('〰', '〰'), + ('〷', '〷'), + ('・', '・'), + ('ㄅ', 'ㄯ'), + ('ㆠ', 'ㆿ'), + ('﹅', '﹆'), + ('。', '・'), +]; + +pub const BRAHMI: &'static [(char, char)] = + &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')]; + +pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; + +pub const BUGINESE: &'static [(char, char)] = + &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ꧏ', 'ꧏ')]; + +pub const BUHID: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝀ', '\u{1753}')]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = + &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')]; + +pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = + &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; + +pub const CHAKMA: &'static [(char, char)] = + &[('০', '৯'), ('၀', '၉'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')]; + +pub const CHAM: &'static [(char, char)] = + &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; + +pub const CHEROKEE: &'static [(char, char)] = + &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')]; + +pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\0', '@'), + ('[', '`'), + ('{', '©'), + ('«', '¹'), + ('»', '¿'), + ('×', '×'), + ('÷', '÷'), + ('ʹ', '˟'), + ('˥', '˩'), + ('ˬ', '˿'), + ('ʹ', 'ʹ'), + (';', ';'), + ('΅', '΅'), + ('·', '·'), + ('\u{605}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{8e2}', '\u{8e2}'), + ('฿', '฿'), + ('࿕', '࿘'), + ('᛫', '᛭'), + ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{202e}'), + ('‰', '\u{2064}'), + ('\u{2066}', '⁰'), + ('⁴', '⁾'), + ('₀', '₎'), + ('₠', '⃀'), + ('℀', '℥'), + ('℧', '℩'), + ('ℬ', 'ℱ'), + ('ℳ', '⅍'), + ('⅏', '⅟'), + ('↉', '↋'), + ('←', '␦'), + ('⑀', '⑊'), + ('①', '⟿'), + ('⤀', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⸀', '⹂'), + ('⹄', '⹝'), + ('⿰', '⿻'), + ('\u{3000}', '\u{3000}'), + ('〄', '〄'), + ('〒', '〒'), + ('〠', '〠'), + ('〶', '〶'), + ('㉈', '㉟'), + ('㉿', '㉿'), + ('㊱', '㊿'), + ('㋌', '㋏'), + ('㍱', '㍺'), + ('㎀', '㏟'), + ('㏿', '㏿'), + ('䷀', '䷿'), + ('꜈', '꜡'), + ('ꞈ', '꞊'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('︐', '︙'), + ('︰', '﹄'), + ('﹇', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('\u{feff}', '\u{feff}'), + ('!', '@'), + ('[', '`'), + ('{', '⦆'), + ('¢', '₩'), + ('│', '○'), + ('\u{fff9}', '�'), + ('𐆐', '𐆜'), + ('𐇐', '𐇼'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅦'), + ('𝅪', '\u{1d17a}'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝌀', '𝍖'), + ('𝍲', '𝍸'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝟿'), + ('𞱱', '𞲴'), + ('𞴁', '𞴽'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄀', '🆭'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = + &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), ('\u{102e0}', '𐋻')]; + +pub const CUNEIFORM: &'static [(char, char)] = + &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐠿'), +]; + +pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𐄀', '𐄁'), ('𒾐', '𒿲')]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', 'ԯ'), + ('ᲀ', 'ᲈ'), + ('ᴫ', 'ᴫ'), + ('ᵸ', 'ᵸ'), + ('\u{1df8}', '\u{1df8}'), + ('\u{2de0}', '\u{2dff}'), + ('⹃', '⹃'), + ('Ꙁ', '\u{a69f}'), + ('\u{fe2e}', '\u{fe2f}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), +]; + +pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('\u{900}', '\u{952}'), + ('\u{955}', 'ॿ'), + ('\u{1cd0}', 'ᳶ'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{20f0}', '\u{20f0}'), + ('꠰', '꠹'), + ('\u{a8e0}', '\u{a8ff}'), + ('𑬀', '𑬉'), +]; + +pub const DIVES_AKURU: &'static [(char, char)] = &[ + ('𑤀', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '𑥆'), + ('𑥐', '𑥙'), +]; + +pub const DOGRA: &'static [(char, char)] = + &[('।', '९'), ('꠰', '꠹'), ('𑠀', '𑠻')]; + +pub const DUPLOYAN: &'static [(char, char)] = + &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}')]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = + &[('𓀀', '\u{13455}')]; + +pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')]; + +pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '፼'), + ('ᎀ', '᎙'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჿ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('\u{484}', '\u{484}'), + ('\u{487}', '\u{487}'), + ('Ⰰ', 'ⱟ'), + ('⹃', '⹃'), + ('\u{a66f}', '\u{a66f}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('௦', '௳'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '᳓'), + ('ᳲ', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{20f0}', '\u{20f0}'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑿐', '𑿑'), + ('𑿓', '𑿓'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('\u{342}', '\u{342}'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ͳ'), + ('͵', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('΄', '΄'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϡ'), + ('ϰ', 'Ͽ'), + ('ᴦ', 'ᴪ'), + ('ᵝ', 'ᵡ'), + ('ᵦ', 'ᵪ'), + ('ᶿ', '\u{1dc1}'), + ('ἀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('Ω', 'Ω'), + ('ꭥ', 'ꭥ'), + ('𐅀', '𐆎'), + ('𐆠', '𐆠'), + ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૱'), + ('ૹ', '\u{aff}'), + ('꠰', '꠹'), +]; + +pub const GUNJALA_GONDI: &'static [(char, char)] = &[ + ('।', '॥'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '੶'), + ('꠰', '꠹'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('、', '〃'), + ('々', '】'), + ('〓', '〟'), + ('〡', '\u{302d}'), + ('〰', '〰'), + ('〷', '〿'), + ('・', '・'), + ('㆐', '㆟'), + ('㇀', '㇣'), + ('㈠', '㉇'), + ('㊀', '㊰'), + ('㋀', '㋋'), + ('㋿', '㋿'), + ('㍘', '㍰'), + ('㍻', '㍿'), + ('㏠', '㏾'), + ('㐀', '䶿'), + ('一', '鿿'), + ('꜀', '꜇'), + ('豈', '舘'), + ('並', '龎'), + ('﹅', '﹆'), + ('。', '・'), + ('𖿢', '𖿣'), + ('𖿰', '𖿱'), + ('𝍠', '𝍱'), + ('🉐', '🉑'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('\u{302e}', '〰'), + ('〷', '〷'), + ('・', '・'), + ('ㄱ', 'ㆎ'), + ('㈀', '㈞'), + ('㉠', '㉾'), + ('ꥠ', 'ꥼ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('﹅', '﹆'), + ('。', '・'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), +]; + +pub const HANIFI_ROHINGYA: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('ـ', 'ـ'), + ('۔', '۔'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜶')]; + +pub const HATRAN: &'static [(char, char)] = + &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('\u{591}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', '״'), + ('יִ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('〰', '〵'), + ('〷', '〷'), + ('〼', '〽'), + ('ぁ', 'ゖ'), + ('\u{3099}', '゠'), + ('・', 'ー'), + ('﹅', '﹆'), + ('。', '・'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + ('𛀁', '𛄟'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = + &[('𐡀', '𐡕'), ('𐡗', '𐡟')]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('\u{300}', '\u{341}'), + ('\u{343}', '\u{344}'), + ('\u{346}', '\u{362}'), + ('\u{953}', '\u{954}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1dc2}', '\u{1df7}'), + ('\u{1df9}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20ef}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2d}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = + &[('𐭠', '𐭲'), ('𐭸', '𐭿')]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = + &[('𐭀', '𐭕'), ('𐭘', '𐭟')]; + +pub const JAVANESE: &'static [(char, char)] = + &[('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟')]; + +pub const KAITHI: &'static [(char, char)] = &[ + ('०', '९'), + ('꠰', '꠹'), + ('\u{11080}', '\u{110c2}'), + ('\u{110cd}', '\u{110cd}'), +]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('ಀ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '\u{1cd2}'), + ('\u{1cda}', '\u{1cda}'), + ('ᳲ', 'ᳲ'), + ('\u{1cf4}', '\u{1cf4}'), + ('꠰', '꠵'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('〰', '〵'), + ('〷', '〷'), + ('〼', '〽'), + ('\u{3099}', '゜'), + ('゠', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㋐', '㋾'), + ('㌀', '㍗'), + ('﹅', '﹆'), + ('。', '\u{ff9f}'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛀀'), + ('𛄠', '𛄢'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), +]; + +pub const KAWI: &'static [(char, char)] = + &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')]; + +pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '꤯')]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '𐩈'), + ('𐩐', '𐩘'), +]; + +pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = + &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')]; + +pub const KHMER: &'static [(char, char)] = + &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')]; + +pub const KHOJKI: &'static [(char, char)] = + &[('૦', '૯'), ('꠰', '꠹'), ('𑈀', '𑈑'), ('𑈓', '\u{11241}')]; + +pub const KHUDAWADI: &'static [(char, char)] = + &[('।', '॥'), ('꠰', '꠹'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ʸ'), + ('ˠ', 'ˤ'), + ('\u{363}', '\u{36f}'), + ('\u{485}', '\u{486}'), + ('\u{951}', '\u{952}'), + ('჻', '჻'), + ('ᴀ', 'ᴥ'), + ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶾ'), + ('Ḁ', 'ỿ'), + ('\u{202f}', '\u{202f}'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20f0}', '\u{20f0}'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⱡ', 'Ɀ'), + ('꜀', '꜇'), + ('Ꜣ', 'ꞇ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟿ'), + ('꤮', '꤮'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭤ'), + ('ꭦ', 'ꭩ'), + ('ff', 'st'), + ('A', 'Z'), + ('a', 'z'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), +]; + +pub const LEPCHA: &'static [(char, char)] = + &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('॥', '॥'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥀', '᥀'), + ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = + &[('𐄇', '𐄳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), +]; + +pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')]; + +pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; + +pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; + +pub const MAHAJANI: &'static [(char, char)] = + &[('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶')]; + +pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '൏'), + ('ൔ', '\u{d63}'), + ('൦', 'ൿ'), + ('\u{1cda}', '\u{1cda}'), + ('꠰', '꠲'), +]; + +pub const MANDAIC: &'static [(char, char)] = + &[('ـ', 'ـ'), ('ࡀ', '\u{85b}'), ('࡞', '࡞')]; + +pub const MANICHAEAN: &'static [(char, char)] = + &[('ـ', 'ـ'), ('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; + +pub const MARCHEN: &'static [(char, char)] = + &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('।', '॥'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), +]; + +pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = + &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = + &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = + &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; + +pub const MIAO: &'static [(char, char)] = + &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')]; + +pub const MODI: &'static [(char, char)] = + &[('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙')]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('᠀', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('\u{202f}', '\u{202f}'), + ('𑙠', '𑙬'), +]; + +pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; + +pub const MULTANI: &'static [(char, char)] = + &[('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; + +pub const MYANMAR: &'static [(char, char)] = + &[('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; + +pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; + +pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')]; + +pub const NANDINAGARI: &'static [(char, char)] = &[ + ('।', '॥'), + ('೦', '೯'), + ('ᳩ', 'ᳩ'), + ('ᳲ', 'ᳲ'), + ('ᳺ', 'ᳺ'), + ('꠰', '꠵'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧤'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = + &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')]; + +pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')]; + +pub const NKO: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('߀', 'ߺ'), + ('\u{7fd}', '߿'), + ('﴾', '﴿'), +]; + +pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; + +pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = + &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')]; + +pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')]; + +pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = + &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; + +pub const OLD_PERMIC: &'static [(char, char)] = + &[('\u{483}', '\u{483}'), ('𐍐', '\u{1037a}')]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; + +pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')]; + +pub const OLD_UYGHUR: &'static [(char, char)] = + &[('ـ', 'ـ'), ('𐫲', '𐫲'), ('𐽰', '𐾉')]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୷'), + ('\u{1cda}', '\u{1cda}'), + ('ᳲ', 'ᳲ'), +]; + +pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; + +pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; + +pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; + +pub const PHAGS_PA: &'static [(char, char)] = + &[('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷')]; + +pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = + &[('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; + +pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; + +pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; + +pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; + +pub const SAURASHTRA: &'static [(char, char)] = + &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('\u{951}', '\u{951}'), + ('\u{1cd7}', '\u{1cd7}'), + ('\u{1cd9}', '\u{1cd9}'), + ('\u{1cdc}', '\u{1cdd}'), + ('\u{1ce0}', '\u{1ce0}'), + ('\u{11180}', '𑇟'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; + +pub const SIDDHAM: &'static [(char, char)] = + &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; + +pub const SIGNWRITING: &'static [(char, char)] = + &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('।', '॥'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', '෴'), + ('𑇡', '𑇴'), +]; + +pub const SOGDIAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐼰', '𐽙')]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; + +pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; + +pub const SUNDANESE: &'static [(char, char)] = + &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = + &[('।', '॥'), ('০', '৯'), ('ꠀ', '\u{a82c}')]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '\u{61c}'), + ('؟', '؟'), + ('ـ', 'ـ'), + ('\u{64b}', '\u{655}'), + ('\u{670}', '\u{670}'), + ('܀', '܍'), + ('\u{70f}', '\u{74a}'), + ('ݍ', 'ݏ'), + ('ࡠ', 'ࡪ'), + ('\u{1df8}', '\u{1df8}'), + ('\u{1dfa}', '\u{1dfa}'), +]; + +pub const TAGALOG: &'static [(char, char)] = + &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ'), ('᜵', '᜶')]; + +pub const TAGBANWA: &'static [(char, char)] = + &[('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; + +pub const TAI_LE: &'static [(char, char)] = + &[('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; + +pub const TAKRI: &'static [(char, char)] = + &[('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚹'), ('𑛀', '𑛉')]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௺'), + ('\u{1cda}', '\u{1cda}'), + ('ꣳ', 'ꣳ'), + ('\u{11301}', '\u{11301}'), + ('𑌃', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('𑿀', '𑿱'), + ('𑿿', '𑿿'), +]; + +pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')]; + +pub const TANGUT: &'static [(char, char)] = + &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('౷', '౿'), + ('\u{1cda}', '\u{1cda}'), + ('ᳲ', 'ᳲ'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '\u{61c}'), + ('؟', '؟'), + ('٠', '٩'), + ('ހ', 'ޱ'), + ('ﷲ', 'ﷲ'), + ('﷽', '﷽'), +]; + +pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('྾', '࿌'), + ('࿎', '࿔'), + ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = + &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('ᳲ', 'ᳲ'), + ('꠰', '꠹'), + ('𑒀', '𑓇'), + ('𑓐', '𑓙'), +]; + +pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')]; + +pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; + +pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; + +pub const VITHKUQI: &'static [(char, char)] = &[ + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), +]; + +pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')]; + +pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; + +pub const YEZIDI: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('٠', '٩'), + ('𐺀', '𐺩'), + ('\u{10eab}', '𐺭'), + ('𐺰', '𐺱'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('、', '。'), + ('〈', '】'), + ('〔', '〛'), + ('・', '・'), + ('ꀀ', 'ꒌ'), + ('꒐', '꓆'), + ('。', '・'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/sentence_break.rs b/vendor/regex-syntax/src/unicode_tables/sentence_break.rs new file mode 100644 index 0000000..2434873 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/sentence_break.rs @@ -0,0 +1,2477 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate sentence-break ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ATerm", ATERM), + ("CR", CR), + ("Close", CLOSE), + ("Extend", EXTEND), + ("Format", FORMAT), + ("LF", LF), + ("Lower", LOWER), + ("Numeric", NUMERIC), + ("OLetter", OLETTER), + ("SContinue", SCONTINUE), + ("STerm", STERM), + ("Sep", SEP), + ("Sp", SP), + ("Upper", UPPER), +]; + +pub const ATERM: &'static [(char, char)] = + &[('.', '.'), ('․', '․'), ('﹒', '﹒'), ('.', '.')]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const CLOSE: &'static [(char, char)] = &[ + ('"', '"'), + ('\'', ')'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('«', '«'), + ('»', '»'), + ('༺', '༽'), + ('᚛', '᚜'), + ('‘', '‟'), + ('‹', '›'), + ('⁅', '⁆'), + ('⁽', '⁾'), + ('₍', '₎'), + ('⌈', '⌋'), + ('〈', '〉'), + ('❛', '❠'), + ('❨', '❵'), + ('⟅', '⟆'), + ('⟦', '⟯'), + ('⦃', '⦘'), + ('⧘', '⧛'), + ('⧼', '⧽'), + ('⸀', '⸍'), + ('⸜', '⸝'), + ('⸠', '⸩'), + ('⹂', '⹂'), + ('⹕', '⹜'), + ('〈', '】'), + ('〔', '〛'), + ('〝', '〟'), + ('﴾', '﴿'), + ('︗', '︘'), + ('︵', '﹄'), + ('﹇', '﹈'), + ('﹙', '﹞'), + ('(', ')'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('⦅', '⦆'), + ('「', '」'), + ('🙶', '🙸'), +]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', 'ः'), + ('\u{93a}', '\u{93c}'), + ('ा', 'ॏ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('\u{abc}', '\u{abc}'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('\u{cbc}', '\u{cbc}'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('ါ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '᜕'), + ('\u{1732}', '᜴'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b34}', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1bad}'), + ('\u{1be6}', '᯳'), + ('ᰤ', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꢀ', 'ꢁ'), + ('ꢴ', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '꥓'), + ('\u{a980}', 'ꦃ'), + ('\u{a9b3}', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', '\u{aaf6}'), + ('ꯣ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '𑂂'), + ('𑂰', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('𑅅', '𑅆'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑇀'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐵', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('𑠬', '\u{1183a}'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193e}'), + ('𑥀', '𑥀'), + ('𑥂', '\u{11943}'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '\u{11d97}'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200b}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const LOWER: &'static [(char, char)] = &[ + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ĸ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), + ('ƽ', 'ƿ'), + ('dž', 'dž'), + ('lj', 'lj'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȹ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ʓ'), + ('ʕ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϼ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ՠ', 'ֈ'), + ('ჼ', 'ჼ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᴀ', 'ᶿ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℊ', 'ℊ'), + ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), + ('ℴ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), + ('ⱳ', 'ⱴ'), + ('ⱶ', 'ⱽ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚝ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝸ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞯ', 'ꞯ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟕ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟲ', 'ꟴ'), + ('ꟶ', 'ꟶ'), + ('ꟸ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𝐚', '𝐳'), + ('𝑎', '𝑔'), + ('𝑖', '𝑧'), + ('𝒂', '𝒛'), + ('𝒶', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝓏'), + ('𝓪', '𝔃'), + ('𝔞', '𝔷'), + ('𝕒', '𝕫'), + ('𝖆', '𝖟'), + ('𝖺', '𝗓'), + ('𝗮', '𝘇'), + ('𝘢', '𝘻'), + ('𝙖', '𝙯'), + ('𝚊', '𝚥'), + ('𝛂', '𝛚'), + ('𝛜', '𝛡'), + ('𝛼', '𝜔'), + ('𝜖', '𝜛'), + ('𝜶', '𝝎'), + ('𝝐', '𝝕'), + ('𝝰', '𝞈'), + ('𝞊', '𝞏'), + ('𝞪', '𝟂'), + ('𝟄', '𝟉'), + ('𝟋', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞤢', '𞥃'), +]; + +pub const NUMERIC: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('٫', '٬'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; + +pub const OLETTER: &'static [(char, char)] = &[ + ('ƻ', 'ƻ'), + ('ǀ', 'ǃ'), + ('ʔ', 'ʔ'), + ('ʹ', 'ʿ'), + ('ˆ', 'ˑ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('ʹ', 'ʹ'), + ('ՙ', 'ՙ'), + ('א', 'ת'), + ('ׯ', '׳'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('ა', 'ჺ'), + ('ჽ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ℵ', 'ℸ'), + ('ↀ', 'ↂ'), + ('ↅ', 'ↈ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('ꙮ', 'ꙮ'), + ('ꙿ', 'ꙿ'), + ('ꚠ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('ꞈ', 'ꞈ'), + ('ꞏ', 'ꞏ'), + ('ꟷ', 'ꟷ'), + ('ꟻ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꯀ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('ヲ', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐑐', '𐒝'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞁', '𐞂'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝼊', '𝼊'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const SCONTINUE: &'static [(char, char)] = &[ + (',', '-'), + (':', ':'), + ('՝', '՝'), + ('،', '؍'), + ('߸', '߸'), + ('᠂', '᠂'), + ('᠈', '᠈'), + ('–', '—'), + ('、', '、'), + ('︐', '︑'), + ('︓', '︓'), + ('︱', '︲'), + ('﹐', '﹑'), + ('﹕', '﹕'), + ('﹘', '﹘'), + ('﹣', '﹣'), + (',', '-'), + (':', ':'), + ('、', '、'), +]; + +pub const STERM: &'static [(char, char)] = &[ + ('!', '!'), + ('?', '?'), + ('։', '։'), + ('؝', '؟'), + ('۔', '۔'), + ('܀', '܂'), + ('߹', '߹'), + ('࠷', '࠷'), + ('࠹', '࠹'), + ('࠽', '࠾'), + ('।', '॥'), + ('၊', '။'), + ('።', '።'), + ('፧', '፨'), + ('᙮', '᙮'), + ('᜵', '᜶'), + ('᠃', '᠃'), + ('᠉', '᠉'), + ('᥄', '᥅'), + ('᪨', '᪫'), + ('᭚', '᭛'), + ('᭞', '᭟'), + ('᭽', '᭾'), + ('᰻', '᰼'), + ('᱾', '᱿'), + ('‼', '‽'), + ('⁇', '⁉'), + ('⸮', '⸮'), + ('⸼', '⸼'), + ('⹓', '⹔'), + ('。', '。'), + ('꓿', '꓿'), + ('꘎', '꘏'), + ('꛳', '꛳'), + ('꛷', '꛷'), + ('꡶', '꡷'), + ('꣎', '꣏'), + ('꤯', '꤯'), + ('꧈', '꧉'), + ('꩝', '꩟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﹖', '﹗'), + ('!', '!'), + ('?', '?'), + ('。', '。'), + ('𐩖', '𐩗'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁈'), + ('𑂾', '𑃁'), + ('𑅁', '𑅃'), + ('𑇅', '𑇆'), + ('𑇍', '𑇍'), + ('𑇞', '𑇟'), + ('𑈸', '𑈹'), + ('𑈻', '𑈼'), + ('𑊩', '𑊩'), + ('𑑋', '𑑌'), + ('𑗂', '𑗃'), + ('𑗉', '𑗗'), + ('𑙁', '𑙂'), + ('𑜼', '𑜾'), + ('𑥄', '𑥄'), + ('𑥆', '𑥆'), + ('𑩂', '𑩃'), + ('𑪛', '𑪜'), + ('𑱁', '𑱂'), + ('𑻷', '𑻸'), + ('𑽃', '𑽄'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬸'), + ('𖭄', '𖭄'), + ('𖺘', '𖺘'), + ('𛲟', '𛲟'), + ('𝪈', '𝪈'), +]; + +pub const SEP: &'static [(char, char)] = + &[('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; + +pub const SP: &'static [(char, char)] = &[ + ('\t', '\t'), + ('\u{b}', '\u{c}'), + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const UPPER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'Dž'), + ('LJ', 'Lj'), + ('NJ', 'Nj'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), + ('ᾨ', 'ᾯ'), + ('Ᾰ', 'ᾼ'), + ('Ὲ', 'ῌ'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'ῼ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℋ', 'ℍ'), + ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℰ', 'ℳ'), + ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𝐀', '𝐙'), + ('𝐴', '𝑍'), + ('𝑨', '𝒁'), + ('𝒜', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒵'), + ('𝓐', '𝓩'), + ('𝔄', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔸', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕬', '𝖅'), + ('𝖠', '𝖹'), + ('𝗔', '𝗭'), + ('𝘈', '𝘡'), + ('𝘼', '𝙕'), + ('𝙰', '𝚉'), + ('𝚨', '𝛀'), + ('𝛢', '𝛺'), + ('𝜜', '𝜴'), + ('𝝖', '𝝮'), + ('𝞐', '𝞨'), + ('𝟊', '𝟊'), + ('𞤀', '𞤡'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/word_break.rs b/vendor/regex-syntax/src/unicode_tables/word_break.rs new file mode 100644 index 0000000..c071495 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/word_break.rs @@ -0,0 +1,1120 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate word-break ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ALetter", ALETTER), + ("CR", CR), + ("Double_Quote", DOUBLE_QUOTE), + ("Extend", EXTEND), + ("ExtendNumLet", EXTENDNUMLET), + ("Format", FORMAT), + ("Hebrew_Letter", HEBREW_LETTER), + ("Katakana", KATAKANA), + ("LF", LF), + ("MidLetter", MIDLETTER), + ("MidNum", MIDNUM), + ("MidNumLet", MIDNUMLET), + ("Newline", NEWLINE), + ("Numeric", NUMERIC), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Single_Quote", SINGLE_QUOTE), + ("WSegSpace", WSEGSPACE), + ("ZWJ", ZWJ), +]; + +pub const ALETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', '˗'), + ('˞', '˿'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', '՜'), + ('՞', '՞'), + ('ՠ', 'ֈ'), + ('֊', '֊'), + ('׳', '׳'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᨀ', 'ᨖ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ⸯ', 'ⸯ'), + ('々', '々'), + ('〻', '〼'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ꀀ', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('꜈', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('ﭐ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const DOUBLE_QUOTE: &'static [(char, char)] = &[('"', '"')]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', 'ः'), + ('\u{93a}', '\u{93c}'), + ('ा', 'ॏ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('\u{abc}', '\u{abc}'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('\u{cbc}', '\u{cbc}'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('ါ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '᜕'), + ('\u{1732}', '᜴'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b34}', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1bad}'), + ('\u{1be6}', '᯳'), + ('ᰤ', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꢀ', 'ꢁ'), + ('ꢴ', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '꥓'), + ('\u{a980}', 'ꦃ'), + ('\u{a9b3}', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', '\u{aaf6}'), + ('ꯣ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '𑂂'), + ('𑂰', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('𑅅', '𑅆'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑇀'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐵', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('𑠬', '\u{1183a}'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193e}'), + ('𑥀', '𑥀'), + ('𑥂', '\u{11943}'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '\u{11d97}'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('🏻', '🏿'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const EXTENDNUMLET: &'static [(char, char)] = &[ + ('_', '_'), + ('\u{202f}', '\u{202f}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('︳', '︴'), + ('﹍', '﹏'), + ('_', '_'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const HEBREW_LETTER: &'static [(char, char)] = &[ + ('א', 'ת'), + ('ׯ', 'ײ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﭏ'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('〱', '〵'), + ('゛', '゜'), + ('゠', 'ヺ'), + ('ー', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㋐', '㋾'), + ('㌀', '㍗'), + ('ヲ', 'ン'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛀀'), + ('𛄠', '𛄢'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), +]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const MIDLETTER: &'static [(char, char)] = &[ + (':', ':'), + ('·', '·'), + ('·', '·'), + ('՟', '՟'), + ('״', '״'), + ('‧', '‧'), + ('︓', '︓'), + ('﹕', '﹕'), + (':', ':'), +]; + +pub const MIDNUM: &'static [(char, char)] = &[ + (',', ','), + (';', ';'), + (';', ';'), + ('։', '։'), + ('،', '؍'), + ('٬', '٬'), + ('߸', '߸'), + ('⁄', '⁄'), + ('︐', '︐'), + ('︔', '︔'), + ('﹐', '﹐'), + ('﹔', '﹔'), + (',', ','), + (';', ';'), +]; + +pub const MIDNUMLET: &'static [(char, char)] = &[ + ('.', '.'), + ('‘', '’'), + ('․', '․'), + ('﹒', '﹒'), + (''', '''), + ('.', '.'), +]; + +pub const NEWLINE: &'static [(char, char)] = + &[('\u{b}', '\u{c}'), ('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; + +pub const NUMERIC: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('٫', '٫'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; + +pub const SINGLE_QUOTE: &'static [(char, char)] = &[('\'', '\'')]; + +pub const WSEGSPACE: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{2006}'), + ('\u{2008}', '\u{200a}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; diff --git a/vendor/regex-syntax/src/utf8.rs b/vendor/regex-syntax/src/utf8.rs new file mode 100644 index 0000000..e13b55a --- /dev/null +++ b/vendor/regex-syntax/src/utf8.rs @@ -0,0 +1,592 @@ +/*! +Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. + +This is sub-module is useful for constructing byte based automatons that need +to embed UTF-8 decoding. The most common use of this module is in conjunction +with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type. + +See the documentation on the `Utf8Sequences` iterator for more details and +an example. + +# Wait, what is this? + +This is simplest to explain with an example. Let's say you wanted to test +whether a particular byte sequence was a Cyrillic character. One possible +scalar value range is `[0400-04FF]`. The set of allowed bytes for this +range can be expressed as a sequence of byte ranges: + +```text +[D0-D3][80-BF] +``` + +This is simple enough: simply encode the boundaries, `0400` encodes to +`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each +corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. + +However, what if you wanted to add the Cyrillic Supplementary characters to +your range? Your range might then become `[0400-052F]`. The same procedure +as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges +you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, +this isn't quite correct because this range doesn't capture many characters, +for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). + +Instead, you need multiple sequences of byte ranges: + +```text +[D0-D3][80-BF] # matches codepoints 0400-04FF +[D4][80-AF] # matches codepoints 0500-052F +``` + +This gets even more complicated if you want bigger ranges, particularly if +they naively contain surrogate codepoints. For example, the sequence of byte +ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: + +```text +[0-7F] +[C2-DF][80-BF] +[E0][A0-BF][80-BF] +[E1-EC][80-BF][80-BF] +[ED][80-9F][80-BF] +[EE-EF][80-BF][80-BF] +``` + +Note that the byte ranges above will *not* match any erroneous encoding of +UTF-8, including encodings of surrogate codepoints. + +And, of course, for all of Unicode (`[000000-10FFFF]`): + +```text +[0-7F] +[C2-DF][80-BF] +[E0][A0-BF][80-BF] +[E1-EC][80-BF][80-BF] +[ED][80-9F][80-BF] +[EE-EF][80-BF][80-BF] +[F0][90-BF][80-BF][80-BF] +[F1-F3][80-BF][80-BF][80-BF] +[F4][80-8F][80-BF][80-BF] +``` + +This module automates the process of creating these byte ranges from ranges of +Unicode scalar values. + +# Lineage + +I got the idea and general implementation strategy from Russ Cox in his +[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. +Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). +I also got the idea from +[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), +which uses it for executing automata on their term index. +*/ + +use core::{char, fmt, iter::FusedIterator, slice}; + +use alloc::{vec, vec::Vec}; + +const MAX_UTF8_BYTES: usize = 4; + +/// Utf8Sequence represents a sequence of byte ranges. +/// +/// To match a Utf8Sequence, a candidate byte sequence must match each +/// successive range. +/// +/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte +/// sequence `\xDD\x61` would not match because `0x61 < 0x80`. +#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] +pub enum Utf8Sequence { + /// One byte range. + One(Utf8Range), + /// Two successive byte ranges. + Two([Utf8Range; 2]), + /// Three successive byte ranges. + Three([Utf8Range; 3]), + /// Four successive byte ranges. + Four([Utf8Range; 4]), +} + +impl Utf8Sequence { + /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value + /// range. + /// + /// This assumes that `start` and `end` have the same length. + fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { + assert_eq!(start.len(), end.len()); + match start.len() { + 2 => Utf8Sequence::Two([ + Utf8Range::new(start[0], end[0]), + Utf8Range::new(start[1], end[1]), + ]), + 3 => Utf8Sequence::Three([ + Utf8Range::new(start[0], end[0]), + Utf8Range::new(start[1], end[1]), + Utf8Range::new(start[2], end[2]), + ]), + 4 => Utf8Sequence::Four([ + Utf8Range::new(start[0], end[0]), + Utf8Range::new(start[1], end[1]), + Utf8Range::new(start[2], end[2]), + Utf8Range::new(start[3], end[3]), + ]), + n => unreachable!("invalid encoded length: {}", n), + } + } + + /// Returns the underlying sequence of byte ranges as a slice. + pub fn as_slice(&self) -> &[Utf8Range] { + use self::Utf8Sequence::*; + match *self { + One(ref r) => slice::from_ref(r), + Two(ref r) => &r[..], + Three(ref r) => &r[..], + Four(ref r) => &r[..], + } + } + + /// Returns the number of byte ranges in this sequence. + /// + /// The length is guaranteed to be in the closed interval `[1, 4]`. + pub fn len(&self) -> usize { + self.as_slice().len() + } + + /// Reverses the ranges in this sequence. + /// + /// For example, if this corresponds to the following sequence: + /// + /// ```text + /// [D0-D3][80-BF] + /// ``` + /// + /// Then after reversal, it will be + /// + /// ```text + /// [80-BF][D0-D3] + /// ``` + /// + /// This is useful when one is constructing a UTF-8 automaton to match + /// character classes in reverse. + pub fn reverse(&mut self) { + match *self { + Utf8Sequence::One(_) => {} + Utf8Sequence::Two(ref mut x) => x.reverse(), + Utf8Sequence::Three(ref mut x) => x.reverse(), + Utf8Sequence::Four(ref mut x) => x.reverse(), + } + } + + /// Returns true if and only if a prefix of `bytes` matches this sequence + /// of byte ranges. + pub fn matches(&self, bytes: &[u8]) -> bool { + if bytes.len() < self.len() { + return false; + } + for (&b, r) in bytes.iter().zip(self) { + if !r.matches(b) { + return false; + } + } + true + } +} + +impl<'a> IntoIterator for &'a Utf8Sequence { + type IntoIter = slice::Iter<'a, Utf8Range>; + type Item = &'a Utf8Range; + + fn into_iter(self) -> Self::IntoIter { + self.as_slice().iter() + } +} + +impl fmt::Debug for Utf8Sequence { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::Utf8Sequence::*; + match *self { + One(ref r) => write!(f, "{:?}", r), + Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]), + Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]), + Four(ref r) => { + write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3]) + } + } + } +} + +/// A single inclusive range of UTF-8 bytes. +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct Utf8Range { + /// Start of byte range (inclusive). + pub start: u8, + /// End of byte range (inclusive). + pub end: u8, +} + +impl Utf8Range { + fn new(start: u8, end: u8) -> Self { + Utf8Range { start, end } + } + + /// Returns true if and only if the given byte is in this range. + pub fn matches(&self, b: u8) -> bool { + self.start <= b && b <= self.end + } +} + +impl fmt::Debug for Utf8Range { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.start == self.end { + write!(f, "[{:X}]", self.start) + } else { + write!(f, "[{:X}-{:X}]", self.start, self.end) + } + } +} + +/// An iterator over ranges of matching UTF-8 byte sequences. +/// +/// The iteration represents an alternation of comprehensive byte sequences +/// that match precisely the set of UTF-8 encoded scalar values. +/// +/// A byte sequence corresponds to one of the scalar values in the range given +/// if and only if it completely matches exactly one of the sequences of byte +/// ranges produced by this iterator. +/// +/// Each sequence of byte ranges matches a unique set of bytes. That is, no two +/// sequences will match the same bytes. +/// +/// # Example +/// +/// This shows how to match an arbitrary byte sequence against a range of +/// scalar values. +/// +/// ```rust +/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence}; +/// +/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { +/// for range in seqs { +/// if range.matches(bytes) { +/// return true; +/// } +/// } +/// false +/// } +/// +/// // Test the basic multilingual plane. +/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect(); +/// +/// // UTF-8 encoding of 'a'. +/// assert!(matches(&seqs, &[0x61])); +/// // UTF-8 encoding of '☃' (`\u{2603}`). +/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); +/// // UTF-8 encoding of `\u{10348}` (outside the BMP). +/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); +/// // Tries to match against a UTF-8 encoding of a surrogate codepoint, +/// // which is invalid UTF-8, and therefore fails, despite the fact that +/// // the corresponding codepoint (0xD800) falls in the range given. +/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); +/// // And fails against plain old invalid UTF-8. +/// assert!(!matches(&seqs, &[0xFF, 0xFF])); +/// ``` +/// +/// If this example seems circuitous, that's because it is! It's meant to be +/// illustrative. In practice, you could just try to decode your byte sequence +/// and compare it with the scalar value range directly. However, this is not +/// always possible (for example, in a byte based automaton). +#[derive(Debug)] +pub struct Utf8Sequences { + range_stack: Vec, +} + +impl Utf8Sequences { + /// Create a new iterator over UTF-8 byte ranges for the scalar value range + /// given. + pub fn new(start: char, end: char) -> Self { + let mut it = Utf8Sequences { range_stack: vec![] }; + it.push(u32::from(start), u32::from(end)); + it + } + + /// reset resets the scalar value range. + /// Any existing state is cleared, but resources may be reused. + /// + /// N.B. Benchmarks say that this method is dubious. + #[doc(hidden)] + pub fn reset(&mut self, start: char, end: char) { + self.range_stack.clear(); + self.push(u32::from(start), u32::from(end)); + } + + fn push(&mut self, start: u32, end: u32) { + self.range_stack.push(ScalarRange { start, end }); + } +} + +struct ScalarRange { + start: u32, + end: u32, +} + +impl fmt::Debug for ScalarRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "ScalarRange({:X}, {:X})", self.start, self.end) + } +} + +impl Iterator for Utf8Sequences { + type Item = Utf8Sequence; + + fn next(&mut self) -> Option { + 'TOP: while let Some(mut r) = self.range_stack.pop() { + 'INNER: loop { + if let Some((r1, r2)) = r.split() { + self.push(r2.start, r2.end); + r.start = r1.start; + r.end = r1.end; + continue 'INNER; + } + if !r.is_valid() { + continue 'TOP; + } + for i in 1..MAX_UTF8_BYTES { + let max = max_scalar_value(i); + if r.start <= max && max < r.end { + self.push(max + 1, r.end); + r.end = max; + continue 'INNER; + } + } + if let Some(ascii_range) = r.as_ascii() { + return Some(Utf8Sequence::One(ascii_range)); + } + for i in 1..MAX_UTF8_BYTES { + let m = (1 << (6 * i)) - 1; + if (r.start & !m) != (r.end & !m) { + if (r.start & m) != 0 { + self.push((r.start | m) + 1, r.end); + r.end = r.start | m; + continue 'INNER; + } + if (r.end & m) != m { + self.push(r.end & !m, r.end); + r.end = (r.end & !m) - 1; + continue 'INNER; + } + } + } + let mut start = [0; MAX_UTF8_BYTES]; + let mut end = [0; MAX_UTF8_BYTES]; + let n = r.encode(&mut start, &mut end); + return Some(Utf8Sequence::from_encoded_range( + &start[0..n], + &end[0..n], + )); + } + } + None + } +} + +impl FusedIterator for Utf8Sequences {} + +impl ScalarRange { + /// split splits this range if it overlaps with a surrogate codepoint. + /// + /// Either or both ranges may be invalid. + fn split(&self) -> Option<(ScalarRange, ScalarRange)> { + if self.start < 0xE000 && self.end > 0xD7FF { + Some(( + ScalarRange { start: self.start, end: 0xD7FF }, + ScalarRange { start: 0xE000, end: self.end }, + )) + } else { + None + } + } + + /// is_valid returns true if and only if start <= end. + fn is_valid(&self) -> bool { + self.start <= self.end + } + + /// as_ascii returns this range as a Utf8Range if and only if all scalar + /// values in this range can be encoded as a single byte. + fn as_ascii(&self) -> Option { + if self.is_ascii() { + let start = u8::try_from(self.start).unwrap(); + let end = u8::try_from(self.end).unwrap(); + Some(Utf8Range::new(start, end)) + } else { + None + } + } + + /// is_ascii returns true if the range is ASCII only (i.e., takes a single + /// byte to encode any scalar value). + fn is_ascii(&self) -> bool { + self.is_valid() && self.end <= 0x7f + } + + /// encode writes the UTF-8 encoding of the start and end of this range + /// to the corresponding destination slices, and returns the number of + /// bytes written. + /// + /// The slices should have room for at least `MAX_UTF8_BYTES`. + fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize { + let cs = char::from_u32(self.start).unwrap(); + let ce = char::from_u32(self.end).unwrap(); + let ss = cs.encode_utf8(start); + let se = ce.encode_utf8(end); + assert_eq!(ss.len(), se.len()); + ss.len() + } +} + +fn max_scalar_value(nbytes: usize) -> u32 { + match nbytes { + 1 => 0x007F, + 2 => 0x07FF, + 3 => 0xFFFF, + 4 => 0x0010_FFFF, + _ => unreachable!("invalid UTF-8 byte sequence size"), + } +} + +#[cfg(test)] +mod tests { + use core::char; + + use alloc::{vec, vec::Vec}; + + use crate::utf8::{Utf8Range, Utf8Sequences}; + + fn rutf8(s: u8, e: u8) -> Utf8Range { + Utf8Range::new(s, e) + } + + fn never_accepts_surrogate_codepoints(start: char, end: char) { + for cp in 0xD800..0xE000 { + let buf = encode_surrogate(cp); + for r in Utf8Sequences::new(start, end) { + if r.matches(&buf) { + panic!( + "Sequence ({:X}, {:X}) contains range {:?}, \ + which matches surrogate code point {:X} \ + with encoded bytes {:?}", + u32::from(start), + u32::from(end), + r, + cp, + buf, + ); + } + } + } + } + + #[test] + fn codepoints_no_surrogates() { + never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}'); + never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}'); + never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}'); + never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}'); + never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}'); + } + + #[test] + fn single_codepoint_one_sequence() { + // Tests that every range of scalar values that contains a single + // scalar value is recognized by one sequence of byte ranges. + for i in 0x0..=0x0010_FFFF { + let c = match char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let seqs: Vec<_> = Utf8Sequences::new(c, c).collect(); + assert_eq!(seqs.len(), 1); + } + } + + #[test] + fn bmp() { + use crate::utf8::Utf8Sequence::*; + + let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::>(); + assert_eq!( + seqs, + vec![ + One(rutf8(0x0, 0x7F)), + Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]), + Three([ + rutf8(0xE0, 0xE0), + rutf8(0xA0, 0xBF), + rutf8(0x80, 0xBF) + ]), + Three([ + rutf8(0xE1, 0xEC), + rutf8(0x80, 0xBF), + rutf8(0x80, 0xBF) + ]), + Three([ + rutf8(0xED, 0xED), + rutf8(0x80, 0x9F), + rutf8(0x80, 0xBF) + ]), + Three([ + rutf8(0xEE, 0xEF), + rutf8(0x80, 0xBF), + rutf8(0x80, 0xBF) + ]), + ] + ); + } + + #[test] + fn reverse() { + use crate::utf8::Utf8Sequence::*; + + let mut s = One(rutf8(0xA, 0xB)); + s.reverse(); + assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]); + + let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]); + s.reverse(); + assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]); + + let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]); + s.reverse(); + assert_eq!( + s.as_slice(), + &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)] + ); + + let mut s = Four([ + rutf8(0xA, 0xB), + rutf8(0xB, 0xC), + rutf8(0xC, 0xD), + rutf8(0xD, 0xE), + ]); + s.reverse(); + assert_eq!( + s.as_slice(), + &[ + rutf8(0xD, 0xE), + rutf8(0xC, 0xD), + rutf8(0xB, 0xC), + rutf8(0xA, 0xB) + ] + ); + } + + fn encode_surrogate(cp: u32) -> [u8; 3] { + const TAG_CONT: u8 = 0b1000_0000; + const TAG_THREE_B: u8 = 0b1110_0000; + + assert!(0xD800 <= cp && cp < 0xE000); + let mut dst = [0; 3]; + dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B; + dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT; + dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT; + dst + } +} diff --git a/vendor/regex-syntax/test b/vendor/regex-syntax/test new file mode 100755 index 0000000..8626c3b --- /dev/null +++ b/vendor/regex-syntax/test @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +# cd to the directory containing this crate's Cargo.toml so that we don't need +# to pass --manifest-path to every `cargo` command. +cd "$(dirname "$0")" + +# This is a convenience script for running a broad swath of the syntax tests. +echo "===== DEFAULT FEATURES ===" +cargo test + +features=( + std + unicode + unicode-age + unicode-bool + unicode-case + unicode-gencat + unicode-perl + unicode-script + unicode-segment +) +for f in "${features[@]}"; do + echo "=== FEATURE: $f ===" + # We only run library tests because I couldn't figure out how to easily + # make doc tests run in 'no_std' mode. In particular, without the Error + # trait, using '?' in doc tests seems tricky. + cargo test --no-default-features --lib --features "$f" +done -- cgit v1.2.3