summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--third_party/rust/regex-syntax/.cargo-checksum.json1
-rw-r--r--third_party/rust/regex-syntax/Cargo.toml41
-rw-r--r--third_party/rust/regex-syntax/LICENSE-APACHE201
-rw-r--r--third_party/rust/regex-syntax/LICENSE-MIT25
-rw-r--r--third_party/rust/regex-syntax/README.md98
-rw-r--r--third_party/rust/regex-syntax/benches/bench.rs63
-rw-r--r--third_party/rust/regex-syntax/src/ast/mod.rs1502
-rw-r--r--third_party/rust/regex-syntax/src/ast/parse.rs5930
-rw-r--r--third_party/rust/regex-syntax/src/ast/print.rs568
-rw-r--r--third_party/rust/regex-syntax/src/ast/visitor.rs517
-rw-r--r--third_party/rust/regex-syntax/src/either.rs8
-rw-r--r--third_party/rust/regex-syntax/src/error.rs324
-rw-r--r--third_party/rust/regex-syntax/src/hir/interval.rs520
-rw-r--r--third_party/rust/regex-syntax/src/hir/literal/mod.rs1686
-rw-r--r--third_party/rust/regex-syntax/src/hir/mod.rs2296
-rw-r--r--third_party/rust/regex-syntax/src/hir/print.rs367
-rw-r--r--third_party/rust/regex-syntax/src/hir/translate.rs3207
-rw-r--r--third_party/rust/regex-syntax/src/hir/visitor.rs203
-rw-r--r--third_party/rust/regex-syntax/src/lib.rs312
-rw-r--r--third_party/rust/regex-syntax/src/parser.rs200
-rw-r--r--third_party/rust/regex-syntax/src/unicode.rs1001
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/LICENSE-UNICODE57
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/age.rs1791
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/case_folding_simple.rs2888
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/general_category.rs6552
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs1416
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/mod.rs57
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/perl_decimal.rs77
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/perl_space.rs23
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/perl_word.rs781
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/property_bool.rs11367
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/property_names.rs264
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/property_values.rs924
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/script.rs1263
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/script_extension.rs1457
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/sentence_break.rs2477
-rw-r--r--third_party/rust/regex-syntax/src/unicode_tables/word_break.rs1120
-rw-r--r--third_party/rust/regex-syntax/src/utf8.rs587
-rwxr-xr-xthird_party/rust/regex-syntax/test22
-rw-r--r--third_party/rust/regex/.cargo-checksum.json1
-rw-r--r--third_party/rust/regex/CHANGELOG.md1095
-rw-r--r--third_party/rust/regex/Cargo.lock98
-rw-r--r--third_party/rust/regex/Cargo.toml149
-rw-r--r--third_party/rust/regex/HACKING.md341
-rw-r--r--third_party/rust/regex/LICENSE-APACHE201
-rw-r--r--third_party/rust/regex/LICENSE-MIT25
-rw-r--r--third_party/rust/regex/PERFORMANCE.md277
-rw-r--r--third_party/rust/regex/README.md246
-rw-r--r--third_party/rust/regex/UNICODE.md259
-rw-r--r--third_party/rust/regex/examples/regexdna-input.txt1671
-rw-r--r--third_party/rust/regex/examples/regexdna-output.txt13
-rw-r--r--third_party/rust/regex/examples/shootout-regex-dna-bytes.rs68
-rw-r--r--third_party/rust/regex/examples/shootout-regex-dna-cheat.rs90
-rw-r--r--third_party/rust/regex/examples/shootout-regex-dna-replace.rs17
-rw-r--r--third_party/rust/regex/examples/shootout-regex-dna-single-cheat.rs75
-rw-r--r--third_party/rust/regex/examples/shootout-regex-dna-single.rs57
-rw-r--r--third_party/rust/regex/examples/shootout-regex-dna.rs68
-rw-r--r--third_party/rust/regex/rustfmt.toml2
-rw-r--r--third_party/rust/regex/src/backtrack.rs282
-rw-r--r--third_party/rust/regex/src/compile.rs1264
-rw-r--r--third_party/rust/regex/src/dfa.rs1945
-rw-r--r--third_party/rust/regex/src/error.rs71
-rw-r--r--third_party/rust/regex/src/exec.rs1655
-rw-r--r--third_party/rust/regex/src/expand.rs239
-rw-r--r--third_party/rust/regex/src/find_byte.rs18
-rw-r--r--third_party/rust/regex/src/freqs.rs261
-rw-r--r--third_party/rust/regex/src/input.rs432
-rw-r--r--third_party/rust/regex/src/lib.rs769
-rw-r--r--third_party/rust/regex/src/literal/imp.rs402
-rw-r--r--third_party/rust/regex/src/literal/mod.rs55
-rw-r--r--third_party/rust/regex/src/pattern.rs63
-rw-r--r--third_party/rust/regex/src/pikevm.rs360
-rw-r--r--third_party/rust/regex/src/pool.rs333
-rw-r--r--third_party/rust/regex/src/prog.rs447
-rw-r--r--third_party/rust/regex/src/re_builder.rs421
-rw-r--r--third_party/rust/regex/src/re_bytes.rs1260
-rw-r--r--third_party/rust/regex/src/re_set.rs507
-rw-r--r--third_party/rust/regex/src/re_trait.rs294
-rw-r--r--third_party/rust/regex/src/re_unicode.rs1311
-rw-r--r--third_party/rust/regex/src/sparse.rs84
-rw-r--r--third_party/rust/regex/src/testdata/LICENSE19
-rw-r--r--third_party/rust/regex/src/testdata/README17
-rw-r--r--third_party/rust/regex/src/testdata/basic.dat221
-rw-r--r--third_party/rust/regex/src/testdata/nullsubexpr.dat79
-rw-r--r--third_party/rust/regex/src/testdata/repetition.dat163
-rw-r--r--third_party/rust/regex/src/utf8.rs264
-rwxr-xr-xthird_party/rust/regex/test30
-rw-r--r--third_party/rust/regex/tests/api.rs234
-rw-r--r--third_party/rust/regex/tests/api_str.rs34
-rw-r--r--third_party/rust/regex/tests/bytes.rs107
-rw-r--r--third_party/rust/regex/tests/consistent.rs238
-rw-r--r--third_party/rust/regex/tests/crates_regex.rs3287
-rw-r--r--third_party/rust/regex/tests/crazy.rs459
-rw-r--r--third_party/rust/regex/tests/flags.rs31
-rw-r--r--third_party/rust/regex/tests/fowler.rs1588
-rw-r--r--third_party/rust/regex/tests/macros.rs160
-rw-r--r--third_party/rust/regex/tests/macros_bytes.rs39
-rw-r--r--third_party/rust/regex/tests/macros_str.rs38
-rw-r--r--third_party/rust/regex/tests/misc.rs4
-rw-r--r--third_party/rust/regex/tests/multiline.rs144
-rw-r--r--third_party/rust/regex/tests/noparse.rs45
-rw-r--r--third_party/rust/regex/tests/regression.rs222
-rw-r--r--third_party/rust/regex/tests/regression_fuzz.rs31
-rw-r--r--third_party/rust/regex/tests/replace.rs248
-rw-r--r--third_party/rust/regex/tests/searcher.rs95
-rw-r--r--third_party/rust/regex/tests/set.rs67
-rw-r--r--third_party/rust/regex/tests/shortest_match.rs14
-rw-r--r--third_party/rust/regex/tests/suffix_reverse.rs6
-rw-r--r--third_party/rust/regex/tests/test_backtrack.rs56
-rw-r--r--third_party/rust/regex/tests/test_backtrack_bytes.rs55
-rw-r--r--third_party/rust/regex/tests/test_backtrack_utf8bytes.rs58
-rw-r--r--third_party/rust/regex/tests/test_crates_regex.rs54
-rw-r--r--third_party/rust/regex/tests/test_default.rs222
-rw-r--r--third_party/rust/regex/tests/test_default_bytes.rs75
-rw-r--r--third_party/rust/regex/tests/test_nfa.rs50
-rw-r--r--third_party/rust/regex/tests/test_nfa_bytes.rs55
-rw-r--r--third_party/rust/regex/tests/test_nfa_utf8bytes.rs54
-rw-r--r--third_party/rust/regex/tests/unicode.rs251
-rw-r--r--third_party/rust/regex/tests/word_boundary.rs89
-rw-r--r--third_party/rust/regex/tests/word_boundary_ascii.rs9
-rw-r--r--third_party/rust/regex/tests/word_boundary_unicode.rs6
121 files changed, 78337 insertions, 0 deletions
diff --git a/third_party/rust/regex-syntax/.cargo-checksum.json b/third_party/rust/regex-syntax/.cargo-checksum.json
new file mode 100644
index 0000000000..9c05f32bc5
--- /dev/null
+++ b/third_party/rust/regex-syntax/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"238d0bbc855edbecf9a6a6936efc20bd2759f36bc8fa4d53bdef33a1629a9a0f","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"67a3e673a9da6826fd4db5be6902841c821b52b98dc22c300f6e327872392b0a","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"91b277a9bb979f85a44a67e39f17f77bde033764eea1f1a93aad1b61f1250089","src/ast/parse.rs":"150b42e944f766fdca70d654dbe32f8a17498432729c78b9eb50b73ae7f91f86","src/ast/print.rs":"d12f2cc75cd62f35623e1eb7a77ab8ac804b971752082700d2c4f550f834b249","src/ast/visitor.rs":"1a7b473147e4f6b89623ef1744a9e87f665bcf160fe08a33ce8e35011811ba71","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"b3c5903a8937d2aff229a3ec65d4571d01ec4d9874c9a242ed6562c32702bcbd","src/hir/interval.rs":"e767fed363bebe4bbda0d78b8f07e73f321eaf4f837e2d7bd14a1617387e9a89","src/hir/literal/mod.rs":"ffe9a0aff7827f97bffd29eb2f4ba96627b16953161dce6c50a2f760e76bbd98","src/hir/mod.rs":"7f83c828223a54d236d348e48d5cedf015c904812110b6c38e9d52039c2b1572","src/hir/print.rs":"651b5d9776532a78612a5f9081372a57bad693890639ac19e3128b4defa96662","src/hir/translate.rs":"c7cd9693f73760263fd49a968714d27e7985ebe840211b2d83bca6686b0602a8","src/hir/visitor.rs":"e5bf7f8c09f6155e59c9d676fe25437f7e3700f9bf5d91101d7e246a64c11d5a","src/lib.rs":"a004f65196dd5745b3112e4acc8c467b18495cecac64a58d6608b35de67371cb","src/parser.rs":"0dfb553a152e008b2755f115663e553ed99c4b8e6a4dcbcad1662737534de49d","src/unicode.rs":"2ad48193433fefbede0837bd645f4288f6b39b1facb59dbb7d541bce7bf19109","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238e
ff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f9666187","src/utf8.rs":"de854b3bfb3f7dbefc422f6a25935aaeef55ead2c35386c712a1fe9bf81a7b6f","test":"8a9bd1bd9fb389e08288f951319a9bbb0d4c5284a2ba63cbdab7f6afa2c2f76e"},"package":"456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"} \ No newline at end of file
diff --git a/third_party/rust/regex-syntax/Cargo.toml b/third_party/rust/regex-syntax/Cargo.toml
new file mode 100644
index 0000000000..8d87f7a429
--- /dev/null
+++ b/third_party/rust/regex-syntax/Cargo.toml
@@ -0,0 +1,41 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2018"
+name = "regex-syntax"
+version = "0.6.28"
+authors = ["The Rust Project Developers"]
+description = "A regular expression parser."
+homepage = "https://github.com/rust-lang/regex"
+documentation = "https://docs.rs/regex-syntax"
+readme = "README.md"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/rust-lang/regex"
+
+[features]
+default = ["unicode"]
+unicode = [
+ "unicode-age",
+ "unicode-bool",
+ "unicode-case",
+ "unicode-gencat",
+ "unicode-perl",
+ "unicode-script",
+ "unicode-segment",
+]
+unicode-age = []
+unicode-bool = []
+unicode-case = []
+unicode-gencat = []
+unicode-perl = []
+unicode-script = []
+unicode-segment = []
diff --git a/third_party/rust/regex-syntax/LICENSE-APACHE b/third_party/rust/regex-syntax/LICENSE-APACHE
new file mode 100644
index 0000000000..16fe87b06e
--- /dev/null
+++ b/third_party/rust/regex-syntax/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/third_party/rust/regex-syntax/LICENSE-MIT b/third_party/rust/regex-syntax/LICENSE-MIT
new file mode 100644
index 0000000000..39d4bdb5ac
--- /dev/null
+++ b/third_party/rust/regex-syntax/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2014 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/third_party/rust/regex-syntax/README.md b/third_party/rust/regex-syntax/README.md
new file mode 100644
index 0000000000..592f842686
--- /dev/null
+++ b/third_party/rust/regex-syntax/README.md
@@ -0,0 +1,98 @@
+regex-syntax
+============
+This crate provides a robust regular expression parser.
+
+[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
+[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax)
+[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
+
+
+### Documentation
+
+https://docs.rs/regex-syntax
+
+
+### Overview
+
+There are two primary types exported by this crate: `Ast` and `Hir`. The former
+is a faithful abstract syntax of a regular expression, and can convert regular
+expressions back to their concrete syntax while mostly preserving their original
+form. The latter type is a high level intermediate representation of a regular
+expression that is amenable to analysis and compilation into byte codes or
+automata. An `Hir` achieves this by drastically simplifying the syntactic
+structure of the regular expression. While an `Hir` can be converted back to
+its equivalent concrete syntax, the result is unlikely to resemble the original
+concrete syntax that produced the `Hir`.
+
+
+### Example
+
+This example shows how to parse a pattern string into its HIR:
+
+```rust
+use regex_syntax::Parser;
+use regex_syntax::hir::{self, Hir};
+
+let hir = Parser::new().parse("a|b").unwrap();
+assert_eq!(hir, Hir::alternation(vec![
+ Hir::literal(hir::Literal::Unicode('a')),
+ Hir::literal(hir::Literal::Unicode('b')),
+]));
+```
+
+
+### Safety
+
+This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's
+possible this crate could use `unsafe` code in the future, the standard
+for doing so is extremely high. In general, most code in this crate is not
+performance critical, since it tends to be dwarfed by the time it takes to
+compile a regular expression into an automaton. Therefore, there is little need
+for extreme optimization and, consequently, little need for `unsafe`.
+
+The standard for using `unsafe` in this crate is extremely high because this
+crate is intended to be reasonably safe to use with user supplied regular
+expressions. Therefore, while there may be bugs in the regex parser itself,
+they should _never_ result in memory unsafety unless there is either a bug
+in the compiler or the standard library. (Since `regex-syntax` has zero
+dependencies.)
+
+
+### Crate features
+
+By default, this crate bundles a fairly large amount of Unicode data tables
+(a source size of ~750KB). Because of their large size, one can disable some
+or all of these data tables. If a regular expression attempts to use Unicode
+data that is not available, then an error will occur when translating the `Ast`
+to the `Hir`.
+
+The full set of features one can disable is listed
+[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features).
+
+
+### Testing
+
+Simply running `cargo test` will give you very good coverage. However, because
+of the large number of features exposed by this crate, a `test` script is
+included in this directory which will test several feature combinations. This
+is the same script that is run in CI.
+
+
+### Motivation
+
+The primary purpose of this crate is to provide the parser used by `regex`.
+Specifically, this crate is treated as an implementation detail of `regex`,
+and is primarily developed for the needs of `regex`.
+
+Since this crate is an implementation detail of `regex`, it may experience
+breaking change releases at a different cadence from `regex`. This is only
+possible because this crate is _not_ a public dependency of `regex`.
+
+Another consequence of this de-coupling is that there is no direct way to
+compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must
+first convert the `Hir` to a string (via its `std::fmt::Display`) and then
+compile that via `Regex::new`. While this does repeat some work, compilation
+typically takes much longer than parsing.
+
+Stated differently, the coupling between `regex` and `regex-syntax` exists only
+at the level of the concrete syntax.
diff --git a/third_party/rust/regex-syntax/benches/bench.rs b/third_party/rust/regex-syntax/benches/bench.rs
new file mode 100644
index 0000000000..d4703d4fc1
--- /dev/null
+++ b/third_party/rust/regex-syntax/benches/bench.rs
@@ -0,0 +1,63 @@
+#![feature(test)]
+
+extern crate test;
+
+use regex_syntax::Parser;
+use test::Bencher;
+
+#[bench]
+fn parse_simple1(b: &mut Bencher) {
+ b.iter(|| {
+ let re = r"^bc(d|e)*$";
+ Parser::new().parse(re).unwrap()
+ });
+}
+
+#[bench]
+fn parse_simple2(b: &mut Bencher) {
+ b.iter(|| {
+ let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b";
+ Parser::new().parse(re).unwrap()
+ });
+}
+
+#[bench]
+fn parse_small1(b: &mut Bencher) {
+ b.iter(|| {
+ let re = r"\p{L}|\p{N}|\s|.|\d";
+ Parser::new().parse(re).unwrap()
+ });
+}
+
+#[bench]
+fn parse_medium1(b: &mut Bencher) {
+ b.iter(|| {
+ let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}";
+ Parser::new().parse(re).unwrap()
+ });
+}
+
+#[bench]
+fn parse_medium2(b: &mut Bencher) {
+ b.iter(|| {
+ let re = r"\s\S\w\W\d\D";
+ Parser::new().parse(re).unwrap()
+ });
+}
+
+#[bench]
+fn parse_medium3(b: &mut Bencher) {
+ b.iter(|| {
+ let re =
+ r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL";
+ Parser::new().parse(re).unwrap()
+ });
+}
+
+#[bench]
+fn parse_huge(b: &mut Bencher) {
+ b.iter(|| {
+ let re = r"\p{L}{100}";
+ Parser::new().parse(re).unwrap()
+ });
+}
diff --git a/third_party/rust/regex-syntax/src/ast/mod.rs b/third_party/rust/regex-syntax/src/ast/mod.rs
new file mode 100644
index 0000000000..387ea3a698
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/ast/mod.rs
@@ -0,0 +1,1502 @@
+/*!
+Defines an abstract syntax for regular expressions.
+*/
+
+use std::cmp::Ordering;
+use std::error;
+use std::fmt;
+
+pub use crate::ast::visitor::{visit, Visitor};
+
+pub mod parse;
+pub mod print;
+mod visitor;
+
+/// An error produced while parsing a regular expression into an abstract
+/// syntax tree.
+///
+/// Note that not every AST represents a valid regular expression. For
+/// example, an AST is constructed without error for `\p{Quux}`, but `Quux` is
+/// not a valid Unicode property name. That particular error is reported when
+/// translating an AST to the high-level intermediate representation (`HIR`).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Error {
+    /// What went wrong.
+    kind: ErrorKind,
+    /// The pattern the parser was given when it produced this error. Every
+    /// span in an error is a valid range into this string.
+    pattern: String,
+    /// Where in `pattern` the error was detected.
+    span: Span,
+}
+
+impl Error {
+    /// Return the type of this error.
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    /// The original pattern string in which this error occurred.
+    ///
+    /// Every span reported by this error is expressed in terms of this
+    /// string.
+    pub fn pattern(&self) -> &str {
+        self.pattern.as_str()
+    }
+
+    /// Return the span at which this error occurred.
+    pub fn span(&self) -> &Span {
+        &self.span
+    }
+
+    /// Return an auxiliary span, if this kind of error carries one.
+    ///
+    /// Only errors that benefit from pointing at two locations in the
+    /// original regular expression have one. For example, "duplicate" errors
+    /// set the main error position to the duplicate occurrence and the
+    /// auxiliary span to the initial occurrence. All other kinds yield
+    /// `None`.
+    pub fn auxiliary_span(&self) -> Option<&Span> {
+        match &self.kind {
+            ErrorKind::FlagDuplicate { original }
+            | ErrorKind::FlagRepeatedNegation { original, .. }
+            | ErrorKind::GroupNameDuplicate { original, .. } => Some(original),
+            _ => None,
+        }
+    }
+}
+
+/// The type of an error that occurred while building an AST.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ErrorKind {
+ /// The capturing group limit was exceeded.
+ ///
+ /// Note that this represents a limit on the total number of capturing
+ /// groups in a regex and not necessarily the number of nested capturing
+ /// groups. That is, the nest limit can be low and it is still possible for
+ /// this error to occur.
+ CaptureLimitExceeded,
+ /// An invalid escape sequence was found in a character class set.
+ ClassEscapeInvalid,
+ /// An invalid character class range was found. An invalid range is any
+ /// range where the start is greater than the end.
+ ClassRangeInvalid,
+ /// An invalid range boundary was found in a character class. Range
+ /// boundaries must be a single literal codepoint, but this error indicates
+ /// that something else was found, such as a nested class.
+ ClassRangeLiteral,
+ /// An opening `[` was found with no corresponding closing `]`.
+ ClassUnclosed,
+ /// A decimal literal was empty.
+ ///
+ /// Note that this error variant is no longer used. Namely, a decimal
+ /// number can only appear as a repetition quantifier. When the number
+ /// in a repetition quantifier is empty, then it gets its own specialized
+ /// error, `RepetitionCountDecimalEmpty`.
+ DecimalEmpty,
+ /// An invalid decimal number was given where one was expected.
+ DecimalInvalid,
+ /// A bracketed hex literal was empty.
+ EscapeHexEmpty,
+ /// A bracketed hex literal did not correspond to a Unicode scalar value.
+ EscapeHexInvalid,
+ /// An invalid hexadecimal digit was found.
+ EscapeHexInvalidDigit,
+ /// EOF was found before an escape sequence was completed.
+ EscapeUnexpectedEof,
+ /// An unrecognized escape sequence.
+ EscapeUnrecognized,
+ /// A dangling negation was used when setting flags, e.g., `i-`.
+ FlagDanglingNegation,
+ /// A flag was used twice, e.g., `i-i`.
+ FlagDuplicate {
+ /// The position of the original flag. The error position
+ /// points to the duplicate flag.
+ original: Span,
+ },
+ /// The negation operator was used twice, e.g., `-i-s`.
+ FlagRepeatedNegation {
+ /// The position of the original negation operator. The error position
+ /// points to the duplicate negation operator.
+ original: Span,
+ },
+ /// Expected a flag but got EOF, e.g., `(?`.
+ FlagUnexpectedEof,
+ /// Unrecognized flag, e.g., `a`.
+ FlagUnrecognized,
+ /// A duplicate capture name was found.
+ GroupNameDuplicate {
+ /// The position of the initial occurrence of the capture name. The
+ /// error position itself points to the duplicate occurrence.
+ original: Span,
+ },
+ /// A capture group name is empty, e.g., `(?P<>abc)`.
+ GroupNameEmpty,
+ /// An invalid character was seen for a capture group name. This includes
+ /// errors where the first character is a digit (even though subsequent
+ /// characters are allowed to be digits).
+ GroupNameInvalid,
+ /// A closing `>` could not be found for a capture group name.
+ GroupNameUnexpectedEof,
+ /// An unclosed group, e.g., `(ab`.
+ ///
+ /// The span of this error corresponds to the unclosed parenthesis.
+ GroupUnclosed,
+ /// An unopened group, e.g., `ab)`.
+ GroupUnopened,
+ /// The nest limit was exceeded. The limit stored here is the limit
+ /// configured in the parser.
+ NestLimitExceeded(u32),
+ /// The range provided in a counted repetition operator is invalid. The
+ /// range is invalid if the start is greater than the end.
+ RepetitionCountInvalid,
+ /// An opening `{` was not followed by a valid decimal value.
+ /// For example, `x{}` or `x{]}` would fail.
+ RepetitionCountDecimalEmpty,
+ /// An opening `{` was found with no corresponding closing `}`.
+ RepetitionCountUnclosed,
+ /// A repetition operator was applied to a missing sub-expression. This
+ /// occurs, for example, in the regex consisting of just a `*` or even
+ /// `(?i)*`. It is, however, possible to create a repetition operating on
+ /// an empty sub-expression. For example, `()*` is still considered valid.
+ RepetitionMissing,
+ /// The Unicode class is not valid. This typically occurs when a `\p` is
+ /// followed by something other than a `{`.
+ UnicodeClassInvalid,
+ /// When octal support is disabled, this error is produced when an octal
+ /// escape is used. The octal escape is assumed to be an invocation of
+ /// a backreference, which is the common case.
+ UnsupportedBackreference,
+ /// When syntax similar to PCRE's look-around is used, this error is
+ /// returned. Some example syntaxes that are rejected include, but are
+ /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
+ /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
+ /// error is used to improve the user experience.
+ UnsupportedLookAround,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl error::Error for Error {
+    // TODO: Remove this method entirely on the next breaking semver release.
+    /// Return a short, static description of this error's kind.
+    ///
+    /// Every constructible `ErrorKind` variant has its own arm. Only the
+    /// hidden `__Nonexhaustive` marker reaches the `unreachable!()`
+    /// fallback.
+    #[allow(deprecated)]
+    fn description(&self) -> &str {
+        use self::ErrorKind::*;
+        match self.kind {
+            CaptureLimitExceeded => "capture group limit exceeded",
+            ClassEscapeInvalid => "invalid escape sequence in character class",
+            ClassRangeInvalid => "invalid character class range",
+            ClassRangeLiteral => "invalid range boundary, must be a literal",
+            ClassUnclosed => "unclosed character class",
+            DecimalEmpty => "empty decimal literal",
+            DecimalInvalid => "invalid decimal literal",
+            EscapeHexEmpty => "empty hexadecimal literal",
+            EscapeHexInvalid => "invalid hexadecimal literal",
+            EscapeHexInvalidDigit => "invalid hexadecimal digit",
+            EscapeUnexpectedEof => "unexpected eof (escape sequence)",
+            EscapeUnrecognized => "unrecognized escape sequence",
+            FlagDanglingNegation => "dangling flag negation operator",
+            FlagDuplicate { .. } => "duplicate flag",
+            FlagRepeatedNegation { .. } => "repeated negation",
+            FlagUnexpectedEof => "unexpected eof (flag)",
+            FlagUnrecognized => "unrecognized flag",
+            GroupNameDuplicate { .. } => "duplicate capture group name",
+            GroupNameEmpty => "empty capture group name",
+            GroupNameInvalid => "invalid capture group name",
+            GroupNameUnexpectedEof => "unclosed capture group name",
+            GroupUnclosed => "unclosed group",
+            GroupUnopened => "unopened group",
+            NestLimitExceeded(_) => "nest limit exceeded",
+            RepetitionCountInvalid => "invalid repetition count range",
+            // This variant previously had no arm here, so asking for the
+            // description of such an error hit `unreachable!()` and panicked.
+            RepetitionCountDecimalEmpty => {
+                "repetition quantifier expects a valid decimal"
+            }
+            RepetitionCountUnclosed => "unclosed counted repetition",
+            RepetitionMissing => "repetition operator missing expression",
+            UnicodeClassInvalid => "invalid Unicode character class",
+            UnsupportedBackreference => "backreferences are not supported",
+            UnsupportedLookAround => "look-around is not supported",
+            _ => unreachable!(),
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Rendering is delegated to the crate-level error formatter.
+        let rendered = crate::error::Formatter::from(self);
+        rendered.fmt(f)
+    }
+}
+
+impl fmt::Display for ErrorKind {
+    /// Write the human-readable message for this kind of error.
+    ///
+    /// Two kinds interpolate a number into their message and are written
+    /// straight away; every other kind maps to a fixed string. The hidden
+    /// `__Nonexhaustive` marker is never expected here and panics.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use self::ErrorKind::*;
+        let message = match *self {
+            CaptureLimitExceeded => {
+                return write!(
+                    f,
+                    "exceeded the maximum number of \
+                     capturing groups ({})",
+                    ::std::u32::MAX
+                );
+            }
+            NestLimitExceeded(limit) => {
+                return write!(
+                    f,
+                    "exceed the maximum number of \
+                     nested parentheses/brackets ({})",
+                    limit
+                );
+            }
+            ClassEscapeInvalid => {
+                "invalid escape sequence found in character class"
+            }
+            ClassRangeInvalid => {
+                "invalid character class range, \
+                 the start must be <= the end"
+            }
+            ClassRangeLiteral => "invalid range boundary, must be a literal",
+            ClassUnclosed => "unclosed character class",
+            DecimalEmpty => "decimal literal empty",
+            DecimalInvalid => "decimal literal invalid",
+            EscapeHexEmpty => "hexadecimal literal empty",
+            EscapeHexInvalid => {
+                "hexadecimal literal is not a Unicode scalar value"
+            }
+            EscapeHexInvalidDigit => "invalid hexadecimal digit",
+            EscapeUnexpectedEof => {
+                "incomplete escape sequence, \
+                 reached end of pattern prematurely"
+            }
+            EscapeUnrecognized => "unrecognized escape sequence",
+            FlagDanglingNegation => "dangling flag negation operator",
+            FlagDuplicate { .. } => "duplicate flag",
+            FlagRepeatedNegation { .. } => "flag negation operator repeated",
+            FlagUnexpectedEof => "expected flag but got end of regex",
+            FlagUnrecognized => "unrecognized flag",
+            GroupNameDuplicate { .. } => "duplicate capture group name",
+            GroupNameEmpty => "empty capture group name",
+            GroupNameInvalid => "invalid capture group character",
+            GroupNameUnexpectedEof => "unclosed capture group name",
+            GroupUnclosed => "unclosed group",
+            GroupUnopened => "unopened group",
+            RepetitionCountInvalid => {
+                "invalid repetition count range, \
+                 the start must be <= the end"
+            }
+            RepetitionCountDecimalEmpty => {
+                "repetition quantifier expects a valid decimal"
+            }
+            RepetitionCountUnclosed => "unclosed counted repetition",
+            RepetitionMissing => "repetition operator missing expression",
+            UnicodeClassInvalid => "invalid Unicode character class",
+            UnsupportedBackreference => "backreferences are not supported",
+            UnsupportedLookAround => {
+                "look-around, including look-ahead and look-behind, \
+                 is not supported"
+            }
+            _ => unreachable!(),
+        };
+        f.write_str(message)
+    }
+}
+
+/// Span represents the position information of a single AST item.
+///
+/// All span positions are absolute byte offsets that can be used on the
+/// original regular expression that was parsed.
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub struct Span {
+ /// The start position of this span (byte offset plus line and column).
+ pub start: Position,
+ /// The end position of this span (byte offset plus line and column).
+ pub end: Position,
+}
+
+impl fmt::Debug for Span {
+    /// Render as `Span(<start>, <end>)` using each position's `Debug` form.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Span { start, end } = *self;
+        f.write_fmt(format_args!("Span({:?}, {:?})", start, end))
+    }
+}
+
+impl Ord for Span {
+    /// Order spans by start position, breaking ties on the end position.
+    fn cmp(&self, other: &Span) -> Ordering {
+        self.start
+            .cmp(&other.start)
+            .then_with(|| self.end.cmp(&other.end))
+    }
+}
+
+impl PartialOrd for Span {
+    /// Always `Some`: defers to the total order given by `Ord`.
+    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+/// A single position in a regular expression.
+///
+/// A position encodes one half of a span, and includes the byte offset, line
+/// number and column number.
+///
+/// Ordering looks at `offset` only, whereas equality (derived) compares all
+/// three fields.
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub struct Position {
+    /// The absolute offset of this position, starting at `0` from the
+    /// beginning of the regular expression pattern string.
+    pub offset: usize,
+    /// The line number, starting at `1`.
+    pub line: usize,
+    /// The approximate column number, starting at `1`.
+    pub column: usize,
+}
+
+impl fmt::Debug for Position {
+    /// Render as `Position(o: <offset>, l: <line>, c: <column>)`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Position { offset, line, column } = *self;
+        write!(f, "Position(o: {:?}, l: {:?}, c: {:?})", offset, line, column)
+    }
+}
+
+impl Ord for Position {
+    /// Compare two positions by their byte offsets.
+    fn cmp(&self, other: &Position) -> Ordering {
+        Ord::cmp(&self.offset, &other.offset)
+    }
+}
+
+impl PartialOrd for Position {
+    /// Always `Some`: defers to the total order given by `Ord`.
+    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+impl Span {
+    /// Create a new span with the given positions.
+    pub fn new(start: Position, end: Position) -> Span {
+        Self { start, end }
+    }
+
+    /// Create a new span whose start and end are both the given position.
+    pub fn splat(pos: Position) -> Span {
+        Span { start: pos, end: pos }
+    }
+
+    /// Create a new span by replacing the starting position with the one
+    /// given.
+    pub fn with_start(self, pos: Position) -> Span {
+        Span::new(pos, self.end)
+    }
+
+    /// Create a new span by replacing the ending position with the one
+    /// given.
+    pub fn with_end(self, pos: Position) -> Span {
+        Span::new(self.start, pos)
+    }
+
+    /// Returns true if and only if this span occurs on a single line.
+    pub fn is_one_line(&self) -> bool {
+        let (first, last) = (self.start.line, self.end.line);
+        first == last
+    }
+
+    /// Returns true if and only if this span is empty. That is, it points to
+    /// a single position in the concrete syntax of a regular expression.
+    pub fn is_empty(&self) -> bool {
+        let (from, to) = (self.start.offset, self.end.offset);
+        from == to
+    }
+}
+
+impl Position {
+    /// Create a new position from its three components.
+    ///
+    /// * `offset` is the absolute offset of the position, starting at `0`
+    ///   from the beginning of the regular expression pattern string.
+    /// * `line` is the line number, starting at `1`.
+    /// * `column` is the approximate column number, starting at `1`.
+    pub fn new(offset: usize, line: usize, column: usize) -> Position {
+        Self { offset, line, column }
+    }
+}
+
+/// An abstract syntax tree for a single regular expression along with the
+/// comments found in it.
+///
+/// Comments are not stored in the tree itself to avoid complexity. Each
+/// comment contains a span of precisely where it occurred in the original
+/// regular expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct WithComments {
+ /// The actual ast.
+ pub ast: Ast,
+ /// All comments found in the original regular expression.
+ pub comments: Vec<Comment>,
+}
+
+/// A comment from a regular expression with an associated span.
+///
+/// A regular expression can only contain comments when the `x` flag is
+/// enabled. Comments are collected in `WithComments::comments` rather than
+/// being stored in the `Ast` itself.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Comment {
+ /// The span of this comment, including the beginning `#` and ending `\n`.
+ pub span: Span,
+ /// The comment text, starting with the first character following the `#`
+ /// and ending with the last character preceding the `\n`.
+ pub comment: String,
+}
+
+/// An abstract syntax tree for a single regular expression.
+///
+/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
+/// space proportional to the size of the `Ast`.
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the `Ast`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Ast {
+    /// An empty regex that matches everything.
+    Empty(Span),
+    /// A set of flags, e.g., `(?is)`.
+    Flags(SetFlags),
+    /// A single character literal, which includes escape sequences.
+    Literal(Literal),
+    /// The "any character" class.
+    Dot(Span),
+    /// A single zero-width assertion.
+    Assertion(Assertion),
+    /// A single character class. This includes all forms of character classes
+    /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
+    Class(Class),
+    /// A repetition operator applied to an arbitrary regular expression.
+    Repetition(Repetition),
+    /// A grouped regular expression.
+    Group(Group),
+    /// An alternation of regular expressions.
+    Alternation(Alternation),
+    /// A concatenation of regular expressions.
+    Concat(Concat),
+}
+
+impl Ast {
+    /// Return the span of this abstract syntax tree.
+    pub fn span(&self) -> &Span {
+        match self {
+            // These two variants carry their span directly.
+            Ast::Empty(span) | Ast::Dot(span) => span,
+            Ast::Flags(flags) => &flags.span,
+            Ast::Literal(literal) => &literal.span,
+            Ast::Assertion(assertion) => &assertion.span,
+            Ast::Class(class) => class.span(),
+            Ast::Repetition(repetition) => &repetition.span,
+            Ast::Group(group) => &group.span,
+            Ast::Alternation(alternation) => &alternation.span,
+            Ast::Concat(concat) => &concat.span,
+        }
+    }
+
+    /// Return true if and only if this Ast is empty.
+    pub fn is_empty(&self) -> bool {
+        if let Ast::Empty(_) = *self {
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Returns true if and only if this AST has any (including possibly empty)
+    /// subexpressions.
+    fn has_subexprs(&self) -> bool {
+        match self {
+            Ast::Class(_)
+            | Ast::Repetition(_)
+            | Ast::Group(_)
+            | Ast::Alternation(_)
+            | Ast::Concat(_) => true,
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_) => false,
+        }
+    }
+}
+
+/// Print a display representation of this Ast.
+///
+/// None of the whitespace formatting that may have been present in the
+/// concrete syntax this Ast was generated from is preserved.
+///
+/// This implementation uses constant stack space and heap space proportional
+/// to the size of the `Ast`.
+impl fmt::Display for Ast {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // All of the work is done by the AST printer.
+        crate::ast::print::Printer::new().print(self, f)
+    }
+}
+
+/// An alternation of regular expressions.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Alternation {
+    /// The span of this alternation.
+    pub span: Span,
+    /// The alternate regular expressions.
+    pub asts: Vec<Ast>,
+}
+
+impl Alternation {
+    /// Return this alternation as an AST.
+    ///
+    /// With zero ASTs the result is `Ast::Empty` carrying this alternation's
+    /// span. With exactly one AST that AST is returned on its own. Otherwise
+    /// the whole alternation is wrapped in `Ast::Alternation`.
+    pub fn into_ast(mut self) -> Ast {
+        if self.asts.len() == 1 {
+            // The length was just checked, so `pop` cannot fail.
+            return self.asts.pop().unwrap();
+        }
+        if self.asts.is_empty() {
+            Ast::Empty(self.span)
+        } else {
+            Ast::Alternation(self)
+        }
+    }
+}
+
+/// A concatenation of regular expressions.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Concat {
+    /// The span of this concatenation.
+    pub span: Span,
+    /// The concatenation regular expressions.
+    pub asts: Vec<Ast>,
+}
+
+impl Concat {
+    /// Return this concatenation as an AST.
+    ///
+    /// With zero ASTs the result is `Ast::Empty` carrying this
+    /// concatenation's span. With exactly one AST that AST is returned on its
+    /// own. Otherwise the whole concatenation is wrapped in `Ast::Concat`.
+    pub fn into_ast(mut self) -> Ast {
+        if self.asts.len() == 1 {
+            // The length was just checked, so `pop` cannot fail.
+            return self.asts.pop().unwrap();
+        }
+        if self.asts.is_empty() {
+            Ast::Empty(self.span)
+        } else {
+            Ast::Concat(self)
+        }
+    }
+}
+
+/// A single literal expression.
+///
+/// A literal corresponds to a single Unicode scalar value. Literals may be
+/// represented in their literal form, e.g., `a` or in their escaped form,
+/// e.g., `\x61`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Literal {
+    /// The span of this literal.
+    pub span: Span,
+    /// The kind of this literal.
+    pub kind: LiteralKind,
+    /// The Unicode scalar value corresponding to this literal.
+    pub c: char,
+}
+
+impl Literal {
+    /// If this literal was written as a `\x` hex escape, then this returns
+    /// the corresponding byte value. Otherwise, this returns `None`.
+    ///
+    /// Only the fixed-width `\x` form whose value is at most 255 qualifies.
+    pub fn byte(&self) -> Option<u8> {
+        match self.kind {
+            LiteralKind::HexFixed(HexLiteralKind::X)
+                if self.c as u32 <= 255 =>
+            {
+                Some(self.c as u8)
+            }
+            _ => None,
+        }
+    }
+}
+
+/// The kind of a single literal expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum LiteralKind {
+ /// The literal is written verbatim, e.g., `a` or `☃`.
+ Verbatim,
+ /// The literal is written as an escape because it is punctuation, e.g.,
+ /// `\*` or `\[`.
+ Punctuation,
+ /// The literal is written as an octal escape, e.g., `\141`.
+ Octal,
+ /// The literal is written as a hex code with a fixed number of digits
+ /// depending on the type of the escape, e.g., `\x61` or `\u0061` or
+ /// `\U00000061`.
+ HexFixed(HexLiteralKind),
+ /// The literal is written as a hex code with a bracketed number of
+ /// digits. The only restriction is that the bracketed hex code must refer
+ /// to a valid Unicode scalar value.
+ HexBrace(HexLiteralKind),
+ /// The literal is written as a specially recognized escape, e.g., `\f`
+ /// or `\n`.
+ Special(SpecialLiteralKind),
+}
+
+/// The type of a special literal.
+///
+/// A special literal is a special escape sequence recognized by the regex
+/// parser, e.g., `\f` or `\n`. Each variant's documentation gives the escape
+/// spelling followed by the character it denotes.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum SpecialLiteralKind {
+ /// Bell, spelled `\a` (`\x07`).
+ Bell,
+ /// Form feed, spelled `\f` (`\x0C`).
+ FormFeed,
+ /// Tab, spelled `\t` (`\x09`).
+ Tab,
+ /// Line feed, spelled `\n` (`\x0A`).
+ LineFeed,
+ /// Carriage return, spelled `\r` (`\x0D`).
+ CarriageReturn,
+ /// Vertical tab, spelled `\v` (`\x0B`).
+ VerticalTab,
+ /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
+ /// parsing in verbose mode.
+ Space,
+}
+
+/// The type of a Unicode hex literal.
+///
+/// Note that all variants behave the same when used with brackets. They only
+/// differ when used without brackets in the number of hex digits that must
+/// follow.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HexLiteralKind {
+    /// A `\x` prefix. When used without brackets, this form is limited to
+    /// two digits.
+    X,
+    /// A `\u` prefix. When used without brackets, this form is limited to
+    /// four digits.
+    UnicodeShort,
+    /// A `\U` prefix. When used without brackets, this form is limited to
+    /// eight digits.
+    UnicodeLong,
+}
+
+impl HexLiteralKind {
+    /// The number of digits this literal form requires when written without
+    /// brackets. The bracketed form places no restriction on the number of
+    /// digits.
+    pub fn digits(&self) -> u32 {
+        let width = match self {
+            HexLiteralKind::UnicodeLong => 8,
+            HexLiteralKind::UnicodeShort => 4,
+            HexLiteralKind::X => 2,
+        };
+        width
+    }
+}
+
+/// A single character class expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Class {
+    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
+    Unicode(ClassUnicode),
+    /// A perl character class, e.g., `\d` or `\W`.
+    Perl(ClassPerl),
+    /// A bracketed character class set, which may contain zero or more
+    /// character ranges and/or zero or more nested classes. e.g.,
+    /// `[a-zA-Z\pL]`.
+    Bracketed(ClassBracketed),
+}
+
+impl Class {
+    /// Return the span of this character class.
+    pub fn span(&self) -> &Span {
+        match self {
+            Class::Unicode(unicode) => &unicode.span,
+            Class::Perl(perl) => &perl.span,
+            Class::Bracketed(bracketed) => &bracketed.span,
+        }
+    }
+}
+
+/// A Perl character class, e.g., `\d` or `\W`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassPerl {
+ /// The span of this class.
+ pub span: Span,
+ /// The kind of Perl class.
+ pub kind: ClassPerlKind,
+ /// Whether the class is negated or not. e.g., `\d` is not negated but
+ /// `\D` is.
+ pub negated: bool,
+}
+
+/// The available Perl character classes.
+///
+/// Negation is not part of the kind; it is recorded separately in
+/// `ClassPerl::negated`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassPerlKind {
+ /// Decimal numbers.
+ Digit,
+ /// Whitespace.
+ Space,
+ /// Word characters.
+ Word,
+}
+
+/// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassAscii {
+ /// The span of this class.
+ pub span: Span,
+ /// The kind of ASCII class.
+ pub kind: ClassAsciiKind,
+ /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
+ /// but `[[:^alpha:]]` is.
+ pub negated: bool,
+}
+
+/// The available ASCII character classes.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassAsciiKind {
+    /// `[0-9A-Za-z]`
+    Alnum,
+    /// `[A-Za-z]`
+    Alpha,
+    /// `[\x00-\x7F]`
+    Ascii,
+    /// `[ \t]`
+    Blank,
+    /// `[\x00-\x1F\x7F]`
+    Cntrl,
+    /// `[0-9]`
+    Digit,
+    /// `[!-~]`
+    Graph,
+    /// `[a-z]`
+    Lower,
+    /// `[ -~]`
+    Print,
+    /// `[!-/:-@\[-`{-~]`
+    Punct,
+    /// `[\t\n\v\f\r ]`
+    Space,
+    /// `[A-Z]`
+    Upper,
+    /// `[0-9A-Za-z_]`
+    Word,
+    /// `[0-9A-Fa-f]`
+    Xdigit,
+}
+
+impl ClassAsciiKind {
+    /// Look up the `ClassAsciiKind` variant for the given class name.
+    ///
+    /// The name is expected to be the lowercase version of the variant name,
+    /// e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. The comparison
+    /// is exact.
+    ///
+    /// Returns `None` when no variant has that name.
+    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
+        use self::ClassAsciiKind::*;
+        // Each accepted name paired with the variant it selects.
+        const NAMES: &[(&str, ClassAsciiKind)] = &[
+            ("alnum", Alnum),
+            ("alpha", Alpha),
+            ("ascii", Ascii),
+            ("blank", Blank),
+            ("cntrl", Cntrl),
+            ("digit", Digit),
+            ("graph", Graph),
+            ("lower", Lower),
+            ("print", Print),
+            ("punct", Punct),
+            ("space", Space),
+            ("upper", Upper),
+            ("word", Word),
+            ("xdigit", Xdigit),
+        ];
+        NAMES
+            .iter()
+            .find(|entry| entry.0 == name)
+            .map(|entry| entry.1.clone())
+    }
+}
+
+/// A Unicode character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassUnicode {
+    /// The span of this class.
+    pub span: Span,
+    /// Whether this class is negated or not.
+    ///
+    /// Note: be careful when using this attribute. This specifically refers
+    /// to whether the class is written as `\p` or `\P`, where the latter
+    /// is `negated = true`. However, it is also possible to write something
+    /// like `\P{scx!=Katakana}` which is actually equivalent to
+    /// `\p{scx=Katakana}` and is therefore not actually negated even though
+    /// `negated = true` here. To test whether this class is truly negated
+    /// or not, use the `is_negated` method.
+    pub negated: bool,
+    /// The kind of Unicode class.
+    pub kind: ClassUnicodeKind,
+}
+
+impl ClassUnicode {
+    /// Returns true if this class has been negated.
+    ///
+    /// Note that this takes the Unicode op into account, if it's present.
+    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
+    pub fn is_negated(&self) -> bool {
+        // A `!=` op flips the meaning of the `\p` / `\P` spelling.
+        let flipped_by_op = if let ClassUnicodeKind::NamedValue {
+            op: ClassUnicodeOpKind::NotEqual,
+            ..
+        } = self.kind
+        {
+            true
+        } else {
+            false
+        };
+        self.negated != flipped_by_op
+    }
+}
+
+/// The available forms of Unicode character classes.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassUnicodeKind {
+ /// A one letter abbreviated class, e.g., `\pN`.
+ OneLetter(char),
+ /// A binary property, general category or script, e.g., `\p{Greek}`. The
+ /// string may be empty.
+ Named(String),
+ /// A property name and an associated value, e.g., `\p{scx=Katakana}`.
+ NamedValue {
+ /// The type of Unicode op used to associate `name` with `value`.
+ op: ClassUnicodeOpKind,
+ /// The property name (which may be empty).
+ name: String,
+ /// The property value (which may be empty).
+ value: String,
+ },
+}
+
+/// The type of op used in a Unicode character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassUnicodeOpKind {
+    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
+    Equal,
+    /// A property set to a specific value using a colon, e.g.,
+    /// `\p{scx:Katakana}`.
+    Colon,
+    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
+    NotEqual,
+}
+
+impl ClassUnicodeOpKind {
+    /// Whether the op is an equality op or not.
+    ///
+    /// Both the `=` and `:` spellings count as equality.
+    pub fn is_equal(&self) -> bool {
+        match self {
+            ClassUnicodeOpKind::NotEqual => false,
+            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
+        }
+    }
+}
+
+/// A bracketed character class, e.g., `[a-z0-9]`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassBracketed {
+ /// The span of this class.
+ pub span: Span,
+ /// Whether this class is negated or not. e.g., `[a]` is not negated but
+ /// `[^a]` is.
+ pub negated: bool,
+ /// The contents of this class. A set is either a normal union of things,
+ /// e.g., `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
+ pub kind: ClassSet,
+}
+
+/// A character class set.
+///
+/// This type corresponds to the internal structure of a bracketed character
+/// class. That is, every bracketed character class is one of two types: a
+/// union of items (literals, ranges, other bracketed classes) or a tree of
+/// binary set operations.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassSet {
+    /// An item, which can be a single literal, range, nested character class
+    /// or a union of items.
+    Item(ClassSetItem),
+    /// A single binary operation (i.e., &&, -- or ~~).
+    BinaryOp(ClassSetBinaryOp),
+}
+
+impl ClassSet {
+    /// Build a set from a union.
+    pub fn union(ast: ClassSetUnion) -> ClassSet {
+        let item = ClassSetItem::Union(ast);
+        ClassSet::Item(item)
+    }
+
+    /// Return the span of this character class set.
+    pub fn span(&self) -> &Span {
+        match self {
+            ClassSet::BinaryOp(op) => &op.span,
+            ClassSet::Item(item) => item.span(),
+        }
+    }
+
+    /// Return true if and only if this class set is empty.
+    fn is_empty(&self) -> bool {
+        if let ClassSet::Item(ClassSetItem::Empty(_)) = *self {
+            true
+        } else {
+            false
+        }
+    }
+}
+
+/// A single component of a character class set.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ClassSetItem {
+    /// An empty item.
+    ///
+    /// Note that a bracketed character class cannot contain a single empty
+    /// item. Empty items can appear when using one of the binary operators.
+    /// For example, `[&&]` is the intersection of two empty classes.
+    Empty(Span),
+    /// A single literal.
+    Literal(Literal),
+    /// A range between two literals.
+    Range(ClassSetRange),
+    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
+    Ascii(ClassAscii),
+    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
+    Unicode(ClassUnicode),
+    /// A perl character class, e.g., `\d` or `\W`.
+    Perl(ClassPerl),
+    /// A bracketed character class set, which may contain zero or more
+    /// character ranges and/or zero or more nested classes. e.g.,
+    /// `[a-zA-Z\pL]`.
+    Bracketed(Box<ClassBracketed>),
+    /// A union of items.
+    Union(ClassSetUnion),
+}
+
+impl ClassSetItem {
+    /// Return the span of this character class set item.
+    pub fn span(&self) -> &Span {
+        match self {
+            ClassSetItem::Empty(span) => span,
+            ClassSetItem::Literal(literal) => &literal.span,
+            ClassSetItem::Range(range) => &range.span,
+            ClassSetItem::Ascii(ascii) => &ascii.span,
+            ClassSetItem::Unicode(unicode) => &unicode.span,
+            ClassSetItem::Perl(perl) => &perl.span,
+            ClassSetItem::Bracketed(bracketed) => &bracketed.span,
+            ClassSetItem::Union(union) => &union.span,
+        }
+    }
+}
+
+/// A single character class range in a set.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassSetRange {
+    /// The span of this range.
+    pub span: Span,
+    /// The start of this range.
+    pub start: Literal,
+    /// The end of this range.
+    pub end: Literal,
+}
+
+impl ClassSetRange {
+    /// Returns true if and only if this character class range is valid.
+    ///
+    /// A range is invalid only when its start character is greater than its
+    /// end character.
+    pub fn is_valid(&self) -> bool {
+        let (low, high) = (self.start.c, self.end.c);
+        !(low > high)
+    }
+}
+
+/// A union of items inside a character class set.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassSetUnion {
+ /// The span of the items in this operation, e.g., the `a-z0-9` in
+ /// `[^a-z0-9]`.
+ pub span: Span,
+ /// The sequence of items that make up this union, in the order they were added.
+ pub items: Vec<ClassSetItem>,
+}
+
+impl ClassSetUnion {
+ /// Append an item to this union and grow the union's span to cover it.
+ ///
+ /// The union's span always ends where the pushed item ends. When the
+ /// union had no items yet, its span also starts where the item starts.
+ ///
+ /// As long as items are only ever added through this method and each
+ /// item carries a correct span, the union's own span never needs to be
+ /// adjusted by hand.
+ pub fn push(&mut self, item: ClassSetItem) {
+ let Span { start, end, .. } = *item.span();
+ if self.items.is_empty() {
+ self.span.start = start;
+ }
+ self.span.end = end;
+ self.items.push(item);
+ }
+
+ /// Convert this union into a single character class set item.
+ ///
+ /// A union with no items becomes `ClassSetItem::Empty` carrying the
+ /// union's span. A union with exactly one item becomes that item. Any
+ /// larger union is wrapped in `ClassSetItem::Union`.
+ pub fn into_item(mut self) -> ClassSetItem {
+ if self.items.len() > 1 {
+ return ClassSetItem::Union(self);
+ }
+ match self.items.pop() {
+ Some(only) => only,
+ None => ClassSetItem::Empty(self.span),
+ }
+ }
+}
+
+/// A Unicode character class set operation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassSetBinaryOp {
+ /// The span of this operation, e.g., the `a-z--h-p` in `[a-z--h-p]`.
+ pub span: Span,
+ /// The type of this set operation.
+ pub kind: ClassSetBinaryOpKind,
+ /// The left hand side of the operation.
+ pub lhs: Box<ClassSet>,
+ /// The right hand side of the operation.
+ pub rhs: Box<ClassSet>,
+}
+
+/// The type of a Unicode character class set operation.
+///
+/// Note that this doesn't explicitly represent union since there is no
+/// explicit union operator. Concatenation inside a character class corresponds
+/// to the union operation.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum ClassSetBinaryOpKind {
+ /// The intersection of two sets (`&&`), e.g., `\pN&&[a-z]`.
+ Intersection,
+ /// The difference of two sets (`--`), e.g., `\pN--[0-9]`.
+ Difference,
+ /// The symmetric difference of two sets (`~~`). The symmetric difference
+ /// is the set of elements belonging to one but not both sets,
+ /// e.g., `[\pL~~[:ascii:]]`.
+ SymmetricDifference,
+}
+
+/// A single zero-width assertion.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Assertion {
+ /// The span of this assertion.
+ pub span: Span,
+ /// The kind of this assertion, e.g., `\b` or `^`.
+ pub kind: AssertionKind,
+}
+
+/// The kind of a zero-width assertion, identified by its concrete syntax.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum AssertionKind {
+ /// `^` (start of line)
+ StartLine,
+ /// `$` (end of line)
+ EndLine,
+ /// `\A` (start of text)
+ StartText,
+ /// `\z` (end of text)
+ EndText,
+ /// `\b` (word boundary)
+ WordBoundary,
+ /// `\B` (not a word boundary)
+ NotWordBoundary,
+}
+
+/// A repetition operation applied to a regular expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Repetition {
+ /// The span of this operation.
+ pub span: Span,
+ /// The repetition operator itself, e.g., `*` or `{m,n}`.
+ pub op: RepetitionOp,
+ /// Whether this operation was applied greedily or not.
+ pub greedy: bool,
+ /// The regular expression under repetition.
+ pub ast: Box<Ast>,
+}
+
+/// The repetition operator itself.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct RepetitionOp {
+ /// The span of this operator. This covers things like `+`, `*?` and
+ /// `{m,n}`.
+ pub span: Span,
+ /// The kind of this operator.
+ pub kind: RepetitionKind,
+}
+
+/// The kind of a repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionKind {
+ /// `?` (zero or one)
+ ZeroOrOne,
+ /// `*` (zero or more)
+ ZeroOrMore,
+ /// `+` (one or more)
+ OneOrMore,
+ /// A counted repetition such as `{m}`, `{m,}` or `{m,n}`.
+ Range(RepetitionRange),
+}
+
+/// A counted (range) repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionRange {
+ /// `{m}`
+ Exactly(u32),
+ /// `{m,}`
+ AtLeast(u32),
+ /// `{m,n}`
+ Bounded(u32, u32),
+}
+
+impl RepetitionRange {
+ /// Reports whether this repetition range is well formed.
+ ///
+ /// Only a bounded range can be malformed, and only when its lower bound
+ /// is strictly greater than its upper bound.
+ pub fn is_valid(&self) -> bool {
+ if let RepetitionRange::Bounded(min, max) = *self {
+ return min <= max;
+ }
+ true
+ }
+}
+
+/// A grouped regular expression.
+///
+/// This includes both capturing and non-capturing groups. This does **not**
+/// include flag-only groups like `(?is)`, but does include any group that
+/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
+/// `(?is:a)`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Group {
+ /// The span of this group.
+ pub span: Span,
+ /// The kind of this group (capturing by index, by name, or non-capturing).
+ pub kind: GroupKind,
+ /// The regular expression in this group.
+ pub ast: Box<Ast>,
+}
+
+impl Group {
+ /// Return the (possibly empty) flags of a non-capturing group.
+ ///
+ /// Capturing groups carry no flags, so `None` is returned for them.
+ pub fn flags(&self) -> Option<&Flags> {
+ if let GroupKind::NonCapturing(ref flags) = self.kind {
+ Some(flags)
+ } else {
+ None
+ }
+ }
+
+ /// Reports whether this group is a capturing group.
+ pub fn is_capturing(&self) -> bool {
+ self.capture_index().is_some()
+ }
+
+ /// Return the capture index of this group when it is a capturing group.
+ ///
+ /// The result is `Some` exactly when `is_capturing` returns `true`.
+ pub fn capture_index(&self) -> Option<u32> {
+ match &self.kind {
+ GroupKind::NonCapturing(_) => None,
+ GroupKind::CaptureName(name) => Some(name.index),
+ GroupKind::CaptureIndex(index) => Some(*index),
+ }
+ }
+}
+
+/// The kind of a group.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum GroupKind {
+ /// `(a)`, carrying the capture index of the group.
+ CaptureIndex(u32),
+ /// `(?P<name>a)`, carrying the capture name (which includes its index).
+ CaptureName(CaptureName),
+ /// `(?:a)` and `(?i:a)`, carrying the (possibly empty) flags.
+ NonCapturing(Flags),
+}
+
+/// A capture name.
+///
+/// This corresponds to the name itself between the angle brackets in, e.g.,
+/// `(?P<foo>expr)`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct CaptureName {
+ /// The span of this capture name.
+ pub span: Span,
+ /// The capture name, e.g., `foo` in `(?P<foo>expr)`.
+ pub name: String,
+ /// The capture index of the group that carries this name.
+ pub index: u32,
+}
+
+/// A group of flags that is not applied to a particular regular expression,
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SetFlags {
+ /// The span of these flags, including the grouping parentheses.
+ pub span: Span,
+ /// The actual sequence of flags, e.g., the `is` in `(?is)`.
+ pub flags: Flags,
+}
+
+/// A group of flags.
+///
+/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Flags {
+ /// The span of this group of flags.
+ pub span: Span,
+ /// A sequence of flag items. Each item is either a flag or a negation
+ /// operator (`-`); `add_item` refuses items whose kind is already present.
+ pub items: Vec<FlagsItem>,
+}
+
+impl Flags {
+ /// Append the given item to this sequence of flags unless an item of
+ /// the same kind is already present.
+ ///
+ /// Returns `None` when the item was appended. When an item with an equal
+ /// kind already exists, nothing is added and `Some(i)` is returned, where
+ /// `i` is the position of the first such item, i.e.,
+ /// `items[i].kind == item.kind`.
+ pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
+ let existing = self.items.iter().position(|x| x.kind == item.kind);
+ if existing.is_none() {
+ self.items.push(item);
+ }
+ existing
+ }
+
+ /// Look up the state of the given flag in this set.
+ ///
+ /// * `Some(true)` when the flag appears before any negation operator.
+ /// * `Some(false)` when the flag appears after a negation operator.
+ /// * `None` when the flag does not appear at all.
+ ///
+ /// Only the first occurrence of the flag is considered.
+ pub fn flag_state(&self, flag: Flag) -> Option<bool> {
+ let mut enabled = true;
+ for item in self.items.iter() {
+ match item.kind {
+ FlagsItemKind::Flag(found) if found == flag => {
+ return Some(enabled);
+ }
+ // Every flag after the negation operator is switched off.
+ FlagsItemKind::Negation => enabled = false,
+ FlagsItemKind::Flag(_) => {}
+ }
+ }
+ None
+ }
+}
+
+/// A single item in a group of flags.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct FlagsItem {
+ /// The span of this item.
+ pub span: Span,
+ /// The kind of this item: either a flag or the negation operator.
+ pub kind: FlagsItemKind,
+}
+
+/// The kind of an item in a group of flags.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum FlagsItemKind {
+ /// The negation operator, which applies to all subsequent flags in the
+ /// enclosing group.
+ Negation,
+ /// A single flag in a group.
+ Flag(Flag),
+}
+
+impl FlagsItemKind {
+ /// Reports whether this item is the negation operator.
+ pub fn is_negation(&self) -> bool {
+ *self == FlagsItemKind::Negation
+ }
+}
+
+/// A single flag, identified by the letter used for it in a pattern.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Flag {
+ /// `i`
+ CaseInsensitive,
+ /// `m`
+ MultiLine,
+ /// `s`
+ DotMatchesNewLine,
+ /// `U`
+ SwapGreed,
+ /// `u`
+ Unicode,
+ /// `x`
+ IgnoreWhitespace,
+}
+
+/// A custom `Drop` impl is used for `Ast` so that dropping a nested tree does
+/// not recurse: sub-expressions are moved onto an explicit heap-allocated stack.
+impl Drop for Ast {
+ fn drop(&mut self) {
+ use std::mem;
+
+ match *self {
+ Ast::Empty(_)
+ | Ast::Flags(_)
+ | Ast::Literal(_)
+ | Ast::Dot(_)
+ | Ast::Assertion(_)
+ // Classes are recursive, so they get their own Drop impl.
+ | Ast::Class(_) => return,
+ Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, // child has nothing nested
+ Ast::Group(ref x) if !x.ast.has_subexprs() => return, // child has nothing nested
+ Ast::Alternation(ref x) if x.asts.is_empty() => return,
+ Ast::Concat(ref x) if x.asts.is_empty() => return,
+ _ => {} // nested sub-expressions exist: tear them down iteratively below
+ }
+
+ let empty_span = || Span::splat(Position::new(0, 0, 0));
+ let empty_ast = || Ast::Empty(empty_span()); // placeholder left behind by `mem::replace`
+ let mut stack = vec![mem::replace(self, empty_ast())]; // take ownership of the whole tree
+ while let Some(mut ast) = stack.pop() {
+ match ast {
+ Ast::Empty(_)
+ | Ast::Flags(_)
+ | Ast::Literal(_)
+ | Ast::Dot(_)
+ | Ast::Assertion(_)
+ // Classes are recursive, so they get their own Drop impl.
+ | Ast::Class(_) => {}
+ Ast::Repetition(ref mut x) => {
+ stack.push(mem::replace(&mut x.ast, empty_ast())); // detach the child before `ast` is dropped
+ }
+ Ast::Group(ref mut x) => {
+ stack.push(mem::replace(&mut x.ast, empty_ast())); // detach the child before `ast` is dropped
+ }
+ Ast::Alternation(ref mut x) => {
+ stack.extend(x.asts.drain(..)); // move every branch onto the work stack
+ }
+ Ast::Concat(ref mut x) => {
+ stack.extend(x.asts.drain(..)); // move every element onto the work stack
+ }
+ }
+ }
+ }
+}
+
+/// A custom `Drop` impl is used for `ClassSet` so that dropping a nested set
+/// does not recurse: nested sets are moved onto an explicit heap-allocated stack.
+impl Drop for ClassSet {
+ fn drop(&mut self) {
+ use std::mem;
+
+ match *self {
+ ClassSet::Item(ref item) => match *item {
+ ClassSetItem::Empty(_)
+ | ClassSetItem::Literal(_)
+ | ClassSetItem::Range(_)
+ | ClassSetItem::Ascii(_)
+ | ClassSetItem::Unicode(_)
+ | ClassSetItem::Perl(_) => return, // these items contain no nested sets
+ ClassSetItem::Bracketed(ref x) => {
+ if x.kind.is_empty() {
+ return;
+ }
+ }
+ ClassSetItem::Union(ref x) => {
+ if x.items.is_empty() {
+ return;
+ }
+ }
+ },
+ ClassSet::BinaryOp(ref op) => {
+ if op.lhs.is_empty() && op.rhs.is_empty() {
+ return;
+ }
+ }
+ }
+
+ let empty_span = || Span::splat(Position::new(0, 0, 0));
+ let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); // placeholder left behind by `mem::replace`
+ let mut stack = vec![mem::replace(self, empty_set())]; // take ownership of the whole set
+ while let Some(mut set) = stack.pop() {
+ match set {
+ ClassSet::Item(ref mut item) => match *item {
+ ClassSetItem::Empty(_)
+ | ClassSetItem::Literal(_)
+ | ClassSetItem::Range(_)
+ | ClassSetItem::Ascii(_)
+ | ClassSetItem::Unicode(_)
+ | ClassSetItem::Perl(_) => {}
+ ClassSetItem::Bracketed(ref mut x) => {
+ stack.push(mem::replace(&mut x.kind, empty_set())); // detach the nested set
+ }
+ ClassSetItem::Union(ref mut x) => {
+ stack.extend(x.items.drain(..).map(ClassSet::Item)); // wrap each item so it can sit on the stack
+ }
+ },
+ ClassSet::BinaryOp(ref mut op) => {
+ stack.push(mem::replace(&mut op.lhs, empty_set())); // detach the left operand
+ stack.push(mem::replace(&mut op.rhs, empty_set())); // detach the right operand
+ }
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // We use a thread with an explicit stack size to test that the destructor
+ // for Ast can drop a deeply nested expression without overflowing the
+ // stack. In case we run on a platform without threads (WASM?), we limit
+ // this test to Windows/Unix.
+ #[test]
+ #[cfg(any(unix, windows))]
+ fn no_stack_overflow_on_drop() {
+ use std::thread;
+
+ let run = || {
+ let span = || Span::splat(Position::new(0, 0, 0));
+ let mut ast = Ast::Empty(span());
+ for i in 0..200 { // build 200 levels of nested groups
+ ast = Ast::Group(Group {
+ span: span(),
+ kind: GroupKind::CaptureIndex(i),
+ ast: Box::new(ast),
+ });
+ }
+ assert!(!ast.is_empty()); // `ast` is dropped when this closure returns
+ };
+
+ // We run our test on a thread with a small stack size so we can
+ // force the issue more easily.
+ thread::Builder::new()
+ .stack_size(1 << 10) // request 1024 bytes of stack
+ .spawn(run)
+ .unwrap()
+ .join()
+ .unwrap();
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/ast/parse.rs b/third_party/rust/regex-syntax/src/ast/parse.rs
new file mode 100644
index 0000000000..6e9c9aca06
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/ast/parse.rs
@@ -0,0 +1,5930 @@
+/*!
+This module provides a regular expression parser.
+*/
+
+use std::borrow::Borrow;
+use std::cell::{Cell, RefCell};
+use std::mem;
+use std::result;
+
+use crate::ast::{self, Ast, Position, Span};
+use crate::either::Either;
+
+use crate::is_meta_character;
+
+type Result<T> = result::Result<T, ast::Error>;
+
+/// A primitive is an expression with no sub-expressions. This includes
+/// literals, assertions and non-set character classes. This representation
+/// is used as intermediate state in the parser.
+///
+/// This does not include ASCII character classes, since they can only appear
+/// within a set character class.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum Primitive {
+ Literal(ast::Literal), // a literal
+ Assertion(ast::Assertion), // a zero-width assertion; not allowed inside a class
+ Dot(Span), // `.`, represented only by its span; not allowed inside a class
+ Perl(ast::ClassPerl), // a Perl class
+ Unicode(ast::ClassUnicode), // a Unicode class
+}
+
+impl Primitive {
+ /// Borrow the span of the pattern text covered by this primitive.
+ fn span(&self) -> &Span {
+ match self {
+ Primitive::Dot(span) => span,
+ Primitive::Literal(lit) => &lit.span,
+ Primitive::Assertion(assertion) => &assertion.span,
+ Primitive::Perl(cls) => &cls.span,
+ Primitive::Unicode(cls) => &cls.span,
+ }
+ }
+
+ /// Turn this primitive into the `Ast` node that represents it.
+ ///
+ /// Perl and Unicode classes are wrapped in `Ast::Class`; every other
+ /// primitive maps to the `Ast` variant of the same name.
+ fn into_ast(self) -> Ast {
+ let class = match self {
+ Primitive::Literal(lit) => return Ast::Literal(lit),
+ Primitive::Assertion(assertion) => return Ast::Assertion(assertion),
+ Primitive::Dot(span) => return Ast::Dot(span),
+ Primitive::Perl(cls) => ast::Class::Perl(cls),
+ Primitive::Unicode(cls) => ast::Class::Unicode(cls),
+ };
+ Ast::Class(class)
+ }
+
+ /// Turn this primitive into an item of a character class.
+ ///
+ /// Literals, Perl classes and Unicode classes are legal class items.
+ /// Assertions and the dot are not; for those, a `ClassEscapeInvalid`
+ /// error spanning the primitive is returned.
+ fn into_class_set_item<P: Borrow<Parser>>(
+ self,
+ p: &ParserI<'_, P>,
+ ) -> Result<ast::ClassSetItem> {
+ let item = match self {
+ Primitive::Literal(lit) => ast::ClassSetItem::Literal(lit),
+ Primitive::Perl(cls) => ast::ClassSetItem::Perl(cls),
+ Primitive::Unicode(cls) => ast::ClassSetItem::Unicode(cls),
+ invalid => {
+ let span = *invalid.span();
+ return Err(p.error(span, ast::ErrorKind::ClassEscapeInvalid));
+ }
+ };
+ Ok(item)
+ }
+
+ /// Turn this primitive into a literal usable as an endpoint of a class
+ /// range. Literals are the only primitives permitted in ranges.
+ ///
+ /// Any other primitive (a class, an assertion or the dot) yields a
+ /// `ClassRangeLiteral` error spanning the primitive.
+ fn into_class_literal<P: Borrow<Parser>>(
+ self,
+ p: &ParserI<'_, P>,
+ ) -> Result<ast::Literal> {
+ match self {
+ Primitive::Literal(lit) => Ok(lit),
+ invalid => {
+ let span = *invalid.span();
+ Err(p.error(span, ast::ErrorKind::ClassRangeLiteral))
+ }
+ }
+ }
+}
+
+/// Reports whether `c` is an ASCII hexadecimal digit (`0-9`, `a-f` or `A-F`).
+fn is_hex(c: char) -> bool {
+ c.is_ascii_hexdigit()
+}
+
+/// Reports whether `c` may appear in a capture group name.
+///
+/// ASCII letters and `_` are always allowed. ASCII digits, `.`, `[` and `]`
+/// are allowed everywhere except in the first position, which is selected
+/// by passing `first == true`.
+fn is_capture_char(c: char, first: bool) -> bool {
+ match c {
+ '_' | 'A'..='Z' | 'a'..='z' => true,
+ '0'..='9' | '.' | '[' | ']' => !first,
+ _ => false,
+ }
+}
+
+/// A builder for a regular expression parser.
+///
+/// This builder permits modifying configuration options for the parser.
+#[derive(Clone, Debug)]
+pub struct ParserBuilder {
+ ignore_whitespace: bool, // initial state of verbose (`x`) mode
+ nest_limit: u32, // copied into the built parser's nest limit
+ octal: bool, // whether octal syntax is supported
+}
+
+impl Default for ParserBuilder {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl ParserBuilder {
+ /// Create a builder holding the default configuration: verbose mode
+ /// off, a nest limit of 250 and octal syntax off.
+ pub fn new() -> ParserBuilder {
+ Self { ignore_whitespace: false, nest_limit: 250, octal: false }
+ }
+
+ /// Build a parser from this configuration.
+ ///
+ /// The parser starts at offset 0, line 1, column 1 with no captures,
+ /// comments or pending group/class state recorded.
+ pub fn build(&self) -> Parser {
+ let ParserBuilder { ignore_whitespace, nest_limit, octal } = *self;
+ let origin = Position { offset: 0, line: 1, column: 1 };
+ Parser {
+ pos: Cell::new(origin),
+ capture_index: Cell::new(0),
+ nest_limit,
+ octal,
+ initial_ignore_whitespace: ignore_whitespace,
+ ignore_whitespace: Cell::new(ignore_whitespace),
+ comments: RefCell::new(Vec::new()),
+ stack_group: RefCell::new(Vec::new()),
+ stack_class: RefCell::new(Vec::new()),
+ capture_names: RefCell::new(Vec::new()),
+ scratch: RefCell::new(String::new()),
+ }
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit bounds how deep the abstract syntax tree may be.
+ /// When the AST exceeds the limit (for example through too many nested
+ /// groups), the parser returns an error.
+ ///
+ /// The limit is a heuristic that protects consumers which walk an `Ast`
+ /// with explicit recursion from overflowing their stack. This crate does
+ /// not do so itself (it uses constant stack space and keeps its call
+ /// stack on the heap), but other crates may.
+ ///
+ /// The limit is only checked once the entire Ast has been parsed. Callers
+ /// that want to bound heap usage should therefore bound the length, in
+ /// bytes, of the concrete pattern string instead; that works because
+ /// this parser limits itself to heap space proportional to the length of
+ /// the pattern string.
+ ///
+ /// A nest limit of `0` produces a nest limit error for most patterns, but
+ /// not all: it permits `a` but rejects `ab`, since `ab` requires a
+ /// concatenation and therefore has a nest depth of `1`. Because the nest
+ /// depth is not obvious from the concrete syntax, the limit should not be
+ /// used in a granular way.
+ pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
+ self.nest_limit = limit;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of writing Unicode codepoints in a
+ /// regular expression. For example, `a`, `\x61`, `\u0061` and `\141` are
+ /// all equivalent regular expressions, the last one using octal syntax.
+ ///
+ /// Supporting octal syntax is not a problem in itself, but it makes good
+ /// error messages harder. In PCRE based regex engines, syntax like `\0`
+ /// invokes a backreference, which is explicitly unsupported in Rust's
+ /// regex engine even though many users expect it to work. When octal
+ /// support is disabled, the error message therefore explicitly mentions
+ /// that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.octal = yes;
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+ /// Verbose mode permits insignificant whitespace in many places in the
+ /// regular expression, as well as comments. A comment starts with `#`
+ /// and continues until the end of the line.
+ ///
+ /// This is disabled by default. Regardless of this setting, it may be
+ /// enabled selectively inside the regular expression with the `x` flag.
+ pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.ignore_whitespace = yes;
+ self
+ }
+}
+
+/// A regular expression parser.
+///
+/// This parses a string representation of a regular expression into an
+/// abstract syntax tree. The size of the tree is proportional to the length
+/// of the regular expression pattern.
+///
+/// A `Parser` can be configured in more detail via a
+/// [`ParserBuilder`](struct.ParserBuilder.html).
+#[derive(Clone, Debug)]
+pub struct Parser {
+ /// The current position of the parser (offset, line and column).
+ pos: Cell<Position>,
+ /// The most recently assigned capture index; `0` when none was assigned.
+ capture_index: Cell<u32>,
+ /// The maximum number of open parens/brackets allowed. If the parser
+ /// exceeds this number, then an error is returned.
+ nest_limit: u32,
+ /// Whether to support octal syntax or not. When `false`, the parser will
+ /// return an error helpfully pointing out that backreferences are not
+ /// supported.
+ octal: bool,
+ /// The initial setting for `ignore_whitespace` as provided by
+ /// `ParserBuilder`. It is used when resetting the parser's state.
+ initial_ignore_whitespace: bool,
+ /// Whether whitespace should be ignored. When enabled, comments are
+ /// also permitted.
+ ignore_whitespace: Cell<bool>,
+ /// A list of comments, in order of appearance.
+ comments: RefCell<Vec<ast::Comment>>,
+ /// A stack of grouped sub-expressions, including alternations.
+ stack_group: RefCell<Vec<GroupState>>,
+ /// A stack of nested character classes. This is only non-empty when
+ /// parsing a class.
+ stack_class: RefCell<Vec<ClassState>>,
+ /// A sequence of capture names kept sorted by name. This is used to
+ /// detect duplicate capture names and report an error if one is detected.
+ capture_names: RefCell<Vec<ast::CaptureName>>,
+ /// A scratch buffer used in various places. Mostly this is used to
+ /// accumulate relevant characters from parts of a pattern.
+ scratch: RefCell<String>,
+}
+
+/// ParserI is the internal parser implementation.
+///
+/// We use this separate type so that we can carry the provided pattern string
+/// along with us. In particular, a `Parser`'s internal state is not tied to
+/// any one pattern, but a `ParserI` is.
+///
+/// This type also lets us use `ParserI<&Parser>` in production code while
+/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
+/// work against the internal interface of the parser.
+#[derive(Clone, Debug)]
+struct ParserI<'s, P> {
+ /// The parser state/configuration (anything that borrows as a `Parser`).
+ parser: P,
+ /// The full regular expression provided by the user.
+ pattern: &'s str,
+}
+
+/// GroupState represents a single stack frame while parsing nested groups
+/// and alternations. Each frame records the state up to an opening parenthesis
+/// or an alternation bar `|`.
+#[derive(Clone, Debug)]
+enum GroupState {
+ /// This state is pushed whenever an opening group is found.
+ Group {
+ /// The concatenation immediately preceding the opening group.
+ concat: ast::Concat,
+ /// The group that has been opened. Its sub-AST is always empty.
+ group: ast::Group,
+ /// Whether this group has the `x` flag enabled or not.
+ ignore_whitespace: bool,
+ },
+ /// This state is pushed whenever a new alternation branch is found. If
+ /// an alternation branch is found and this state is already at the top of
+ /// the stack, then this state should be modified to include the new
+ /// alternation instead of pushing another one.
+ Alternation(ast::Alternation),
+}
+
+/// ClassState represents a single stack frame while parsing character classes.
+/// Each frame records the state up to an intersection, difference, symmetric
+/// difference or nested class.
+///
+/// Note that a parser's character class stack is only non-empty when parsing
+/// a character class. In all other cases, it is empty.
+#[derive(Clone, Debug)]
+enum ClassState {
+ /// This state is pushed whenever an opening bracket is found.
+ Open {
+ /// The union of class items immediately preceding this class.
+ union: ast::ClassSetUnion,
+ /// The class that has been opened. Typically this just corresponds
+ /// to the `[`, but it can also include `[^` since `^` indicates
+ /// negation of the class.
+ set: ast::ClassBracketed,
+ },
+ /// This state is pushed when an operator is seen. When popped, the stored
+ /// set becomes the left hand side of the operator.
+ Op {
+ /// The type of the operation, i.e., `&&`, `--` or `~~`.
+ kind: ast::ClassSetBinaryOpKind,
+ /// The left-hand side of the operator.
+ lhs: ast::ClassSet,
+ },
+}
+
+impl Parser {
+ /// Create a new parser with a default configuration.
+ ///
+ /// The parser can be run with either the `parse` or `parse_with_comments`
+ /// methods. The parse methods return an abstract syntax tree.
+ ///
+ /// To set configuration options on the parser, use
+ /// [`ParserBuilder`](struct.ParserBuilder.html).
+ pub fn new() -> Parser {
+ ParserBuilder::new().build()
+ }
+
+ /// Parse the regular expression into an abstract syntax tree.
+ pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
+ ParserI::new(self, pattern).parse()
+ }
+
+ /// Parse the regular expression and return an abstract syntax tree with
+ /// all of the comments found in the pattern.
+ pub fn parse_with_comments(
+ &mut self,
+ pattern: &str,
+ ) -> Result<ast::WithComments> {
+ ParserI::new(self, pattern).parse_with_comments()
+ }
+
+ /// Reset the internal state of a parser.
+ ///
+ /// This is called at the beginning of every parse. This prevents the
+ /// parser from running with inconsistent state (say, if a previous
+ /// invocation returned an error and the parser is reused).
+ ///
+ /// All per-pattern state is restored to what `ParserBuilder::build`
+ /// produces. That includes the capture index counter and the recorded
+ /// capture names: without resetting those, a reused parser would keep
+ /// numbering groups after the previous pattern's groups and would report
+ /// a name used by an earlier pattern as a duplicate.
+ fn reset(&self) {
+ // These settings should be in line with the construction
+ // in `ParserBuilder::build`.
+ self.pos.set(Position { offset: 0, line: 1, column: 1 });
+ self.capture_index.set(0);
+ self.ignore_whitespace.set(self.initial_ignore_whitespace);
+ self.comments.borrow_mut().clear();
+ self.stack_group.borrow_mut().clear();
+ self.stack_class.borrow_mut().clear();
+ self.capture_names.borrow_mut().clear();
+ self.scratch.borrow_mut().clear();
+ }
+}
+
+impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
+ /// Pair a parser configuration with the pattern it is going to parse.
+ fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
+ Self { parser, pattern }
+ }
+
+ /// Borrow the underlying parser state.
+ fn parser(&self) -> &Parser {
+ <P as Borrow<Parser>>::borrow(&self.parser)
+ }
+
+ /// Borrow the pattern string that is being parsed.
+ fn pattern(&self) -> &str {
+ self.pattern
+ }
+
+ /// Create a new error with the given span and error type.
+ fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
+ ast::Error { kind, pattern: self.pattern().to_string(), span }
+ }
+
+ /// Return the parser's current offset into the pattern string.
+ ///
+ /// A freshly built or reset parser is at offset `0`, the beginning of
+ /// the regular expression pattern string.
+ fn offset(&self) -> usize {
+ let Position { offset, .. } = self.parser().pos.get();
+ offset
+ }
+
+ /// Return the parser's current line number.
+ ///
+ /// Line numbers start at `1`.
+ fn line(&self) -> usize {
+ let Position { line, .. } = self.parser().pos.get();
+ line
+ }
+
+ /// Return the parser's current column number.
+ ///
+ /// Column numbers start at `1` and restart whenever a `\n` is seen.
+ fn column(&self) -> usize {
+ let Position { column, .. } = self.parser().pos.get();
+ column
+ }
+
+ /// Allocate and return the next capture index. Every successful call
+ /// advances the internal counter by one.
+ ///
+ /// The span given should correspond to the location of the opening
+ /// parenthesis; it is only used for error reporting.
+ ///
+ /// When the counter cannot be incremented without overflowing `u32`, a
+ /// `CaptureLimitExceeded` error is returned and the counter is left
+ /// unchanged.
+ fn next_capture_index(&self, span: Span) -> Result<u32> {
+ let counter = &self.parser().capture_index;
+ match counter.get().checked_add(1) {
+ Some(next) => {
+ counter.set(next);
+ Ok(next)
+ }
+ None => Err(self.error(span, ast::ErrorKind::CaptureLimitExceeded)),
+ }
+ }
+
+ /// Record the given capture name with this parser.
+ ///
+ /// Names are kept sorted so that duplicates can be found with a binary
+ /// search. When the name was already recorded, a `GroupNameDuplicate`
+ /// error is returned that points at the span of the original name.
+ fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
+ let mut names = self.parser().capture_names.borrow_mut();
+ let wanted = cap.name.as_str();
+ let lookup = names.binary_search_by(|known| known.name.as_str().cmp(wanted));
+ match lookup {
+ Ok(found) => {
+ let original = names[found].span;
+ Err(self.error(
+ cap.span,
+ ast::ErrorKind::GroupNameDuplicate { original },
+ ))
+ }
+ Err(slot) => {
+ names.insert(slot, cap.clone());
+ Ok(())
+ }
+ }
+ }
+
+ /// Reports whether the parser is currently ignoring whitespace.
+ fn ignore_whitespace(&self) -> bool {
+ let parser = self.parser();
+ parser.ignore_whitespace.get()
+ }
+
+ /// Return the character the parser is currently positioned at.
+ ///
+ /// This panics if the current position does not point to a valid char.
+ fn char(&self) -> char {
+ let here = self.offset();
+ self.char_at(here)
+ }
+
+ /// Return the character that starts at byte offset `i` of the pattern.
+ ///
+ /// This panics if the given position does not point to a valid char.
+ fn char_at(&self, i: usize) -> char {
+ match self.pattern()[i..].chars().next() {
+ Some(c) => c,
+ None => panic!("expected char at offset {}", i),
+ }
+ }
+
+ /// Advance the parser by one Unicode scalar value, updating the offset,
+ /// line and column.
+ ///
+ /// Returns `false` when the parser was already at the end of the input
+ /// or has reached it by this step, and `true` when more input remains.
+ fn bump(&self) -> bool {
+ if self.is_eof() {
+ return false;
+ }
+ let current = self.char();
+ let before = self.pos();
+ let (line, column) = if current == '\n' {
+ (before.line.checked_add(1).unwrap(), 1)
+ } else {
+ (before.line, before.column.checked_add(1).unwrap())
+ };
+ let offset = before.offset + current.len_utf8();
+ self.parser().pos.set(Position { offset, line, column });
+ !self.pattern()[self.offset()..].is_empty()
+ }
+
+ /// Consume `prefix` if the remaining input starts with it.
+ ///
+ /// On a match the parser is advanced to the character immediately after
+ /// the prefix and `true` is returned. Otherwise the parser is left where
+ /// it is and `false` is returned.
+ fn bump_if(&self, prefix: &str) -> bool {
+ if !self.pattern()[self.offset()..].starts_with(prefix) {
+ return false;
+ }
+ for _ in prefix.chars() {
+ self.bump();
+ }
+ true
+ }
+
+ /// Reports whether the parser is positioned at a look-around prefix
+ /// (`?=`, `?!`, `?<=` or `?<!`). A matching prefix is consumed.
+ ///
+ /// The conditions under which this returns true must always correspond
+ /// to a regular expression that would otherwise be considered invalid.
+ ///
+ /// This should only be called immediately after parsing the opening of
+ /// a group or a set of flags.
+ fn is_lookaround_prefix(&self) -> bool {
+ ["?=", "?!", "?<=", "?<!"].iter().any(|prefix| self.bump_if(prefix))
+ }
+
+ /// Advance the parser by one character and then, if the `x` flag is
+ /// enabled, past any whitespace and comments that follow. Returns true
+ /// if and only if the parser is not at EOF afterwards.
+ fn bump_and_bump_space(&self) -> bool {
+ if self.bump() {
+ self.bump_space();
+ !self.is_eof()
+ } else {
+ false
+ }
+ }
+
+ /// If the `x` flag is enabled (i.e., whitespace insensitivity with
+ /// comments), then this will advance the parser through all whitespace
+ /// and comments to the next non-whitespace non-comment character. Each
+ /// comment skipped this way is recorded in the parser's comment list.
+ /// If the `x` flag is disabled, then this is a no-op.
+ ///
+ /// This should be used selectively throughout the parser where
+ /// arbitrary whitespace is permitted when the `x` flag is enabled. For
+ /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
+ fn bump_space(&self) {
+ if !self.ignore_whitespace() {
+ return;
+ }
+ while !self.is_eof() {
+ if self.char().is_whitespace() {
+ self.bump();
+ } else if self.char() == '#' {
+ let start = self.pos(); // the comment's span starts at the `#`
+ let mut comment_text = String::new();
+ self.bump(); // skip the `#`; it is not part of the comment text
+ while !self.is_eof() {
+ let c = self.char();
+ self.bump(); // the terminating `\n` is consumed too
+ if c == '\n' {
+ break;
+ }
+ comment_text.push(c);
+ }
+ let comment = ast::Comment {
+ span: Span::new(start, self.pos()),
+ comment: comment_text,
+ };
+ self.parser().comments.borrow_mut().push(comment);
+ } else {
+ break; // first significant character: stop here
+ }
+ }
+ }
+
+ /// Look at the character after the current one without advancing the
+ /// parser.
+ ///
+ /// Returns `None` when the parser is at the end of the input or when the
+ /// current character is the last one.
+ fn peek(&self) -> Option<char> {
+ if self.is_eof() {
+ return None;
+ }
+ let mut rest = self.pattern()[self.offset()..].chars();
+ // Step over the current character; whatever follows is the answer.
+ rest.next();
+ rest.next()
+ }
+
+ /// Like peek, but will ignore spaces and `#` comments when the parser is
+ /// in whitespace insensitive mode, mirroring what `bump_space` skips.
+ ///
+ /// Returns the first character after the current one that is neither
+ /// whitespace nor part of a comment, or `None` when the parser is at EOF
+ /// or nothing but whitespace and comments remain. The parser is not
+ /// advanced.
+ ///
+ /// When whitespace insensitive mode is off, this is exactly `peek`.
+ fn peek_space(&self) -> Option<char> {
+     if !self.ignore_whitespace() {
+         return self.peek();
+     }
+     if self.is_eof() {
+         return None;
+     }
+     let start = self.offset() + self.char().len_utf8();
+     let mut in_comment = false;
+     for c in self.pattern()[start..].chars() {
+         if in_comment {
+             // The comment state must be tested before the whitespace
+             // test: `\n` is itself whitespace, so checking whitespace
+             // first would never let a comment end.
+             if c == '\n' {
+                 in_comment = false;
+             }
+         } else if c == '#' {
+             in_comment = true;
+         } else if !c.is_whitespace() {
+             return Some(c);
+         }
+     }
+     // Only whitespace and comments remained after the current character.
+     None
+ }
+
+ /// Returns true once the parser's offset equals the length of the
+ /// pattern, i.e. when the next call to `bump` would return false.
+ fn is_eof(&self) -> bool {
+     let pattern_len = self.pattern().len();
+     self.offset() == pattern_len
+ }
+
+ /// Read the parser's current position (offset, line and column) out of
+ /// the underlying parser state.
+ fn pos(&self) -> Position {
+     let parser = self.parser();
+     parser.pos.get()
+ }
+
+ /// Build a span from the parser's current position alone, using that one
+ /// position for both the start and the end.
+ fn span(&self) -> Span {
+     let here = self.pos();
+     Span::splat(here)
+ }
+
+ /// Build a span that starts at the current position and ends just past
+ /// the current character.
+ ///
+ /// The end position is one column further along the same line, unless
+ /// the current character is `\n`, in which case it is column 1 of the
+ /// next line.
+ fn span_char(&self) -> Span {
+     let ch = self.char();
+     let end_offset = self.offset().checked_add(ch.len_utf8()).unwrap();
+     let next_column = self.column().checked_add(1).unwrap();
+     let (line, column) = if ch == '\n' {
+         (self.line() + 1, 1)
+     } else {
+         (self.line(), next_column)
+     };
+     let end = Position { offset: end_offset, line, column };
+     Span::new(self.pos(), end)
+ }
+
+ /// Finish the current alternation branch and record it on the parser's
+ /// internal stack. If an alternation is already on top of the stack, the
+ /// branch is added to it; otherwise a new alternation is pushed.
+ ///
+ /// `concat` is the branch that just ended; its span is closed at the
+ /// position of the `|`. The returned concatenation is empty and begins
+ /// the next branch.
+ ///
+ /// The parser must be positioned at `|`. On return it has been advanced
+ /// to the character following the `|`.
+ #[inline(never)]
+ fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
+     assert_eq!(self.char(), '|');
+     let bar = self.pos();
+     concat.span.end = bar;
+     self.push_or_add_alternation(concat);
+     self.bump();
+     let next_branch = ast::Concat { span: self.span(), asts: vec![] };
+     Ok(next_branch)
+ }
+
+ /// Record one finished alternation branch in the parser's group stack.
+ ///
+ /// If the top of the stack is already an alternation, the branch is
+ /// appended to it. Otherwise a new alternation is pushed whose span runs
+ /// from the start of the branch to the parser's current position.
+ fn push_or_add_alternation(&self, concat: ast::Concat) {
+     let mut groups = self.parser().stack_group.borrow_mut();
+     if let Some(&mut GroupState::Alternation(ref mut open_alt)) =
+         groups.last_mut()
+     {
+         open_alt.asts.push(concat.into_ast());
+         return;
+     }
+     let span = Span::new(concat.span.start, self.pos());
+     let branches = vec![concat.into_ast()];
+     groups.push(GroupState::Alternation(ast::Alternation {
+         span,
+         asts: branches,
+     }));
+ }
+
+ /// Parse the opening of a group and push it, together with the
+ /// concatenation built so far, on to the parser's internal stack. The
+ /// returned concatenation is fresh and collects the group's sub-AST.
+ ///
+ /// If a bare set of flags (no group) was found instead, the flags are
+ /// appended to the given concatenation, which is returned. Nothing is
+ /// pushed on the stack in that case; an `x` setting in the flags updates
+ /// the whitespace mode immediately.
+ ///
+ /// For a real group, the whitespace mode in effect before the group is
+ /// saved on the stack entry, and any `x` setting carried by the group's
+ /// flags becomes the mode for its contents.
+ ///
+ /// The parser must be positioned on the opening parenthesis. It is
+ /// advanced to the start of the sub-expression (or adjoining expression).
+ ///
+ /// Any error from parsing the start of the group is returned.
+ #[inline(never)]
+ fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
+     assert_eq!(self.char(), '(');
+     let group = match self.parse_group()? {
+         Either::Left(set) => {
+             let requested =
+                 set.flags.flag_state(ast::Flag::IgnoreWhitespace);
+             if let Some(enabled) = requested {
+                 self.parser().ignore_whitespace.set(enabled);
+             }
+
+             concat.asts.push(Ast::Flags(set));
+             return Ok(concat);
+         }
+         Either::Right(group) => group,
+     };
+
+     let outer_mode = self.ignore_whitespace();
+     let explicit_mode = group
+         .flags()
+         .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace));
+     let inner_mode = match explicit_mode {
+         Some(mode) => mode,
+         None => outer_mode,
+     };
+     self.parser().stack_group.borrow_mut().push(GroupState::Group {
+         concat,
+         group,
+         ignore_whitespace: outer_mode,
+     });
+     self.parser().ignore_whitespace.set(inner_mode);
+     Ok(ast::Concat { span: self.span(), asts: vec![] })
+ }
+
+ /// Pop a group AST from the parser's internal stack and set the group's
+ /// AST to the given concatenation (folded into a pending alternation if one
+ /// sits above the group). Return the prior concatenation with the group appended.
+ ///
+ /// This assumes that the parser is currently positioned on the closing
+ /// parenthesis and advances the parser to the character following the `)`.
+ ///
+ /// If no such group could be popped, then an unopened group error is
+ /// returned. On success the whitespace mode saved with the group is restored.
+ #[inline(never)]
+ fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
+ use self::GroupState::*;
+
+ assert_eq!(self.char(), ')');
+ let mut stack = self.parser().stack_group.borrow_mut();
+ let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
+ .pop()
+ {
+ Some(Group { concat, group, ignore_whitespace }) => {
+ (concat, group, ignore_whitespace, None)
+ }
+ Some(Alternation(alt)) => match stack.pop() { // an alternation must sit directly above its group
+ Some(Group { concat, group, ignore_whitespace }) => {
+ (concat, group, ignore_whitespace, Some(alt))
+ }
+ None | Some(Alternation(_)) => {
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::GroupUnopened,
+ ));
+ }
+ },
+ None => {
+ return Err(self
+ .error(self.span_char(), ast::ErrorKind::GroupUnopened));
+ }
+ };
+ self.parser().ignore_whitespace.set(ignore_whitespace); // restore the mode stored in the stack entry
+ group_concat.span.end = self.pos(); // the sub-expression ends at the `)`, exclusive
+ self.bump();
+ group.span.end = self.pos(); // the group's own span includes the `)`
+ match alt {
+ Some(mut alt) => {
+ alt.span.end = group_concat.span.end;
+ alt.asts.push(group_concat.into_ast()); // the concat is the alternation's final branch
+ group.ast = Box::new(alt.into_ast());
+ }
+ None => {
+ group.ast = Box::new(group_concat.into_ast());
+ }
+ }
+ prior_concat.asts.push(Ast::Group(group));
+ Ok(prior_concat)
+ }
+
+ /// Finish parsing by folding the given concatenation into whatever is
+ /// left on the parser's group stack. The stack must be empty or hold a
+ /// single alternation; a group still on the stack yields an unclosed
+ /// group error whose span is that group's span.
+ ///
+ /// This assumes that the parser has advanced to the end. The
+ /// concatenation's span (and the alternation's, if any) is closed at the
+ /// current position.
+ #[inline(never)]
+ fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
+     concat.span.end = self.pos();
+     let mut stack = self.parser().stack_group.borrow_mut();
+     let ast = match stack.pop() {
+         None => concat.into_ast(),
+         Some(GroupState::Alternation(mut alt)) => {
+             alt.span.end = self.pos();
+             alt.asts.push(concat.into_ast());
+             Ast::Alternation(alt)
+         }
+         Some(GroupState::Group { group, .. }) => {
+             return Err(
+                 self.error(group.span, ast::ErrorKind::GroupUnclosed)
+             );
+         }
+     };
+     // Anything still on the stack at this point means a group was never
+     // closed.
+     match stack.pop() {
+         None => Ok(ast),
+         Some(GroupState::Group { group, .. }) => {
+             Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
+         }
+         Some(GroupState::Alternation(_)) => {
+             // Two adjacent `GroupState::Alternation`s cannot occur: an
+             // alternation is never pushed when one is already on top of
+             // the stack.
+             unreachable!()
+         }
+     }
+ }
+
+ /// Parse the opening of a character class and save the enclosing class
+ /// context on the parser's class stack. The parser must be positioned at
+ /// an opening `[`. `parent_union` is the union of set items built up
+ /// before the `[` was seen; it is stored alongside the newly opened set.
+ ///
+ /// Any error from parsing the opening of the class is returned.
+ /// Otherwise, a new union of set items for the class is returned (which
+ /// may be populated with either a `]` or a `-`).
+ #[inline(never)]
+ fn push_class_open(
+     &self,
+     parent_union: ast::ClassSetUnion,
+ ) -> Result<ast::ClassSetUnion> {
+     assert_eq!(self.char(), '[');
+
+     let (set, union) = self.parse_set_class_open()?;
+     let suspended = ClassState::Open { union: parent_union, set };
+     self.parser().stack_class.borrow_mut().push(suspended);
+     Ok(union)
+ }
+
+ /// Close the character class set that is currently open and pop it from
+ /// the class stack. `nested_union` is the last union built before the
+ /// closing `]` was seen. When the class was nested, the parent's union
+ /// is returned with the finished class added to it.
+ ///
+ /// The parser must be positioned at a `]`. It is advanced to the
+ /// position immediately following the `]`.
+ ///
+ /// If the stack is empty after popping, then this returns the final
+ /// "top-level" character class AST (where a "top-level" character class
+ /// is one that is not nested inside any other character class).
+ ///
+ /// # Panics
+ ///
+ /// Panics if the class stack holds no matching open bracket; see the
+ /// comments in the body for why that is not expected to happen.
+ #[inline(never)]
+ fn pop_class(
+     &self,
+     nested_union: ast::ClassSetUnion,
+ ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
+     assert_eq!(self.char(), ']');
+
+     let item = ast::ClassSet::Item(nested_union.into_item());
+     let prevset = self.pop_class_op(item);
+     let mut stack = self.parser().stack_class.borrow_mut();
+     let (mut union, mut set) = match stack.pop() {
+         Some(ClassState::Open { union, set }) => (union, set),
+         Some(ClassState::Op { .. }) => {
+             // Any pending Op was already popped by `pop_class_op` above,
+             // and the stack never holds consecutive Op states: every
+             // push is guarded by a check for an existing Op on top,
+             // which is modified instead.
+             panic!("unexpected ClassState::Op")
+         }
+         None => {
+             // An empty stack is never observed here: the class parser
+             // only starts on a `[`, and once a `]` empties the stack the
+             // class parser is told to terminate.
+             panic!("unexpected empty character class stack")
+         }
+     };
+     self.bump();
+     set.span.end = self.pos();
+     set.kind = prevset;
+     if stack.is_empty() {
+         return Ok(Either::Right(ast::Class::Bracketed(set)));
+     }
+     union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
+     Ok(Either::Left(union))
+ }
+
+ /// Return an "unclosed class" error whose span points to the most
+ /// recently opened class.
+ ///
+ /// This should only be called while parsing a character class.
+ #[inline(never)]
+ fn unclosed_class_error(&self) -> ast::Error {
+ for state in self.parser().stack_class.borrow().iter().rev() {
+ if let ClassState::Open { ref set, .. } = *state {
+ return self.error(set.span, ast::ErrorKind::ClassUnclosed);
+ }
+ }
+ // We are guaranteed to have a non-empty stack with at least
+ // one open bracket, so we should never get here.
+ panic!("no open character class found")
+ }
+
+ /// Store the set items gathered so far on the class stack as the left
+ /// hand side of the given binary operator. If an operator is already
+ /// pending on top of the stack, it is completed first (via
+ /// `pop_class_op`) and its result becomes the new left hand side.
+ ///
+ /// Returns a fresh, empty set union positioned at the parser's current
+ /// location, to be used for building the operator's right hand side.
+ #[inline(never)]
+ fn push_class_op(
+     &self,
+     next_kind: ast::ClassSetBinaryOpKind,
+     next_union: ast::ClassSetUnion,
+ ) -> ast::ClassSetUnion {
+     let pending = ast::ClassSet::Item(next_union.into_item());
+     let lhs = self.pop_class_op(pending);
+     let op_state = ClassState::Op { kind: next_kind, lhs };
+     self.parser().stack_class.borrow_mut().push(op_state);
+     ast::ClassSetUnion { span: self.span(), items: vec![] }
+ }
+
+ /// Complete a pending binary set operation, if there is one.
+ ///
+ /// If the top of the class stack is an open class (not an operation),
+ /// the stack is left as it was and `rhs` is returned unchanged. If the
+ /// top is an operation, it is popped and combined with `rhs` as its
+ /// right hand side; the resulting binary operation, spanning from the
+ /// start of the lhs to the end of the rhs, is returned as a set.
+ ///
+ /// The class stack must not be empty.
+ #[inline(never)]
+ fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
+     let mut stack = self.parser().stack_class.borrow_mut();
+     match stack.pop() {
+         None => unreachable!(),
+         Some(open @ ClassState::Open { .. }) => {
+             // Not an operation: put the state back untouched.
+             stack.push(open);
+             rhs
+         }
+         Some(ClassState::Op { kind, lhs }) => {
+             let span = Span::new(lhs.span().start, rhs.span().end);
+             ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
+                 span,
+                 kind,
+                 lhs: Box::new(lhs),
+                 rhs: Box::new(rhs),
+             })
+         }
+     }
+ }
+}
+
+impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
+ /// Parse the regular expression into an abstract syntax tree, discarding
+ /// any comments collected along the way.
+ fn parse(&self) -> Result<Ast> {
+     let with_comments = self.parse_with_comments()?;
+     Ok(with_comments.ast)
+ }
+
+ /// Parse the regular expression and return its abstract syntax tree
+ /// together with all of the comments found in the pattern.
+ ///
+ /// The main loop skips ignorable whitespace, then dispatches on the
+ /// current character: group and alternation punctuation, a bracketed
+ /// class, a repetition operator, or otherwise a primitive. After the
+ /// loop, the group stack is folded into the final AST and the nesting
+ /// limit is checked.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the parser's offset is not 0 on entry.
+ fn parse_with_comments(&self) -> Result<ast::WithComments> {
+     assert_eq!(self.offset(), 0, "parser can only be used once");
+     self.parser().reset();
+     let mut concat = ast::Concat { span: self.span(), asts: vec![] };
+     loop {
+         self.bump_space();
+         if self.is_eof() {
+             break;
+         }
+         concat = match self.char() {
+             '(' => self.push_group(concat)?,
+             ')' => self.pop_group(concat)?,
+             '|' => self.push_alternate(concat)?,
+             '[' => {
+                 let class = self.parse_set_class()?;
+                 concat.asts.push(Ast::Class(class));
+                 concat
+             }
+             '?' => self.parse_uncounted_repetition(
+                 concat,
+                 ast::RepetitionKind::ZeroOrOne,
+             )?,
+             '*' => self.parse_uncounted_repetition(
+                 concat,
+                 ast::RepetitionKind::ZeroOrMore,
+             )?,
+             '+' => self.parse_uncounted_repetition(
+                 concat,
+                 ast::RepetitionKind::OneOrMore,
+             )?,
+             '{' => self.parse_counted_repetition(concat)?,
+             _ => {
+                 let primitive = self.parse_primitive()?;
+                 concat.asts.push(primitive.into_ast());
+                 concat
+             }
+         };
+     }
+     let ast = self.pop_group_end(concat)?;
+     NestLimiter::new(self).check(&ast)?;
+     // Hand the collected comments to the caller, leaving an empty list
+     // behind in the parser.
+     let comments =
+         mem::replace(&mut *self.parser().comments.borrow_mut(), vec![]);
+     Ok(ast::WithComments { ast, comments })
+ }
+
+ /// Parse one of the uncounted repetition operators `?`, `*` or `+` (the
+ /// `{m,n}` syntax is handled elsewhere). `kind` must be the repetition
+ /// kind matching the operator the caller observed.
+ ///
+ /// The parser must be positioned at the operator. It is advanced to the
+ /// first character after the operator, including a single trailing `?`
+ /// if present, which makes the operator ungreedy.
+ ///
+ /// `concat` is the concatenation being built. Its last expression is
+ /// removed, wrapped in the repetition, and pushed back; the updated
+ /// concatenation is returned.
+ ///
+ /// # Errors
+ ///
+ /// Returns a "repetition missing" error if the concatenation is empty
+ /// or if its last expression is empty or a set of flags.
+ #[inline(never)]
+ fn parse_uncounted_repetition(
+     &self,
+     mut concat: ast::Concat,
+     kind: ast::RepetitionKind,
+ ) -> Result<ast::Concat> {
+     assert!(
+         self.char() == '?' || self.char() == '*' || self.char() == '+'
+     );
+     let op_start = self.pos();
+     let ast = match concat.asts.pop() {
+         None | Some(Ast::Empty(_)) | Some(Ast::Flags(_)) => {
+             return Err(
+                 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
+             );
+         }
+         Some(operand) => operand,
+     };
+     // Step over the operator, then over a `?` suffix if one follows.
+     let greedy = !(self.bump() && self.char() == '?');
+     if !greedy {
+         self.bump();
+     }
+     let end = self.pos();
+     let span = ast.span().with_end(end);
+     let op = ast::RepetitionOp { span: Span::new(op_start, end), kind };
+     concat.asts.push(Ast::Repetition(ast::Repetition {
+         span,
+         op,
+         greedy,
+         ast: Box::new(ast),
+     }));
+     Ok(concat)
+ }
+
+ /// Parse a counted repetition operator, i.e. the `{m}`, `{m,}` and
+ /// `{m,n}` syntax (the `?`, `*` and `+` operators are handled
+ /// elsewhere).
+ ///
+ /// The parser must be positioned at the opening `{`. It is advanced to
+ /// the first character after the operator, including a single trailing
+ /// `?` if present, which makes the operator ungreedy.
+ ///
+ /// `concat` is the concatenation being built. Its last expression is
+ /// removed, wrapped in the repetition, and pushed back; the updated
+ /// concatenation is returned.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error if there is no expression to repeat, if the pattern
+ /// ends or a `}` is missing where one is required, if a count is
+ /// missing or cannot be parsed, or if the resulting range is invalid.
+ #[inline(never)]
+ fn parse_counted_repetition(
+     &self,
+     mut concat: ast::Concat,
+ ) -> Result<ast::Concat> {
+     assert!(self.char() == '{');
+     let start = self.pos();
+     let ast = match concat.asts.pop() {
+         None | Some(Ast::Empty(_)) | Some(Ast::Flags(_)) => {
+             return Err(
+                 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
+             );
+         }
+         Some(operand) => operand,
+     };
+     // Every "unclosed" error spans from the `{` to wherever the parser
+     // is at the moment the error is raised.
+     let unclosed = || {
+         self.error(
+             Span::new(start, self.pos()),
+             ast::ErrorKind::RepetitionCountUnclosed,
+         )
+     };
+     let parse_count = || {
+         specialize_err(
+             self.parse_decimal(),
+             ast::ErrorKind::DecimalEmpty,
+             ast::ErrorKind::RepetitionCountDecimalEmpty,
+         )
+     };
+
+     if !self.bump_and_bump_space() {
+         return Err(unclosed());
+     }
+     let count_start = parse_count()?;
+     if self.is_eof() {
+         return Err(unclosed());
+     }
+     let range = if self.char() == ',' {
+         if !self.bump_and_bump_space() {
+             return Err(unclosed());
+         }
+         if self.char() == '}' {
+             ast::RepetitionRange::AtLeast(count_start)
+         } else {
+             let count_end = parse_count()?;
+             ast::RepetitionRange::Bounded(count_start, count_end)
+         }
+     } else {
+         ast::RepetitionRange::Exactly(count_start)
+     };
+     if self.is_eof() || self.char() != '}' {
+         return Err(unclosed());
+     }
+
+     // Step over the `}`, then over a `?` suffix if one follows.
+     let greedy = !(self.bump_and_bump_space() && self.char() == '?');
+     if !greedy {
+         self.bump();
+     }
+
+     let op_span = Span::new(start, self.pos());
+     if !range.is_valid() {
+         return Err(
+             self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
+         );
+     }
+     let span = ast.span().with_end(self.pos());
+     concat.asts.push(Ast::Repetition(ast::Repetition {
+         span,
+         op: ast::RepetitionOp {
+             span: op_span,
+             kind: ast::RepetitionKind::Range(range),
+         },
+         greedy,
+         ast: Box::new(ast),
+     }));
+     Ok(concat)
+ }
+
+ /// Parse a group (which contains a sub-expression) or a set of flags.
+ ///
+ /// If a group was found, then it is returned with an empty AST and a span covering only the `(`. If a set
+ /// of flags is found, then that set is returned.
+ ///
+ /// The parser should be positioned at the opening parenthesis.
+ ///
+ /// This advances the parser past the group opener (`(`, `(?P<name>` or
+ /// `(?flags:`) in the case of a group, or past the closing parenthesis
+ /// that ends a bare set of flags.
+ ///
+ /// # Errors
+ ///
+ /// If flags are given and incorrectly specified, then a corresponding
+ /// error is returned. A look-around prefix is rejected as unsupported, and `(?)` as a repetition missing its operand.
+ ///
+ /// If a capture name is given and it is incorrectly specified, then a
+ /// corresponding error is returned.
+ #[inline(never)]
+ fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
+ assert_eq!(self.char(), '(');
+ let open_span = self.span_char(); // covers just the `(`
+ self.bump();
+ self.bump_space();
+ if self.is_lookaround_prefix() {
+ return Err(self.error(
+ Span::new(open_span.start, self.span().end),
+ ast::ErrorKind::UnsupportedLookAround,
+ ));
+ }
+ let inner_span = self.span(); // position right after `(`, used to report `(?)`
+ if self.bump_if("?P<") {
+ let capture_index = self.next_capture_index(open_span)?;
+ let cap = self.parse_capture_name(capture_index)?;
+ Ok(Either::Right(ast::Group {
+ span: open_span,
+ kind: ast::GroupKind::CaptureName(cap),
+ ast: Box::new(Ast::Empty(self.span())),
+ }))
+ } else if self.bump_if("?") {
+ if self.is_eof() {
+ return Err(
+ self.error(open_span, ast::ErrorKind::GroupUnclosed)
+ );
+ }
+ let flags = self.parse_flags()?;
+ let char_end = self.char(); // parse_flags stops on `:` or `)`
+ self.bump();
+ if char_end == ')' {
+ // We don't allow empty flags, e.g., `(?)`. We instead
+ // interpret it as a repetition operator missing its argument.
+ if flags.items.is_empty() {
+ return Err(self.error(
+ inner_span,
+ ast::ErrorKind::RepetitionMissing,
+ ));
+ }
+ Ok(Either::Left(ast::SetFlags {
+ span: Span { end: self.pos(), ..open_span },
+ flags,
+ }))
+ } else {
+ assert_eq!(char_end, ':');
+ Ok(Either::Right(ast::Group {
+ span: open_span,
+ kind: ast::GroupKind::NonCapturing(flags),
+ ast: Box::new(Ast::Empty(self.span())),
+ }))
+ }
+ } else {
+ let capture_index = self.next_capture_index(open_span)?;
+ Ok(Either::Right(ast::Group {
+ span: open_span,
+ kind: ast::GroupKind::CaptureIndex(capture_index),
+ ast: Box::new(Ast::Empty(self.span())),
+ }))
+ }
+ }
+
+ /// Parses a capture group name. Assumes that the parser is positioned at
+ /// the first character in the name following the opening `<` (and may
+ /// possibly be EOF). This advances the parser to the first character
+ /// following the closing `>`. Errors are returned for EOF before `>`, an invalid character, or an empty name.
+ ///
+ /// The caller must provide the capture index of the group for this name. The name is also registered via `add_capture_name`.
+ #[inline(never)]
+ fn parse_capture_name(
+ &self,
+ capture_index: u32,
+ ) -> Result<ast::CaptureName> {
+ if self.is_eof() {
+ return Err(self
+ .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
+ }
+ let start = self.pos();
+ loop {
+ if self.char() == '>' {
+ break;
+ }
+ if !is_capture_char(self.char(), self.pos() == start) { // second argument is true only for the name's first character
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::GroupNameInvalid,
+ ));
+ }
+ if !self.bump() { // EOF reached before `>`; reported just below
+ break;
+ }
+ }
+ let end = self.pos(); // position of the closing `>`
+ if self.is_eof() {
+ return Err(self
+ .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
+ }
+ assert_eq!(self.char(), '>');
+ self.bump();
+ let name = &self.pattern()[start.offset..end.offset];
+ if name.is_empty() {
+ return Err(self.error(
+ Span::new(start, start),
+ ast::ErrorKind::GroupNameEmpty,
+ ));
+ }
+ let capname = ast::CaptureName {
+ span: Span::new(start, end),
+ name: name.to_string(),
+ index: capture_index,
+ };
+ self.add_capture_name(&capname)?;
+ Ok(capname)
+ }
+
+ /// Parse a sequence of flags starting at the current character.
+ ///
+ /// This advances the parser to the character immediately following the
+ /// flags, which is guaranteed to be either `:` or `)`.
+ ///
+ /// # Errors
+ ///
+ /// If any flags are duplicated, then an error is returned.
+ ///
+ /// If the negation operator is used more than once, then an error is
+ /// returned.
+ ///
+ /// If the negation operator is not followed by any flags, if a flag is unrecognized, or if the
+ /// pattern ends before `:` or `)`, then an error is returned. An empty flag list is returned as-is, without error.
+ #[inline(never)]
+ fn parse_flags(&self) -> Result<ast::Flags> {
+ let mut flags = ast::Flags { span: self.span(), items: vec![] };
+ let mut last_was_negation = None; // span of a `-` that no flag has followed yet
+ while self.char() != ':' && self.char() != ')' {
+ if self.char() == '-' {
+ last_was_negation = Some(self.span_char());
+ let item = ast::FlagsItem {
+ span: self.span_char(),
+ kind: ast::FlagsItemKind::Negation,
+ };
+ if let Some(i) = flags.add_item(item) { // `i` indexes the earlier, conflicting item
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::FlagRepeatedNegation {
+ original: flags.items[i].span,
+ },
+ ));
+ }
+ } else {
+ last_was_negation = None;
+ let item = ast::FlagsItem {
+ span: self.span_char(),
+ kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
+ };
+ if let Some(i) = flags.add_item(item) {
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::FlagDuplicate {
+ original: flags.items[i].span,
+ },
+ ));
+ }
+ }
+ if !self.bump() {
+ return Err(
+ self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
+ );
+ }
+ }
+ if let Some(span) = last_was_negation {
+ return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
+ }
+ flags.span.end = self.pos();
+ Ok(flags)
+ }
+
+ /// Interpret the current character as a single flag. The parser is not
+ /// advanced.
+ ///
+ /// # Errors
+ ///
+ /// Returns a "flag unrecognized" error, spanning the current character,
+ /// if it is not one of `i`, `m`, `s`, `U`, `u` or `x`.
+ #[inline(never)]
+ fn parse_flag(&self) -> Result<ast::Flag> {
+     let flag = match self.char() {
+         'i' => ast::Flag::CaseInsensitive,
+         'm' => ast::Flag::MultiLine,
+         's' => ast::Flag::DotMatchesNewLine,
+         'U' => ast::Flag::SwapGreed,
+         'u' => ast::Flag::Unicode,
+         'x' => ast::Flag::IgnoreWhitespace,
+         _ => {
+             return Err(self
+                 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized));
+         }
+     };
+     Ok(flag)
+ }
+
+ /// Parse a primitive AST, e.g. a literal, a non-set character class or
+ /// an assertion.
+ ///
+ /// The caller is expected to have handled every non-primitive case
+ /// already. Any character other than `\`, `.`, `^` and `$` is therefore
+ /// taken as a verbatim literal; for example, if the parser is at `|`,
+ /// then `|` is treated as a literal (e.g., inside a character class).
+ ///
+ /// The parser is advanced to the first character immediately following
+ /// the primitive.
+ fn parse_primitive(&self) -> Result<Primitive> {
+     let c = self.char();
+     if c == '\\' {
+         return self.parse_escape();
+     }
+     // Every remaining primitive is exactly one character wide.
+     let span = self.span_char();
+     self.bump();
+     let primitive = match c {
+         '.' => Primitive::Dot(span),
+         '^' => Primitive::Assertion(ast::Assertion {
+             span,
+             kind: ast::AssertionKind::StartLine,
+         }),
+         '$' => Primitive::Assertion(ast::Assertion {
+             span,
+             kind: ast::AssertionKind::EndLine,
+         }),
+         _ => Primitive::Literal(ast::Literal {
+             span,
+             kind: ast::LiteralKind::Verbatim,
+             c,
+         }),
+     };
+     Ok(primitive)
+ }
+
+ /// Parse an escape sequence as a primitive AST.
+ ///
+ /// This assumes the parser is positioned at the start of the escape
+ /// sequence, i.e., `\`. It advances the parser to the first position
+ /// immediately following the escape sequence. Spans of the results start at the `\`.
+ #[inline(never)]
+ fn parse_escape(&self) -> Result<Primitive> {
+ assert_eq!(self.char(), '\\');
+ let start = self.pos();
+ if !self.bump() {
+ return Err(self.error(
+ Span::new(start, self.pos()),
+ ast::ErrorKind::EscapeUnexpectedEof,
+ ));
+ }
+ let c = self.char();
+ // Put some of the more complicated routines into helpers.
+ match c {
+ '0'..='7' => {
+ if !self.parser().octal { // with octal disabled, `\N` is reported as an unsupported backreference
+ return Err(self.error(
+ Span::new(start, self.span_char().end),
+ ast::ErrorKind::UnsupportedBackreference,
+ ));
+ }
+ let mut lit = self.parse_octal();
+ lit.span.start = start; // widen the helper's span to include the `\`
+ return Ok(Primitive::Literal(lit));
+ }
+ '8'..='9' if !self.parser().octal => {
+ return Err(self.error(
+ Span::new(start, self.span_char().end),
+ ast::ErrorKind::UnsupportedBackreference,
+ ));
+ }
+ 'x' | 'u' | 'U' => {
+ let mut lit = self.parse_hex()?;
+ lit.span.start = start;
+ return Ok(Primitive::Literal(lit));
+ }
+ 'p' | 'P' => {
+ let mut cls = self.parse_unicode_class()?;
+ cls.span.start = start;
+ return Ok(Primitive::Unicode(cls));
+ }
+ 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
+ let mut cls = self.parse_perl_class();
+ cls.span.start = start;
+ return Ok(Primitive::Perl(cls));
+ }
+ _ => {}
+ }
+
+ // Handle all of the one letter sequences inline.
+ self.bump(); // step past the escaped character itself
+ let span = Span::new(start, self.pos());
+ if is_meta_character(c) {
+ return Ok(Primitive::Literal(ast::Literal {
+ span,
+ kind: ast::LiteralKind::Punctuation,
+ c,
+ }));
+ }
+ let special = |kind, c| { // builds a special literal of the given kind that matches the character `c`
+ Ok(Primitive::Literal(ast::Literal {
+ span,
+ kind: ast::LiteralKind::Special(kind),
+ c,
+ }))
+ };
+ match c {
+ 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
+ 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
+ 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
+ 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
+ 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
+ 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
+ ' ' if self.ignore_whitespace() => { // an escaped space is only accepted when whitespace is being ignored
+ special(ast::SpecialLiteralKind::Space, ' ')
+ }
+ 'A' => Ok(Primitive::Assertion(ast::Assertion {
+ span,
+ kind: ast::AssertionKind::StartText,
+ })),
+ 'z' => Ok(Primitive::Assertion(ast::Assertion {
+ span,
+ kind: ast::AssertionKind::EndText,
+ })),
+ 'b' => Ok(Primitive::Assertion(ast::Assertion {
+ span,
+ kind: ast::AssertionKind::WordBoundary,
+ })),
+ 'B' => Ok(Primitive::Assertion(ast::Assertion {
+ span,
+ kind: ast::AssertionKind::NotWordBoundary,
+ })),
+ _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
+ }
+ }
+
+ /// Parse an octal representation of a Unicode codepoint that is at most
+ /// 3 digits long. The parser must be positioned at the first octal
+ /// digit, and parsing of octal escapes must be enabled. The parser is
+ /// advanced to the first character immediately following the octal
+ /// number.
+ ///
+ /// The returned literal's span covers only the digits.
+ ///
+ /// Assuming the preconditions are met, this routine can never fail.
+ #[inline(never)]
+ fn parse_octal(&self) -> ast::Literal {
+     use std::char;
+     use std::u32;
+
+     assert!(self.parser().octal);
+     assert!('0' <= self.char() && self.char() <= '7');
+     let start = self.pos();
+     // The first digit is under the cursor; accept up to two more.
+     loop {
+         if !self.bump() {
+             break;
+         }
+         let c = self.char();
+         let is_octal_digit = '0' <= c && c <= '7';
+         if !is_octal_digit || self.pos().offset - start.offset > 2 {
+             break;
+         }
+     }
+     let end = self.pos();
+     let digits = &self.pattern()[start.offset..end.offset];
+     // The loop above only accepts octal digits, so this conversion
+     // should never fail.
+     let codepoint =
+         u32::from_str_radix(digits, 8).expect("valid octal number");
+     // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
+     // invalid Unicode scalar values.
+     let c = char::from_u32(codepoint).expect("Unicode scalar value");
+     let span = Span::new(start, end);
+     ast::Literal { span, kind: ast::LiteralKind::Octal, c }
+ }
+
+ /// Parse a hex representation of a Unicode codepoint, in either the
+ /// fixed-width notation (e.g. `\xFF`) or the braced notation (e.g.
+ /// `\x{FFFF}`). The parser must be positioned at the `x`, `u` or `U`
+ /// prefix, which selects the kind of hex literal. The parser is advanced
+ /// to the first character immediately following the hexadecimal literal.
+ ///
+ /// # Errors
+ ///
+ /// Returns an "unexpected EOF" error if nothing follows the prefix, and
+ /// otherwise whatever error the braced or fixed-width helper reports.
+ #[inline(never)]
+ fn parse_hex(&self) -> Result<ast::Literal> {
+     assert!(
+         self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
+     );
+
+     let prefix = self.char();
+     let hex_kind = if prefix == 'x' {
+         ast::HexLiteralKind::X
+     } else if prefix == 'u' {
+         ast::HexLiteralKind::UnicodeShort
+     } else {
+         ast::HexLiteralKind::UnicodeLong
+     };
+     if !self.bump_and_bump_space() {
+         return Err(
+             self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
+         );
+     }
+     match self.char() {
+         '{' => self.parse_hex_brace(hex_kind),
+         _ => self.parse_hex_digits(hex_kind),
+     }
+ }
+
+ /// Parse an N-digit hex representation of a Unicode codepoint. This
+ /// expects the parser to be positioned at the first digit and will advance
+ /// the parser to the first character immediately following the escape
+ /// sequence.
+ ///
+ /// The number of digits read is `kind.digits()`; per the escape forms, presumably 2 (for `\xNN`),
+ /// 4 (for `\uNNNN`) or 8 (for `\UNNNNNNNN`). Fails on EOF, a non-hex digit, or an invalid scalar value.
+ #[inline(never)]
+ fn parse_hex_digits(
+ &self,
+ kind: ast::HexLiteralKind,
+ ) -> Result<ast::Literal> {
+ use std::char;
+ use std::u32;
+
+ let mut scratch = self.parser().scratch.borrow_mut(); // shared buffer that accumulates the digits
+ scratch.clear();
+
+ let start = self.pos();
+ for i in 0..kind.digits() {
+ if i > 0 && !self.bump_and_bump_space() { // the first digit is already under the cursor
+ return Err(self
+ .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
+ }
+ if !is_hex(self.char()) {
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::EscapeHexInvalidDigit,
+ ));
+ }
+ scratch.push(self.char());
+ }
+ // The final bump just moves the parser past the literal, which may
+ // be EOF.
+ self.bump_and_bump_space();
+ let end = self.pos(); // taken after the final bump_and_bump_space
+ let hex = scratch.as_str();
+ match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
+ None => Err(self.error(
+ Span::new(start, end),
+ ast::ErrorKind::EscapeHexInvalid,
+ )),
+ Some(c) => Ok(ast::Literal {
+ span: Span::new(start, end),
+ kind: ast::LiteralKind::HexFixed(kind),
+ c,
+ }),
+ }
+ }
+
+ /// Parse a hex representation of any Unicode scalar value. This expects
+ /// the parser to be positioned at the opening brace `{` and will advance
+ /// the parser to the first character following the closing brace `}`. Fails on EOF, a non-hex digit, empty braces, or an invalid scalar value.
+ #[inline(never)]
+ fn parse_hex_brace(
+ &self,
+ kind: ast::HexLiteralKind,
+ ) -> Result<ast::Literal> {
+ use std::char;
+ use std::u32;
+
+ let mut scratch = self.parser().scratch.borrow_mut(); // shared buffer that accumulates the digits
+ scratch.clear();
+
+ let brace_pos = self.pos();
+ let start = self.span_char().end; // position just after the `{`
+ while self.bump_and_bump_space() && self.char() != '}' {
+ if !is_hex(self.char()) {
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::EscapeHexInvalidDigit,
+ ));
+ }
+ scratch.push(self.char());
+ }
+ if self.is_eof() {
+ return Err(self.error(
+ Span::new(brace_pos, self.pos()),
+ ast::ErrorKind::EscapeUnexpectedEof,
+ ));
+ }
+ let end = self.pos(); // position of the closing `}`
+ let hex = scratch.as_str();
+ assert_eq!(self.char(), '}');
+ self.bump_and_bump_space();
+
+ if hex.is_empty() {
+ return Err(self.error(
+ Span::new(brace_pos, self.pos()),
+ ast::ErrorKind::EscapeHexEmpty,
+ ));
+ }
+ match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
+ None => Err(self.error(
+ Span::new(start, end),
+ ast::ErrorKind::EscapeHexInvalid,
+ )),
+ Some(c) => Ok(ast::Literal {
+ span: Span::new(start, self.pos()), // starts after `{`; parse_escape resets the start to the `\`
+ kind: ast::LiteralKind::HexBrace(kind),
+ c,
+ }),
+ }
+ }
+
+ /// Parse a decimal number into a u32 while trimming leading and trailing
+ /// whitespace.
+ ///
+ /// This expects the parser to be positioned at the first position where
+ /// a decimal digit (or whitespace before it) could occur. This will advance
+ /// the parser past the digits and past any whitespace that follows them.
+ ///
+ /// If no decimal digit could be found (`DecimalEmpty`) or the digits cannot
+ /// be parsed into a u32 (`DecimalInvalid`), then an error is returned.
+ fn parse_decimal(&self) -> Result<u32> {
+ let mut scratch = self.parser().scratch.borrow_mut(); // shared buffer, reset before use
+ scratch.clear();
+
+ while !self.is_eof() && self.char().is_whitespace() { // skip leading whitespace
+ self.bump();
+ }
+ let start = self.pos();
+ while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { // ASCII digits only
+ scratch.push(self.char());
+ self.bump_and_bump_space();
+ }
+ let span = Span::new(start, self.pos()); // captured before trailing whitespace is skipped
+ while !self.is_eof() && self.char().is_whitespace() { // skip trailing whitespace
+ self.bump_and_bump_space();
+ }
+ let digits = scratch.as_str();
+ if digits.is_empty() {
+ return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
+ }
+ match u32::from_str_radix(digits, 10).ok() {
+ Some(n) => Ok(n),
+ None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
+ }
+ }
+
+ /// Parse a standard bracketed character class made up of characters or
+ /// character ranges, which can also contain nested character classes of
+ /// any type (sans `.`) and the binary set operators `&&`, `--` and `~~`.
+ ///
+ /// This assumes the parser is positioned at the opening `[`. If parsing
+ /// is successful, then the parser is advanced to the position immediately
+ /// following the closing `]`. Hitting EOF first is an unclosed-class error.
+ #[inline(never)]
+ fn parse_set_class(&self) -> Result<ast::Class> {
+ assert_eq!(self.char(), '[');
+
+ let mut union = // items collected so far at the current nesting level
+ ast::ClassSetUnion { span: self.span(), items: vec![] };
+ loop {
+ self.bump_space();
+ if self.is_eof() {
+ return Err(self.unclosed_class_error());
+ }
+ match self.char() {
+ '[' => {
+ // If we've already parsed the opening bracket, then
+ // attempt to treat this as the beginning of an ASCII
+ // class. If ASCII class parsing fails, then the parser
+ // backs up to `[`.
+ if !self.parser().stack_class.borrow().is_empty() {
+ if let Some(cls) = self.maybe_parse_ascii_class() {
+ union.push(ast::ClassSetItem::Ascii(cls));
+ continue;
+ }
+ }
+ union = self.push_class_open(union)?; // otherwise open a (possibly nested) class
+ }
+ ']' => match self.pop_class(union)? {
+ Either::Left(nested_union) => { // a nested class closed; keep parsing with the returned union
+ union = nested_union;
+ }
+ Either::Right(class) => return Ok(class), // a finished class came back; parsing is done
+ },
+ '&' if self.peek() == Some('&') => {
+ assert!(self.bump_if("&&"));
+ union = self.push_class_op(
+ ast::ClassSetBinaryOpKind::Intersection,
+ union,
+ );
+ }
+ '-' if self.peek() == Some('-') => {
+ assert!(self.bump_if("--"));
+ union = self.push_class_op(
+ ast::ClassSetBinaryOpKind::Difference,
+ union,
+ );
+ }
+ '~' if self.peek() == Some('~') => {
+ assert!(self.bump_if("~~"));
+ union = self.push_class_op(
+ ast::ClassSetBinaryOpKind::SymmetricDifference,
+ union,
+ );
+ }
+ _ => { // anything else is a literal, a range or an escape
+ union.push(self.parse_set_class_range()?);
+ }
+ }
+ }
+ }
+
+ /// Parse a single primitive item in a character class set. The item to
+ /// be parsed can either be one of a simple literal character, a range
+ /// between two simple literal characters or a "primitive" character
+ /// class like \w or \p{Greek}.
+ ///
+ /// If an invalid escape is found, or if a character class is found where
+ /// a simple literal is expected (e.g., in a range), or if the range is not
+ /// valid, or if EOF is reached inside the class, then an error is returned.
+ #[inline(never)]
+ fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
+ let prim1 = self.parse_set_class_item()?;
+ self.bump_space();
+ if self.is_eof() {
+ return Err(self.unclosed_class_error());
+ }
+ // If the next char isn't a `-`, then we don't have a range.
+ // There are two exceptions. If the char after a `-` is a `]`, then
+ // `-` is interpreted as a literal `-`. Alternatively, if the char
+ // after a `-` is a `-`, then `--` corresponds to a "difference"
+ // operation. In all three cases only the first primitive is returned.
+ if self.char() != '-'
+ || self.peek_space() == Some(']')
+ || self.peek_space() == Some('-')
+ {
+ return prim1.into_class_set_item(self);
+ }
+ // OK, now we're parsing a range, so bump past the `-` and parse the
+ // second half of the range.
+ if !self.bump_and_bump_space() {
+ return Err(self.unclosed_class_error());
+ }
+ let prim2 = self.parse_set_class_item()?;
+ let range = ast::ClassSetRange {
+ span: Span::new(prim1.span().start, prim2.span().end), // covers both endpoints
+ start: prim1.into_class_literal(self)?, // fails if the endpoint is not a literal
+ end: prim2.into_class_literal(self)?,
+ };
+ if !range.is_valid() {
+ return Err(
+ self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
+ );
+ }
+ Ok(ast::ClassSetItem::Range(range))
+ }
+
+ /// Parse a single item in a character class as a primitive, where the
+ /// primitive either consists of a verbatim literal or a single escape
+ /// sequence (anything starting with a backslash goes to `parse_escape`).
+ ///
+ /// This assumes the parser is positioned at the beginning of a primitive,
+ /// and advances the parser to the first position after the primitive if
+ /// successful.
+ ///
+ /// Note that it is the caller's responsibility to report an error if an
+ /// illegal primitive was parsed.
+ #[inline(never)]
+ fn parse_set_class_item(&self) -> Result<Primitive> {
+ if self.char() == '\\' {
+ self.parse_escape()
+ } else {
+ let x = Primitive::Literal(ast::Literal { // built before the bump so span and char refer to the current position
+ span: self.span_char(),
+ kind: ast::LiteralKind::Verbatim,
+ c: self.char(),
+ });
+ self.bump();
+ Ok(x)
+ }
+ }
+
+ /// Parses the opening of a character class set. This includes the opening
+ /// bracket along with `^` if present to indicate negation. This also
+ /// starts parsing the opening set of unioned items if applicable, since
+ /// there are special rules applied to certain characters in the opening
+ /// of a character class. For example, `[^]]` is the class of all
+ /// characters not equal to `]`. (`]` would need to be escaped in any other
+ /// position.) Similarly for `-`.
+ ///
+ /// In all cases, the op inside the returned `ast::ClassBracketed` is an
+ /// empty union. This empty union should be replaced with the actual item
+ /// when it is popped from the parser's stack. The second tuple element is
+ /// the union holding any leading `-`/`]` literals collected here.
+ ///
+ /// This assumes the parser is positioned at the opening `[` and advances
+ /// the parser to the first non-special byte of the character class.
+ ///
+ /// An error (`ClassUnclosed`) is returned if EOF is found.
+ #[inline(never)]
+ fn parse_set_class_open(
+ &self,
+ ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
+ assert_eq!(self.char(), '[');
+ let start = self.pos(); // position of `[`, the start of every span built below
+ if !self.bump_and_bump_space() {
+ return Err(self.error(
+ Span::new(start, self.pos()),
+ ast::ErrorKind::ClassUnclosed,
+ ));
+ }
+
+ let negated = if self.char() != '^' {
+ false
+ } else {
+ if !self.bump_and_bump_space() { // step past `^`; EOF here means the class is unclosed
+ return Err(self.error(
+ Span::new(start, self.pos()),
+ ast::ErrorKind::ClassUnclosed,
+ ));
+ }
+ true
+ };
+ // Accept any number of `-` as literal `-`.
+ let mut union =
+ ast::ClassSetUnion { span: self.span(), items: vec![] };
+ while self.char() == '-' {
+ union.push(ast::ClassSetItem::Literal(ast::Literal {
+ span: self.span_char(),
+ kind: ast::LiteralKind::Verbatim,
+ c: '-',
+ }));
+ if !self.bump_and_bump_space() {
+ return Err(self.error(
+ Span::new(start, start), // NOTE(review): empty span, unlike the sibling EOF errors that end at self.pos() - confirm intent
+ ast::ErrorKind::ClassUnclosed,
+ ));
+ }
+ }
+ // If `]` is the *first* char in a set, then interpret it as a literal
+ // `]`. That is, an empty class is impossible to write.
+ if union.items.is_empty() && self.char() == ']' {
+ union.push(ast::ClassSetItem::Literal(ast::Literal {
+ span: self.span_char(),
+ kind: ast::LiteralKind::Verbatim,
+ c: ']',
+ }));
+ if !self.bump_and_bump_space() {
+ return Err(self.error(
+ Span::new(start, self.pos()),
+ ast::ErrorKind::ClassUnclosed,
+ ));
+ }
+ }
+ let set = ast::ClassBracketed {
+ span: Span::new(start, self.pos()),
+ negated,
+ kind: ast::ClassSet::union(ast::ClassSetUnion { // empty placeholder, zero-width at the union's start
+ span: Span::new(union.span.start, union.span.start),
+ items: vec![],
+ }),
+ };
+ Ok((set, union))
+ }
+
+ /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`, or its
+ /// negated form with `^` directly after the first colon.
+ /// This assumes the parser is positioned at the opening `[`.
+ ///
+ /// If no valid ASCII character class could be found, then this does not
+ /// advance the parser and `None` is returned. Otherwise, the parser is
+ /// advanced to the first byte following the closing `]` and the
+ /// corresponding ASCII class is returned.
+ #[inline(never)]
+ fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
+ // ASCII character classes are interesting from a parsing perspective
+ // because parsing cannot fail with any interesting error. For example,
+ // in order to use an ASCII character class, it must be enclosed in
+ // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
+ // of it as "ASCII character classes have the syntax `[:NAME:]`
+ // which can only appear within character brackets." This means that
+ // things like `[[:lower:]A]` are legal constructs.
+ //
+ // However, if one types an incorrect ASCII character class, e.g.,
+ // `[[:loower:]]`, then we treat that as a normal nested character
+ // class containing the characters `:elorw`. One might argue that we
+ // should return an error instead since the repeated colons give away
+ // the intent to write an ASCII class. But what if the user typed
+ // `[[:lower]]` instead? How can we tell that was intended to be an
+ // ASCII class and not just a normal nested class?
+ //
+ // Reasonable people can probably disagree over this, but for better
+ // or worse, we implement semantics that never fails at the expense
+ // of better failure modes.
+ assert_eq!(self.char(), '[');
+ // If parsing fails, then we back up the parser to this starting point.
+ let start = self.pos();
+ let mut negated = false;
+ if !self.bump() || self.char() != ':' { // must see `:` right after `[`
+ self.parser().pos.set(start);
+ return None;
+ }
+ if !self.bump() {
+ self.parser().pos.set(start);
+ return None;
+ }
+ if self.char() == '^' {
+ negated = true;
+ if !self.bump() {
+ self.parser().pos.set(start);
+ return None;
+ }
+ }
+ let name_start = self.offset();
+ while self.char() != ':' && self.bump() {} // scan the name up to the next `:` or EOF
+ if self.is_eof() {
+ self.parser().pos.set(start);
+ return None;
+ }
+ let name = &self.pattern()[name_start..self.offset()];
+ if !self.bump_if(":]") { // the name must be terminated by exactly `:]`
+ self.parser().pos.set(start);
+ return None;
+ }
+ let kind = match ast::ClassAsciiKind::from_name(name) {
+ Some(kind) => kind,
+ None => { // unknown class name: rewind and let the caller treat `[` normally
+ self.parser().pos.set(start);
+ return None;
+ }
+ };
+ Some(ast::ClassAscii {
+ span: Span::new(start, self.pos()),
+ kind,
+ negated,
+ })
+ }
+
+ /// Parse a Unicode class in either the single character notation, `\pN`
+ /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
+ /// the parser is positioned at the `p` (or `P` for negation) and will
+ /// advance the parser to the character immediately following the class.
+ ///
+ /// Note that this does not check whether the class name is valid or not.
+ #[inline(never)]
+ fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
+ assert!(self.char() == 'p' || self.char() == 'P');
+
+ let mut scratch = self.parser().scratch.borrow_mut(); // shared buffer, reset before use
+ scratch.clear();
+
+ let negated = self.char() == 'P';
+ if !self.bump_and_bump_space() {
+ return Err(
+ self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
+ );
+ }
+ let (start, kind) = if self.char() == '{' {
+ let start = self.span_char().end; // position just past `{`
+ while self.bump_and_bump_space() && self.char() != '}' { // copy everything up to `}` verbatim
+ scratch.push(self.char());
+ }
+ if self.is_eof() { // the loop ended without ever seeing `}`
+ return Err(self
+ .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
+ }
+ assert_eq!(self.char(), '}');
+ self.bump();
+
+ let name = scratch.as_str();
+ if let Some(i) = name.find("!=") { // checked first so `!=` is not split at its `=`
+ (
+ start,
+ ast::ClassUnicodeKind::NamedValue {
+ op: ast::ClassUnicodeOpKind::NotEqual,
+ name: name[..i].to_string(),
+ value: name[i + 2..].to_string(),
+ },
+ )
+ } else if let Some(i) = name.find(':') {
+ (
+ start,
+ ast::ClassUnicodeKind::NamedValue {
+ op: ast::ClassUnicodeOpKind::Colon,
+ name: name[..i].to_string(),
+ value: name[i + 1..].to_string(),
+ },
+ )
+ } else if let Some(i) = name.find('=') {
+ (
+ start,
+ ast::ClassUnicodeKind::NamedValue {
+ op: ast::ClassUnicodeOpKind::Equal,
+ name: name[..i].to_string(),
+ value: name[i + 1..].to_string(),
+ },
+ )
+ } else { // no operator: the whole brace content is the class name
+ (start, ast::ClassUnicodeKind::Named(name.to_string()))
+ }
+ } else {
+ let start = self.pos();
+ let c = self.char();
+ if c == '\\' { // a backslash cannot serve as a one-letter class name
+ return Err(self.error(
+ self.span_char(),
+ ast::ErrorKind::UnicodeClassInvalid,
+ ));
+ }
+ self.bump_and_bump_space();
+ let kind = ast::ClassUnicodeKind::OneLetter(c);
+ (start, kind)
+ };
+ Ok(ast::ClassUnicode {
+ span: Span::new(start, self.pos()),
+ negated,
+ kind,
+ })
+ }
+
+ /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
+ /// parser is currently at a valid character class name and will be
+ /// advanced to the character immediately following the class. Panics otherwise.
+ #[inline(never)]
+ fn parse_perl_class(&self) -> ast::ClassPerl {
+ let c = self.char();
+ let span = self.span_char(); // taken before the bump, so it covers only the class letter
+ self.bump();
+ let (negated, kind) = match c { // the upper-case letter is the negated form of each class
+ 'd' => (false, ast::ClassPerlKind::Digit),
+ 'D' => (true, ast::ClassPerlKind::Digit),
+ 's' => (false, ast::ClassPerlKind::Space),
+ 'S' => (true, ast::ClassPerlKind::Space),
+ 'w' => (false, ast::ClassPerlKind::Word),
+ 'W' => (true, ast::ClassPerlKind::Word),
+ c => panic!("expected valid Perl class but got '{}'", c),
+ };
+ ast::ClassPerl { span, kind, negated }
+ }
+}
+
+/// A visitor that traverses a fully parsed Ast and checks whether its depth
+/// exceeds the parser's configured nest limit. If it does, an error is returned.
+#[derive(Debug)]
+struct NestLimiter<'p, 's, P> {
+ /// The parser that is checking the nest limit; also used to build errors.
+ p: &'p ParserI<'s, P>,
+ /// The current depth while walking an Ast, starting at 0.
+ depth: u32,
+}
+
+impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
+    /// Create a limiter tied to the given parser, with the depth counter
+    /// starting at zero.
+    fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
+        NestLimiter { depth: 0, p }
+    }
+
+    /// Walk `ast` using this limiter as the visitor. An error is returned
+    /// as soon as the nesting depth goes over the parser's limit.
+    #[inline(never)]
+    fn check(self, ast: &Ast) -> Result<()> {
+        ast::visit(ast, self)
+    }
+
+    /// Enter one more level of nesting. `span` is attached to the error that
+    /// is returned when the new depth overflows a `u32` or exceeds the
+    /// parser's nest limit; on error the stored depth is left untouched.
+    fn increment_depth(&mut self, span: &Span) -> Result<()> {
+        let next_depth = match self.depth.checked_add(1) {
+            Some(next_depth) => next_depth,
+            None => {
+                return Err(self.p.error(
+                    span.clone(),
+                    ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
+                ));
+            }
+        };
+        let max_depth = self.p.parser().nest_limit;
+        if next_depth > max_depth {
+            return Err(self.p.error(
+                span.clone(),
+                ast::ErrorKind::NestLimitExceeded(max_depth),
+            ));
+        }
+        self.depth = next_depth;
+        Ok(())
+    }
+
+    /// Leave one level of nesting.
+    fn decrement_depth(&mut self) {
+        // Every decrement is paired with an earlier increment by the
+        // visitor, so going below zero would indicate a visitor bug.
+        let previous_depth = self.depth.checked_sub(1);
+        self.depth = previous_depth.unwrap();
+    }
+}
+
+impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
+    type Output = ();
+    type Err = ast::Error;
+
+    /// Nothing is produced by the limiter; finishing the walk is success.
+    fn finish(self) -> Result<()> {
+        Ok(())
+    }
+
+    /// Entering an Ast node: nodes that can contain other nodes add one
+    /// level of depth, leaf nodes leave the depth unchanged.
+    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
+        let nested_span = match *ast {
+            Ast::Class(ast::Class::Bracketed(ref x)) => Some(&x.span),
+            Ast::Repetition(ref x) => Some(&x.span),
+            Ast::Group(ref x) => Some(&x.span),
+            Ast::Alternation(ref x) => Some(&x.span),
+            Ast::Concat(ref x) => Some(&x.span),
+            // Base cases: no nesting, so no depth change.
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::Class(ast::Class::Unicode(_))
+            | Ast::Class(ast::Class::Perl(_)) => None,
+        };
+        match nested_span {
+            Some(span) => self.increment_depth(span),
+            None => Ok(()),
+        }
+    }
+
+    /// Leaving an Ast node: undo the increment done in `visit_pre` for the
+    /// same set of nesting node kinds.
+    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
+        let was_nested = match *ast {
+            Ast::Class(ast::Class::Bracketed(_))
+            | Ast::Repetition(_)
+            | Ast::Group(_)
+            | Ast::Alternation(_)
+            | Ast::Concat(_) => true,
+            // Base cases: depth was never incremented for these.
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::Class(ast::Class::Unicode(_))
+            | Ast::Class(ast::Class::Perl(_)) => false,
+        };
+        if was_nested {
+            self.decrement_depth();
+        }
+        Ok(())
+    }
+
+    /// Entering a class set item: bracketed classes and unions add one
+    /// level of depth, every other item is a leaf.
+    fn visit_class_set_item_pre(
+        &mut self,
+        ast: &ast::ClassSetItem,
+    ) -> Result<()> {
+        let nested_span = match *ast {
+            ast::ClassSetItem::Bracketed(ref x) => Some(&x.span),
+            ast::ClassSetItem::Union(ref x) => Some(&x.span),
+            // Base cases: no nesting, so no depth change.
+            ast::ClassSetItem::Empty(_)
+            | ast::ClassSetItem::Literal(_)
+            | ast::ClassSetItem::Range(_)
+            | ast::ClassSetItem::Ascii(_)
+            | ast::ClassSetItem::Unicode(_)
+            | ast::ClassSetItem::Perl(_) => None,
+        };
+        match nested_span {
+            Some(span) => self.increment_depth(span),
+            None => Ok(()),
+        }
+    }
+
+    /// Leaving a class set item: undo the increment done on entry for
+    /// bracketed classes and unions.
+    fn visit_class_set_item_post(
+        &mut self,
+        ast: &ast::ClassSetItem,
+    ) -> Result<()> {
+        let was_nested = match *ast {
+            ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
+                true
+            }
+            // Base cases: depth was never incremented for these.
+            ast::ClassSetItem::Empty(_)
+            | ast::ClassSetItem::Literal(_)
+            | ast::ClassSetItem::Range(_)
+            | ast::ClassSetItem::Ascii(_)
+            | ast::ClassSetItem::Unicode(_)
+            | ast::ClassSetItem::Perl(_) => false,
+        };
+        if was_nested {
+            self.decrement_depth();
+        }
+        Ok(())
+    }
+
+    /// Entering a binary set operation always adds one level of depth.
+    fn visit_class_set_binary_op_pre(
+        &mut self,
+        ast: &ast::ClassSetBinaryOp,
+    ) -> Result<()> {
+        let op_span = &ast.span;
+        self.increment_depth(op_span)
+    }
+
+    /// Leaving a binary set operation always removes one level of depth.
+    fn visit_class_set_binary_op_post(
+        &mut self,
+        _ast: &ast::ClassSetBinaryOp,
+    ) -> Result<()> {
+        self.decrement_depth();
+        Ok(())
+    }
+}
+
+/// When the result is an error, transforms the ast::ErrorKind from the source
+/// Result into another one. This function is used to return clearer error
+/// messages when possible.
+fn specialize_err<T>(
+ result: Result<T>,
+ from: ast::ErrorKind,
+ to: ast::ErrorKind,
+) -> Result<T> {
+ if let Err(e) = result {
+ if e.kind == from {
+ Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
+ } else {
+ Err(e)
+ }
+ } else {
+ result
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::ops::Range;
+
+ use super::{Parser, ParserBuilder, ParserI, Primitive};
+ use crate::ast::{self, Ast, Position, Span};
+
+ // Our own assert_eq, which shadows the standard macro inside this module and
+ // prints both sides with `{:?}` on separate lines when they are not equal.
+ macro_rules! assert_eq {
+ ($left:expr, $right:expr) => {{
+ match (&$left, &$right) { // each operand is evaluated once and only borrowed
+ (left_val, right_val) => {
+ if !(*left_val == *right_val) {
+ panic!(
+ "assertion failed: `(left == right)`\n\n\
+ left: `{:?}`\nright: `{:?}`\n\n",
+ left_val, right_val
+ )
+ }
+ }
+ }
+ }};
+ }
+
+ // We create these errors to compare with real ast::Errors in the tests.
+ // Equality between TestError and ast::Error only looks at span and kind and
+ // disregards the pattern string, which is annoying to provide in tests.
+ #[derive(Clone, Debug)]
+ struct TestError {
+ span: Span, // expected location of the error
+ kind: ast::ErrorKind, // expected kind of the error
+ }
+
+ impl PartialEq<ast::Error> for TestError {
+     /// Equal when both span and kind agree; the pattern stored in the
+     /// real error is not looked at.
+     fn eq(&self, other: &ast::Error) -> bool {
+         let same_span = self.span == other.span;
+         let same_kind = self.kind == other.kind;
+         same_span && same_kind
+     }
+ }
+
+ impl PartialEq<TestError> for ast::Error {
+     /// Mirror of the comparison in the other direction: only span and
+     /// kind are compared, the pattern is not looked at.
+     fn eq(&self, other: &TestError) -> bool {
+         let same_span = self.span == other.span;
+         let same_kind = self.kind == other.kind;
+         same_span && same_kind
+     }
+ }
+
+ /// Short alias for turning a string slice into an owned `String`.
+ fn s(str: &str) -> String {
+     String::from(str)
+ }
+
+ /// Create a parser over `pattern` using the default parser settings.
+ fn parser(pattern: &str) -> ParserI<'_, Parser> {
+     let default_parser = Parser::new();
+     ParserI::new(default_parser, pattern)
+ }
+
+ /// Create a parser over `pattern` with the `octal` option switched on.
+ fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
+     ParserI::new(ParserBuilder::new().octal(true).build(), pattern)
+ }
+
+ /// Create a parser over `pattern` whose nest limit is set to `nest_limit`.
+ fn parser_nest_limit(
+     pattern: &str,
+     nest_limit: u32,
+ ) -> ParserI<'_, Parser> {
+     let mut builder = ParserBuilder::new();
+     let limited = builder.nest_limit(nest_limit).build();
+     ParserI::new(limited, pattern)
+ }
+
+ /// Create a parser over `pattern` with the `ignore_whitespace` option
+ /// switched on.
+ fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
+     ParserI::new(
+         ParserBuilder::new().ignore_whitespace(true).build(),
+         pattern,
+     )
+ }
+
+ /// Shorthand for `Span::new`, building a span from two positions.
+ fn nspan(start: Position, end: Position) -> Span {
+     let built = Span::new(start, end);
+     built
+ }
+
+ /// Shorthand for `Position::new`, building a position from an offset, a
+ /// line and a column.
+ fn npos(offset: usize, line: usize, column: usize) -> Position {
+     let built = Position::new(offset, line, column);
+     built
+ }
+
+ /// Build a span from a range of offsets. Both ends are placed on line 1
+ /// and the column is the offset plus one, so this is only correct out of
+ /// the box for single-line ASCII patterns, which is fine for most tests.
+ fn span(range: Range<usize>) -> Span {
+     let on_first_line = |offset: usize| Position::new(offset, 1, offset + 1);
+     Span::new(on_first_line(range.start), on_first_line(range.end))
+ }
+
+ /// Build the span for a byte range of `subject`, deriving line and column
+ /// from the text itself: the line is one plus the number of `\n` before
+ /// the offset, and the column is one plus the number of characters since
+ /// the last `\n` (or since the beginning when there is none).
+ fn span_range(subject: &str, range: Range<usize>) -> Span {
+     let position_at = |offset: usize| {
+         let before = &subject[..offset];
+         let chars_since_newline = before
+             .chars()
+             .rev()
+             .position(|c| c == '\n')
+             .unwrap_or(before.chars().count());
+         Position {
+             offset,
+             line: 1 + before.matches('\n').count(),
+             column: 1 + chars_since_newline,
+         }
+     };
+     let start = position_at(range.start);
+     let end = position_at(range.end);
+     Span::new(start, end)
+ }
+
+ /// Build a verbatim literal for `c` that begins at offset `start` and is
+ /// as long as the UTF-8 encoding of `c`.
+ fn lit(c: char, start: usize) -> Ast {
+     let end = start + c.len_utf8();
+     lit_with(c, span(start..end))
+ }
+
+ /// Build a punctuation literal for `c` with the given span.
+ fn punct_lit(c: char, span: Span) -> Ast {
+     let kind = ast::LiteralKind::Punctuation;
+     let literal = ast::Literal { span, kind, c };
+     Ast::Literal(literal)
+ }
+
+ /// Build a verbatim literal for `c` with the given span.
+ fn lit_with(c: char, span: Span) -> Ast {
+     let kind = ast::LiteralKind::Verbatim;
+     let literal = ast::Literal { span, kind, c };
+     Ast::Literal(literal)
+ }
+
+ /// Build a concatenation of `asts` covering the given offset range.
+ fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
+     let whole = span(range);
+     concat_with(whole, asts)
+ }
+
+ /// Build a concatenation of `asts` with an explicit span.
+ fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
+     let node = ast::Concat { span, asts };
+     Ast::Concat(node)
+ }
+
+ /// Build an alternation of `asts` covering the given offset range.
+ fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
+     let whole = span(range);
+     let node = ast::Alternation { span: whole, asts };
+     Ast::Alternation(node)
+ }
+
+ /// Build a capturing group with capture index `index` around `ast`,
+ /// covering the given offset range.
+ fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
+     let whole = span(range);
+     let kind = ast::GroupKind::CaptureIndex(index);
+     let inner = Box::new(ast);
+     Ast::Group(ast::Group { span: whole, kind, ast: inner })
+ }
+
+ /// Build an `Ast::Flags` node holding a single flag.
+ ///
+ /// `pat` is the full pattern string and `range` the byte offsets at which
+ /// the flag set occurs in it. The flag item is taken to be the byte just
+ /// before the last byte of the range.
+ ///
+ /// When `negated` is true, a negation item is placed in front of the flag;
+ /// its span runs from two bytes after the start of the range up to the
+ /// flag item.
+ fn flag_set(
+     pat: &str,
+     range: Range<usize>,
+     flag: ast::Flag,
+     negated: bool,
+ ) -> Ast {
+     let flag_item = ast::FlagsItem {
+         span: span_range(pat, (range.end - 2)..(range.end - 1)),
+         kind: ast::FlagsItemKind::Flag(flag),
+     };
+     let items = if negated {
+         let negation_item = ast::FlagsItem {
+             span: span_range(pat, (range.start + 2)..(range.end - 2)),
+             kind: ast::FlagsItemKind::Negation,
+         };
+         vec![negation_item, flag_item]
+     } else {
+         vec![flag_item]
+     };
+     let outer_span = span_range(pat, range.clone());
+     let inner_span = span_range(pat, (range.start + 2)..(range.end - 1));
+     Ast::Flags(ast::SetFlags {
+         span: outer_span,
+         flags: ast::Flags { span: inner_span, items },
+     })
+ }
+
+ #[test]
+ fn parse_nest_limit() {
+ // A nest limit of 0 still allows regexes made of a single leaf node.
+ assert_eq!(
+ parser_nest_limit("", 0).parse(),
+ Ok(Ast::Empty(span(0..0)))
+ );
+ assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
+
+ // Test repetition operations, each of which requires one level of nesting.
+ assert_eq!(
+ parser_nest_limit("a+", 0).parse().unwrap_err(),
+ TestError {
+ span: span(0..2),
+ kind: ast::ErrorKind::NestLimitExceeded(0),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("a+", 1).parse(),
+ Ok(Ast::Repetition(ast::Repetition {
+ span: span(0..2),
+ op: ast::RepetitionOp {
+ span: span(1..2),
+ kind: ast::RepetitionKind::OneOrMore,
+ },
+ greedy: true,
+ ast: Box::new(lit('a', 0)),
+ }))
+ );
+ assert_eq!(
+ parser_nest_limit("(a)+", 1).parse().unwrap_err(),
+ TestError {
+ span: span(0..3),
+ kind: ast::ErrorKind::NestLimitExceeded(1),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("a+*", 1).parse().unwrap_err(),
+ TestError {
+ span: span(0..2),
+ kind: ast::ErrorKind::NestLimitExceeded(1),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("a+*", 2).parse(),
+ Ok(Ast::Repetition(ast::Repetition {
+ span: span(0..3),
+ op: ast::RepetitionOp {
+ span: span(2..3),
+ kind: ast::RepetitionKind::ZeroOrMore,
+ },
+ greedy: true,
+ ast: Box::new(Ast::Repetition(ast::Repetition {
+ span: span(0..2),
+ op: ast::RepetitionOp {
+ span: span(1..2),
+ kind: ast::RepetitionKind::OneOrMore,
+ },
+ greedy: true,
+ ast: Box::new(lit('a', 0)),
+ })),
+ }))
+ );
+
+ // Test concatenations. A concatenation of any length requires one level.
+ assert_eq!(
+ parser_nest_limit("ab", 0).parse().unwrap_err(),
+ TestError {
+ span: span(0..2),
+ kind: ast::ErrorKind::NestLimitExceeded(0),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("ab", 1).parse(),
+ Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
+ );
+ assert_eq!(
+ parser_nest_limit("abc", 1).parse(),
+ Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
+ );
+
+ // Test alternations. An alternation of any length requires one level.
+ assert_eq!(
+ parser_nest_limit("a|b", 0).parse().unwrap_err(),
+ TestError {
+ span: span(0..3),
+ kind: ast::ErrorKind::NestLimitExceeded(0),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("a|b", 1).parse(),
+ Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
+ );
+ assert_eq!(
+ parser_nest_limit("a|b|c", 1).parse(),
+ Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
+ );
+
+ // Test character classes. Classes form their own mini-recursive
+ // syntax: the brackets, unions of items and binary ops each add depth.
+ assert_eq!(
+ parser_nest_limit("[a]", 0).parse().unwrap_err(),
+ TestError {
+ span: span(0..3),
+ kind: ast::ErrorKind::NestLimitExceeded(0),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("[a]", 1).parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..3),
+ negated: false,
+ kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
+ ast::Literal {
+ span: span(1..2),
+ kind: ast::LiteralKind::Verbatim,
+ c: 'a',
+ }
+ )),
+ })))
+ );
+ assert_eq!(
+ parser_nest_limit("[ab]", 1).parse().unwrap_err(),
+ TestError {
+ span: span(1..3),
+ kind: ast::ErrorKind::NestLimitExceeded(1),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
+ TestError {
+ span: span(3..7),
+ kind: ast::ErrorKind::NestLimitExceeded(2),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
+ TestError {
+ span: span(4..6),
+ kind: ast::ErrorKind::NestLimitExceeded(3),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
+ TestError {
+ span: span(1..5),
+ kind: ast::ErrorKind::NestLimitExceeded(1),
+ }
+ );
+ assert_eq!(
+ parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
+ TestError {
+ span: span(4..6),
+ kind: ast::ErrorKind::NestLimitExceeded(2),
+ }
+ );
+ }
+
+ #[test]
+ fn parse_comments() {
+ let pat = "(?x)
+# This is comment 1.
+foo # This is comment 2.
+ # This is comment 3.
+bar
+# This is comment 4.";
+ let astc = parser(pat).parse_with_comments().unwrap(); // yields both the Ast and the collected comments
+ assert_eq!(
+ astc.ast, // comments are absent from the Ast: only the flag set and the six literals remain
+ concat_with(
+ span_range(pat, 0..pat.len()),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ lit_with('f', span_range(pat, 26..27)),
+ lit_with('o', span_range(pat, 27..28)),
+ lit_with('o', span_range(pat, 28..29)),
+ lit_with('b', span_range(pat, 74..75)),
+ lit_with('a', span_range(pat, 75..76)),
+ lit_with('r', span_range(pat, 76..77)),
+ ]
+ )
+ );
+ assert_eq!(
+ astc.comments, // one entry per `#` comment, in pattern order; the text excludes the `#` itself
+ vec![
+ ast::Comment {
+ span: span_range(pat, 5..26),
+ comment: s(" This is comment 1."),
+ },
+ ast::Comment {
+ span: span_range(pat, 30..51),
+ comment: s(" This is comment 2."),
+ },
+ ast::Comment {
+ span: span_range(pat, 53..74),
+ comment: s(" This is comment 3."),
+ },
+ ast::Comment {
+ span: span_range(pat, 78..98),
+ comment: s(" This is comment 4."),
+ },
+ ]
+ );
+ }
+
+ #[test]
+ fn parse_holistic() {
+ assert_eq!(parser("]").parse(), Ok(lit(']', 0))); // a lone `]` outside a class is an ordinary literal
+ assert_eq!(
+ parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), // 18 escaped punctuation characters
+ Ok(concat(
+ 0..36,
+ vec![ // each escape is a two-byte punctuation literal (backslash plus character)
+ punct_lit('\\', span(0..2)),
+ punct_lit('.', span(2..4)),
+ punct_lit('+', span(4..6)),
+ punct_lit('*', span(6..8)),
+ punct_lit('?', span(8..10)),
+ punct_lit('(', span(10..12)),
+ punct_lit(')', span(12..14)),
+ punct_lit('|', span(14..16)),
+ punct_lit('[', span(16..18)),
+ punct_lit(']', span(18..20)),
+ punct_lit('{', span(20..22)),
+ punct_lit('}', span(22..24)),
+ punct_lit('^', span(24..26)),
+ punct_lit('$', span(26..28)),
+ punct_lit('#', span(28..30)),
+ punct_lit('&', span(30..32)),
+ punct_lit('-', span(32..34)),
+ punct_lit('~', span(34..36)),
+ ]
+ ))
+ );
+ }
+
+ #[test]
+ fn parse_ignore_whitespace() {
+ // Test that basic whitespace insensitivity works once `(?x)` is set.
+ let pat = "(?x)a b";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ nspan(npos(0, 1, 1), npos(7, 1, 8)),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
+ lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
+ ]
+ ))
+ );
+
+ // Test that we can toggle whitespace insensitivity off with `(?-x)`.
+ let pat = "(?x)a b(?-x)a b";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ nspan(npos(0, 1, 1), npos(15, 1, 16)),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
+ lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
+ flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
+ lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
+ lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
+ lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
+ ]
+ ))
+ );
+
+ // Test that a whitespace insensitive flag scoped to a group stays in it.
+ let pat = "a (?x:a )a ";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ span_range(pat, 0..11),
+ vec![
+ lit_with('a', span_range(pat, 0..1)),
+ lit_with(' ', span_range(pat, 1..2)),
+ Ast::Group(ast::Group {
+ span: span_range(pat, 2..9),
+ kind: ast::GroupKind::NonCapturing(ast::Flags {
+ span: span_range(pat, 4..5),
+ items: vec![ast::FlagsItem {
+ span: span_range(pat, 4..5),
+ kind: ast::FlagsItemKind::Flag(
+ ast::Flag::IgnoreWhitespace
+ ),
+ },],
+ }),
+ ast: Box::new(lit_with('a', span_range(pat, 6..7))),
+ }),
+ lit_with('a', span_range(pat, 9..10)),
+ lit_with(' ', span_range(pat, 10..11)),
+ ]
+ ))
+ );
+
+ // Test that whitespace after an opening paren is insignificant in x mode.
+ let pat = "(?x)( ?P<foo> a )";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ span_range(pat, 0..pat.len()),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ Ast::Group(ast::Group {
+ span: span_range(pat, 4..pat.len()),
+ kind: ast::GroupKind::CaptureName(ast::CaptureName {
+ span: span_range(pat, 9..12),
+ name: s("foo"),
+ index: 1,
+ }),
+ ast: Box::new(lit_with('a', span_range(pat, 14..15))),
+ }),
+ ]
+ ))
+ );
+ let pat = "(?x)( a )";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ span_range(pat, 0..pat.len()),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ Ast::Group(ast::Group {
+ span: span_range(pat, 4..pat.len()),
+ kind: ast::GroupKind::CaptureIndex(1),
+ ast: Box::new(lit_with('a', span_range(pat, 7..8))),
+ }),
+ ]
+ ))
+ );
+ let pat = "(?x)( ?: a )";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ span_range(pat, 0..pat.len()),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ Ast::Group(ast::Group {
+ span: span_range(pat, 4..pat.len()),
+ kind: ast::GroupKind::NonCapturing(ast::Flags {
+ span: span_range(pat, 8..8),
+ items: vec![],
+ }),
+ ast: Box::new(lit_with('a', span_range(pat, 11..12))),
+ }),
+ ]
+ ))
+ );
+ let pat = r"(?x)\x { 53 }";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ span_range(pat, 0..pat.len()),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ Ast::Literal(ast::Literal {
+ span: span(4..13),
+ kind: ast::LiteralKind::HexBrace(
+ ast::HexLiteralKind::X
+ ),
+ c: 'S',
+ }),
+ ]
+ ))
+ );
+
+ // Test that an escaped space is OK and parses as a special literal.
+ let pat = r"(?x)\ ";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(concat_with(
+ span_range(pat, 0..pat.len()),
+ vec![
+ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
+ Ast::Literal(ast::Literal {
+ span: span_range(pat, 4..6),
+ kind: ast::LiteralKind::Special(
+ ast::SpecialLiteralKind::Space
+ ),
+ c: ' ',
+ }),
+ ]
+ ))
+ );
+ // ... but only when `x` mode is enabled; otherwise it is unrecognized.
+ let pat = r"\ ";
+ assert_eq!(
+ parser(pat).parse().unwrap_err(),
+ TestError {
+ span: span_range(pat, 0..2),
+ kind: ast::ErrorKind::EscapeUnrecognized,
+ }
+ );
+ }
+
+ // Line feeds in a pattern are ordinary literals; the spans that follow one
+ // are reported on the next line, restarting at column 1.
+ #[test]
+ fn parse_newlines() {
+     let dots = ".\n.";
+     let expected = concat_with(
+         span_range(dots, 0..3),
+         vec![
+             Ast::Dot(span_range(dots, 0..1)),
+             lit_with('\n', span_range(dots, 1..2)),
+             Ast::Dot(span_range(dots, 2..3)),
+         ],
+     );
+     assert_eq!(parser(dots).parse(), Ok(expected));
+
+     // Each row is (literal, start, end), where a position is written as
+     // (byte offset, line, column).
+     let text = "foobar\nbaz\nquux\n";
+     let rows = vec![
+         ('f', (0, 1, 1), (1, 1, 2)),
+         ('o', (1, 1, 2), (2, 1, 3)),
+         ('o', (2, 1, 3), (3, 1, 4)),
+         ('b', (3, 1, 4), (4, 1, 5)),
+         ('a', (4, 1, 5), (5, 1, 6)),
+         ('r', (5, 1, 6), (6, 1, 7)),
+         ('\n', (6, 1, 7), (7, 2, 1)),
+         ('b', (7, 2, 1), (8, 2, 2)),
+         ('a', (8, 2, 2), (9, 2, 3)),
+         ('z', (9, 2, 3), (10, 2, 4)),
+         ('\n', (10, 2, 4), (11, 3, 1)),
+         ('q', (11, 3, 1), (12, 3, 2)),
+         ('u', (12, 3, 2), (13, 3, 3)),
+         ('u', (13, 3, 3), (14, 3, 4)),
+         ('x', (14, 3, 4), (15, 3, 5)),
+         ('\n', (15, 3, 5), (16, 4, 1)),
+     ];
+     let literals: Vec<Ast> = rows
+         .into_iter()
+         .map(|(c, (s_off, s_line, s_col), (e_off, e_line, e_col))| {
+             lit_with(
+                 c,
+                 nspan(
+                     npos(s_off, s_line, s_col),
+                     npos(e_off, e_line, e_col),
+                 ),
+             )
+         })
+         .collect();
+     assert_eq!(
+         parser(text).parse(),
+         Ok(concat_with(span_range(text, 0..text.len()), literals))
+     );
+ }
+
+ // Uncounted repetition operators (`*`, `+`, `?`) and their lazy forms: the
+ // operator binds to the immediately preceding expression, and an operator
+ // with nothing to repeat is rejected with `RepetitionMissing`.
+ #[test]
+ fn parse_uncounted_repetition() {
+     // Builds a repetition spanning `start..end` whose operator occupies
+     // `op_start..end`; in every case below the operator ends the repetition.
+     let rep = |start: usize,
+                op_start: usize,
+                end: usize,
+                kind: ast::RepetitionKind,
+                greedy: bool,
+                inner: Ast| {
+         Ast::Repetition(ast::Repetition {
+             span: span(start..end),
+             op: ast::RepetitionOp { span: span(op_start..end), kind },
+             greedy,
+             ast: Box::new(inner),
+         })
+     };
+
+     // Greedy operators applied to a single literal.
+     assert_eq!(
+         parser(r"a*").parse(),
+         Ok(rep(0, 1, 2, ast::RepetitionKind::ZeroOrMore, true, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"a+").parse(),
+         Ok(rep(0, 1, 2, ast::RepetitionKind::OneOrMore, true, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"a?").parse(),
+         Ok(rep(0, 1, 2, ast::RepetitionKind::ZeroOrOne, true, lit('a', 0)))
+     );
+
+     // A trailing `?` makes the operator lazy and becomes part of its span.
+     assert_eq!(
+         parser(r"a??").parse(),
+         Ok(rep(0, 1, 3, ast::RepetitionKind::ZeroOrOne, false, lit('a', 0)))
+     );
+     // These two replace a verbatim duplicate of the `a?` assertion so that
+     // the lazy forms of `*` and `+` are exercised as well.
+     assert_eq!(
+         parser(r"a*?").parse(),
+         Ok(rep(0, 1, 3, ast::RepetitionKind::ZeroOrMore, false, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"a+?").parse(),
+         Ok(rep(0, 1, 3, ast::RepetitionKind::OneOrMore, false, lit('a', 0)))
+     );
+
+     // The operator only applies to the expression directly before it.
+     assert_eq!(
+         parser(r"a?b").parse(),
+         Ok(concat(
+             0..3,
+             vec![
+                 rep(0, 1, 2, ast::RepetitionKind::ZeroOrOne, true, lit('a', 0)),
+                 lit('b', 2),
+             ]
+         ))
+     );
+     assert_eq!(
+         parser(r"a??b").parse(),
+         Ok(concat(
+             0..4,
+             vec![
+                 rep(0, 1, 3, ast::RepetitionKind::ZeroOrOne, false, lit('a', 0)),
+                 lit('b', 3),
+             ]
+         ))
+     );
+     assert_eq!(
+         parser(r"ab?").parse(),
+         Ok(concat(
+             0..3,
+             vec![
+                 lit('a', 0),
+                 rep(1, 2, 3, ast::RepetitionKind::ZeroOrOne, true, lit('b', 1)),
+             ]
+         ))
+     );
+     assert_eq!(
+         parser(r"(ab)?").parse(),
+         Ok(rep(
+             0,
+             4,
+             5,
+             ast::RepetitionKind::ZeroOrOne,
+             true,
+             group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)]))
+         ))
+     );
+     assert_eq!(
+         parser(r"|a?").parse(),
+         Ok(alt(
+             0..3,
+             vec![
+                 Ast::Empty(span(0..0)),
+                 rep(1, 2, 3, ast::RepetitionKind::ZeroOrOne, true, lit('a', 1)),
+             ]
+         ))
+     );
+
+     // An operator with nothing before it is an error, reported as an empty
+     // span at the operator's offset.
+     let dangling = vec![
+         (r"*", 0),
+         (r"(?i)*", 4),
+         (r"(*)", 1),
+         (r"(?:?)", 3),
+         (r"+", 0),
+         (r"?", 0),
+         (r"(?)", 1),
+         (r"|*", 1),
+         (r"|+", 1),
+         (r"|?", 1),
+     ];
+     for (pat, at) in dangling {
+         assert_eq!(
+             parser(pat).parse().unwrap_err(),
+             TestError {
+                 span: span(at..at),
+                 kind: ast::ErrorKind::RepetitionMissing,
+             },
+             "pattern: {:?}",
+             pat
+         );
+     }
+ }
+
+ // Counted repetitions (`{n}`, `{n,}`, `{n,m}`): accepted forms, including
+ // whitespace inside the braces, followed by the malformed forms and the
+ // error each one produces.
+ #[test]
+ fn parse_counted_repetition() {
+     // Builds a counted repetition spanning `start..end` whose operator
+     // occupies `op_start..end`.
+     let counted = |start: usize,
+                    op_start: usize,
+                    end: usize,
+                    range: ast::RepetitionRange,
+                    greedy: bool,
+                    inner: Ast| {
+         Ast::Repetition(ast::Repetition {
+             span: span(start..end),
+             op: ast::RepetitionOp {
+                 span: span(op_start..end),
+                 kind: ast::RepetitionKind::Range(range),
+             },
+             greedy,
+             ast: Box::new(inner),
+         })
+     };
+     let exactly5 = || ast::RepetitionRange::Exactly(5);
+     let five_to_nine = || ast::RepetitionRange::Bounded(5, 9);
+
+     assert_eq!(
+         parser(r"a{5}").parse(),
+         Ok(counted(0, 1, 4, exactly5(), true, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"a{5,}").parse(),
+         Ok(counted(
+             0,
+             1,
+             5,
+             ast::RepetitionRange::AtLeast(5),
+             true,
+             lit('a', 0)
+         ))
+     );
+     assert_eq!(
+         parser(r"a{5,9}").parse(),
+         Ok(counted(0, 1, 6, five_to_nine(), true, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"a{5}?").parse(),
+         Ok(counted(0, 1, 5, exactly5(), false, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"ab{5}").parse(),
+         Ok(concat(
+             0..5,
+             vec![
+                 lit('a', 0),
+                 counted(1, 2, 5, exactly5(), true, lit('b', 1)),
+             ]
+         ))
+     );
+     assert_eq!(
+         parser(r"ab{5}c").parse(),
+         Ok(concat(
+             0..6,
+             vec![
+                 lit('a', 0),
+                 counted(1, 2, 5, exactly5(), true, lit('b', 1)),
+                 lit('c', 5),
+             ]
+         ))
+     );
+
+     // Whitespace inside the braces is accepted.
+     assert_eq!(
+         parser(r"a{ 5 }").parse(),
+         Ok(counted(0, 1, 6, exactly5(), true, lit('a', 0)))
+     );
+     assert_eq!(
+         parser(r"a{ 5 , 9 }").parse(),
+         Ok(counted(0, 1, 10, five_to_nine(), true, lit('a', 0)))
+     );
+     // With the whitespace-ignoring parser, a space may separate the closing
+     // brace from the laziness marker.
+     assert_eq!(
+         parser_ignore_whitespace(r"a{5,9} ?").parse(),
+         Ok(counted(0, 1, 8, five_to_nine(), false, lit('a', 0)))
+     );
+
+     // Each row is (pattern, error span, error kind).
+     let rejected = vec![
+         (r"(?i){0}", 4..4, ast::ErrorKind::RepetitionMissing),
+         (r"(?m){1,1}", 4..4, ast::ErrorKind::RepetitionMissing),
+         (r"a{]}", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
+         (r"a{1,]}", 4..4, ast::ErrorKind::RepetitionCountDecimalEmpty),
+         (r"a{", 1..2, ast::ErrorKind::RepetitionCountUnclosed),
+         (r"a{}", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
+         (r"a{a", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
+         (r"a{9999999999}", 2..12, ast::ErrorKind::DecimalInvalid),
+         (r"a{9", 1..3, ast::ErrorKind::RepetitionCountUnclosed),
+         (r"a{9,a", 4..4, ast::ErrorKind::RepetitionCountDecimalEmpty),
+         (r"a{9,9999999999}", 4..14, ast::ErrorKind::DecimalInvalid),
+         (r"a{9,", 1..4, ast::ErrorKind::RepetitionCountUnclosed),
+         (r"a{9,11", 1..6, ast::ErrorKind::RepetitionCountUnclosed),
+         (r"a{2,1}", 1..6, ast::ErrorKind::RepetitionCountInvalid),
+         (r"{5}", 0..0, ast::ErrorKind::RepetitionMissing),
+         (r"|{5}", 1..1, ast::ErrorKind::RepetitionMissing),
+     ];
+     for (pat, range, kind) in rejected {
+         assert_eq!(
+             parser(pat).parse().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ // Alternation: flat, grouped and nested branches, empty branches on either
+ // side of `|`, and unbalanced parentheses around an alternation.
+ #[test]
+ fn parse_alternate() {
+     // Two adjacent literals starting at byte offset `at`.
+     let pair = |first: char, second: char, at: usize| {
+         concat(at..at + 2, vec![lit(first, at), lit(second, at + 1)])
+     };
+     // An empty expression located at byte offset `at`.
+     let empty = |at: usize| Ast::Empty(span(at..at));
+
+     assert_eq!(
+         parser(r"a|b").parse(),
+         Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
+     );
+     assert_eq!(
+         parser(r"(a|b)").parse(),
+         Ok(group(0..5, 1, alt(1..4, vec![lit('a', 1), lit('b', 3)])))
+     );
+
+     assert_eq!(
+         parser(r"a|b|c").parse(),
+         Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
+     );
+     assert_eq!(
+         parser(r"ax|by|cz").parse(),
+         Ok(alt(
+             0..8,
+             vec![pair('a', 'x', 0), pair('b', 'y', 3), pair('c', 'z', 6)]
+         ))
+     );
+     assert_eq!(
+         parser(r"(ax|by|cz)").parse(),
+         Ok(group(
+             0..10,
+             1,
+             alt(
+                 1..9,
+                 vec![pair('a', 'x', 1), pair('b', 'y', 4), pair('c', 'z', 7)]
+             )
+         ))
+     );
+
+     // Nested groups, assembled from the innermost outwards.
+     let innermost = group(8..12, 3, pair('c', 'z', 9));
+     let middle = group(4..13, 2, alt(5..12, vec![pair('b', 'y', 5), innermost]));
+     let outermost = group(0..14, 1, alt(1..13, vec![pair('a', 'x', 1), middle]));
+     assert_eq!(parser(r"(ax|(by|(cz)))").parse(), Ok(outermost));
+
+     // Branches may be empty.
+     assert_eq!(parser(r"|").parse(), Ok(alt(0..1, vec![empty(0), empty(1)])));
+     assert_eq!(
+         parser(r"||").parse(),
+         Ok(alt(0..2, vec![empty(0), empty(1), empty(2)]))
+     );
+     assert_eq!(
+         parser(r"a|").parse(),
+         Ok(alt(0..2, vec![lit('a', 0), empty(2)]))
+     );
+     assert_eq!(
+         parser(r"|a").parse(),
+         Ok(alt(0..2, vec![empty(0), lit('a', 1)]))
+     );
+
+     assert_eq!(
+         parser(r"(|)").parse(),
+         Ok(group(0..3, 1, alt(1..2, vec![empty(1), empty(2)])))
+     );
+     assert_eq!(
+         parser(r"(a|)").parse(),
+         Ok(group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)])))
+     );
+     assert_eq!(
+         parser(r"(|a)").parse(),
+         Ok(group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)])))
+     );
+
+     // Unbalanced parentheses are reported at the offending parenthesis.
+     let unopened = TestError {
+         span: span(3..4),
+         kind: ast::ErrorKind::GroupUnopened,
+     };
+     assert_eq!(parser(r"a|b)").parse().unwrap_err(), unopened);
+     let unclosed = TestError {
+         span: span(0..1),
+         kind: ast::ErrorKind::GroupUnclosed,
+     };
+     assert_eq!(parser(r"(a|b").parse().unwrap_err(), unclosed);
+ }
+
+ #[test]
+ fn parse_unsupported_lookaround() {
+ assert_eq!(
+ parser(r"(?=a)").parse().unwrap_err(),
+ TestError {
+ span: span(0..3),
+ kind: ast::ErrorKind::UnsupportedLookAround,
+ }
+ );
+ assert_eq!(
+ parser(r"(?!a)").parse().unwrap_err(),
+ TestError {
+ span: span(0..3),
+ kind: ast::ErrorKind::UnsupportedLookAround,
+ }
+ );
+ assert_eq!(
+ parser(r"(?<=a)").parse().unwrap_err(),
+ TestError {
+ span: span(0..4),
+ kind: ast::ErrorKind::UnsupportedLookAround,
+ }
+ );
+ assert_eq!(
+ parser(r"(?<!a)").parse().unwrap_err(),
+ TestError {
+ span: span(0..4),
+ kind: ast::ErrorKind::UnsupportedLookAround,
+ }
+ );
+ }
+
+ // Groups: inline flag directives, capturing and non-capturing groups (with
+ // and without flags), and the errors for incomplete or unbalanced groups.
+ #[test]
+ fn parse_group() {
+     // A one-byte flag item at offset `at`.
+     let flag_at = |at: usize, flag: ast::Flag| ast::FlagsItem {
+         span: span(at..at + 1),
+         kind: ast::FlagsItemKind::Flag(flag),
+     };
+     // A one-byte `-` item at offset `at`.
+     let negation_at = |at: usize| ast::FlagsItem {
+         span: span(at..at + 1),
+         kind: ast::FlagsItemKind::Negation,
+     };
+     let flag_list = |start: usize, end: usize, items: Vec<ast::FlagsItem>| {
+         ast::Flags { span: span(start..end), items }
+     };
+     // A bare flag directive such as `(?i)`, spanning `0..end`.
+     let directive = |end: usize, flags: ast::Flags| {
+         Ast::Flags(ast::SetFlags { span: span(0..end), flags })
+     };
+     let grouped =
+         |start: usize, end: usize, kind: ast::GroupKind, inner: Ast| {
+             Ast::Group(ast::Group {
+                 span: span(start..end),
+                 kind,
+                 ast: Box::new(inner),
+             })
+         };
+     // The item lists for `i` and `i-U`, both starting at offset 2.
+     let just_i = || vec![flag_at(2, ast::Flag::CaseInsensitive)];
+     let i_minus_u = || {
+         vec![
+             flag_at(2, ast::Flag::CaseInsensitive),
+             negation_at(3),
+             flag_at(4, ast::Flag::SwapGreed),
+         ]
+     };
+
+     assert_eq!(
+         parser("(?i)").parse(),
+         Ok(directive(4, flag_list(2, 3, just_i())))
+     );
+     assert_eq!(
+         parser("(?iU)").parse(),
+         Ok(directive(
+             5,
+             flag_list(
+                 2,
+                 4,
+                 vec![
+                     flag_at(2, ast::Flag::CaseInsensitive),
+                     flag_at(3, ast::Flag::SwapGreed),
+                 ]
+             )
+         ))
+     );
+     assert_eq!(
+         parser("(?i-U)").parse(),
+         Ok(directive(6, flag_list(2, 5, i_minus_u())))
+     );
+
+     // Capturing groups are numbered from 1 in order of their opening paren.
+     assert_eq!(
+         parser("()").parse(),
+         Ok(grouped(
+             0,
+             2,
+             ast::GroupKind::CaptureIndex(1),
+             Ast::Empty(span(1..1))
+         ))
+     );
+     assert_eq!(
+         parser("(a)").parse(),
+         Ok(grouped(0, 3, ast::GroupKind::CaptureIndex(1), lit('a', 1)))
+     );
+     assert_eq!(
+         parser("(())").parse(),
+         Ok(grouped(
+             0,
+             4,
+             ast::GroupKind::CaptureIndex(1),
+             grouped(
+                 1,
+                 3,
+                 ast::GroupKind::CaptureIndex(2),
+                 Ast::Empty(span(2..2))
+             )
+         ))
+     );
+
+     // Non-capturing groups carry a (possibly empty) flag list.
+     assert_eq!(
+         parser("(?:a)").parse(),
+         Ok(grouped(
+             0,
+             5,
+             ast::GroupKind::NonCapturing(flag_list(2, 2, vec![])),
+             lit('a', 3)
+         ))
+     );
+
+     assert_eq!(
+         parser("(?i:a)").parse(),
+         Ok(grouped(
+             0,
+             6,
+             ast::GroupKind::NonCapturing(flag_list(2, 3, just_i())),
+             lit('a', 4)
+         ))
+     );
+     assert_eq!(
+         parser("(?i-U:a)").parse(),
+         Ok(grouped(
+             0,
+             8,
+             ast::GroupKind::NonCapturing(flag_list(2, 5, i_minus_u())),
+             lit('a', 6)
+         ))
+     );
+
+     // Each row is (pattern, error span, error kind).
+     let rejected = vec![
+         ("(", 0..1, ast::ErrorKind::GroupUnclosed),
+         ("(?", 0..1, ast::ErrorKind::GroupUnclosed),
+         ("(?P", 2..3, ast::ErrorKind::FlagUnrecognized),
+         ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
+         ("(a", 0..1, ast::ErrorKind::GroupUnclosed),
+         ("(()", 0..1, ast::ErrorKind::GroupUnclosed),
+         (")", 0..1, ast::ErrorKind::GroupUnopened),
+         ("a)", 1..2, ast::ErrorKind::GroupUnopened),
+     ];
+     for (pat, range, kind) in rejected {
+         assert_eq!(
+             parser(pat).parse().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ // Named capture groups (`(?P<name>...)`): accepted names, then truncated,
+ // empty, invalid and duplicate names with the error each one produces.
+ #[test]
+ fn parse_capture_name() {
+     // Each row is (pattern, name, end of group, end of name, offset of `z`).
+     // The name always starts at offset 4, right after `(?P<`.
+     let accepted = vec![
+         ("(?P<a>z)", "a", 8, 5, 6),
+         ("(?P<abc>z)", "abc", 10, 7, 8),
+         ("(?P<a_1>z)", "a_1", 10, 7, 8),
+         ("(?P<a.1>z)", "a.1", 10, 7, 8),
+         ("(?P<a[1]>z)", "a[1]", 11, 8, 9),
+     ];
+     for (pat, name, group_end, name_end, z_at) in accepted {
+         let expected = Ast::Group(ast::Group {
+             span: span(0..group_end),
+             kind: ast::GroupKind::CaptureName(ast::CaptureName {
+                 span: span(4..name_end),
+                 name: s(name),
+                 index: 1,
+             }),
+             ast: Box::new(lit('z', z_at)),
+         });
+         assert_eq!(parser(pat).parse(), Ok(expected));
+     }
+
+     // Each row is (pattern, error span, error kind).
+     let rejected = vec![
+         ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
+         ("(?P<>z)", 4..4, ast::ErrorKind::GroupNameEmpty),
+         ("(?P<a", 5..5, ast::ErrorKind::GroupNameUnexpectedEof),
+         ("(?P<ab", 6..6, ast::ErrorKind::GroupNameUnexpectedEof),
+         ("(?P<0a", 4..5, ast::ErrorKind::GroupNameInvalid),
+         ("(?P<~", 4..5, ast::ErrorKind::GroupNameInvalid),
+         ("(?P<abc~", 7..8, ast::ErrorKind::GroupNameInvalid),
+         // The duplicate error points back at the first use of the name.
+         (
+             "(?P<a>y)(?P<a>z)",
+             12..13,
+             ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
+         ),
+     ];
+     for (pat, range, kind) in rejected {
+         assert_eq!(
+             parser(pat).parse().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ // `parse_flags` reads a flag list up to (but not including) the terminating
+ // `:` or `)`. Checks accepted lists first, then each way a list can be
+ // malformed.
+ #[test]
+ fn parse_flags() {
+     // A one-byte flag item at offset `at`.
+     let flag_at = |at: usize, flag: ast::Flag| ast::FlagsItem {
+         span: span(at..at + 1),
+         kind: ast::FlagsItemKind::Flag(flag),
+     };
+     // A one-byte `-` item at offset `at`.
+     let negation_at = |at: usize| ast::FlagsItem {
+         span: span(at..at + 1),
+         kind: ast::FlagsItemKind::Negation,
+     };
+     // A flag list spanning `0..end`.
+     let flags_to = |end: usize, items: Vec<ast::FlagsItem>| ast::Flags {
+         span: span(0..end),
+         items,
+     };
+     let lone_i = || flags_to(1, vec![flag_at(0, ast::Flag::CaseInsensitive)]);
+
+     // Either terminator ends the list.
+     assert_eq!(parser("i:").parse_flags(), Ok(lone_i()));
+     assert_eq!(parser("i)").parse_flags(), Ok(lone_i()));
+
+     assert_eq!(
+         parser("isU:").parse_flags(),
+         Ok(flags_to(
+             3,
+             vec![
+                 flag_at(0, ast::Flag::CaseInsensitive),
+                 flag_at(1, ast::Flag::DotMatchesNewLine),
+                 flag_at(2, ast::Flag::SwapGreed),
+             ]
+         ))
+     );
+
+     // The negation marker is recorded as an item of its own.
+     assert_eq!(
+         parser("-isU:").parse_flags(),
+         Ok(flags_to(
+             4,
+             vec![
+                 negation_at(0),
+                 flag_at(1, ast::Flag::CaseInsensitive),
+                 flag_at(2, ast::Flag::DotMatchesNewLine),
+                 flag_at(3, ast::Flag::SwapGreed),
+             ]
+         ))
+     );
+     assert_eq!(
+         parser("i-sU:").parse_flags(),
+         Ok(flags_to(
+             4,
+             vec![
+                 flag_at(0, ast::Flag::CaseInsensitive),
+                 negation_at(1),
+                 flag_at(2, ast::Flag::DotMatchesNewLine),
+                 flag_at(3, ast::Flag::SwapGreed),
+             ]
+         ))
+     );
+
+     // Each row is (input, error span, error kind).
+     let rejected = vec![
+         ("isU", 3..3, ast::ErrorKind::FlagUnexpectedEof),
+         ("isUa:", 3..4, ast::ErrorKind::FlagUnrecognized),
+         (
+             "isUi:",
+             3..4,
+             ast::ErrorKind::FlagDuplicate { original: span(0..1) },
+         ),
+         (
+             "i-sU-i:",
+             4..5,
+             ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
+         ),
+         ("-)", 0..1, ast::ErrorKind::FlagDanglingNegation),
+         ("i-)", 1..2, ast::ErrorKind::FlagDanglingNegation),
+         ("iU-)", 2..3, ast::ErrorKind::FlagDanglingNegation),
+     ];
+     for (input, range, kind) in rejected {
+         assert_eq!(
+             parser(input).parse_flags().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ // `parse_flag` maps a single character to its flag; any other character,
+ // ASCII or not, is `FlagUnrecognized` with a span covering that character.
+ #[test]
+ fn parse_flag() {
+     let recognized = vec![
+         ("i", ast::Flag::CaseInsensitive),
+         ("m", ast::Flag::MultiLine),
+         ("s", ast::Flag::DotMatchesNewLine),
+         ("U", ast::Flag::SwapGreed),
+         ("u", ast::Flag::Unicode),
+         ("x", ast::Flag::IgnoreWhitespace),
+     ];
+     for (input, flag) in recognized {
+         assert_eq!(parser(input).parse_flag(), Ok(flag));
+     }
+
+     // The snowman is three bytes long, so its span is `0..3`.
+     let rejected = vec![("a", span(0..1)), ("☃", span_range("☃", 0..3))];
+     for (input, at) in rejected {
+         assert_eq!(
+             parser(input).parse_flag().unwrap_err(),
+             TestError { span: at, kind: ast::ErrorKind::FlagUnrecognized }
+         );
+     }
+ }
+
+ // `parse_primitive` on inputs that do not start with a backslash: the dot,
+ // the line anchors, and verbatim literals (`|` is a plain literal here).
+ #[test]
+ fn parse_primitive_non_escape() {
+     assert_eq!(
+         parser(r".").parse_primitive(),
+         Ok(Primitive::Dot(span(0..1)))
+     );
+
+     let anchors = vec![
+         (r"^", ast::AssertionKind::StartLine),
+         (r"$", ast::AssertionKind::EndLine),
+     ];
+     for (pat, kind) in anchors {
+         let expected =
+             Primitive::Assertion(ast::Assertion { span: span(0..1), kind });
+         assert_eq!(parser(pat).parse_primitive(), Ok(expected));
+     }
+
+     // Each row is (pattern, literal character, literal span).
+     let verbatim = vec![
+         (r"a", 'a', span(0..1)),
+         (r"|", '|', span(0..1)),
+         (r"☃", '☃', span_range("☃", 0..3)),
+     ];
+     for (pat, c, at) in verbatim {
+         let expected = Primitive::Literal(ast::Literal {
+             span: at,
+             kind: ast::LiteralKind::Verbatim,
+             c,
+         });
+         assert_eq!(parser(pat).parse_primitive(), Ok(expected));
+     }
+ }
+
+ // Two-byte backslash escapes: escaped punctuation, the special literals,
+ // the text and word-boundary assertions, and unusable escapes.
+ #[test]
+ fn parse_escape() {
+     let escaped_pipe = Primitive::Literal(ast::Literal {
+         span: span(0..2),
+         kind: ast::LiteralKind::Punctuation,
+         c: '|',
+     });
+     assert_eq!(parser(r"\|").parse_primitive(), Ok(escaped_pipe));
+
+     // Each row is (pattern, resulting character, kind of special literal).
+     let special_literals = vec![
+         (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
+         (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
+         (r"\t", '\t', ast::SpecialLiteralKind::Tab),
+         (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
+         (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
+         (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
+     ];
+     for (pat, c, special) in special_literals {
+         let expected = Primitive::Literal(ast::Literal {
+             span: span(0..2),
+             kind: ast::LiteralKind::Special(special),
+             c,
+         });
+         assert_eq!(parser(pat).parse_primitive(), Ok(expected));
+     }
+
+     let assertions = vec![
+         (r"\A", ast::AssertionKind::StartText),
+         (r"\z", ast::AssertionKind::EndText),
+         (r"\b", ast::AssertionKind::WordBoundary),
+         (r"\B", ast::AssertionKind::NotWordBoundary),
+     ];
+     for (pat, kind) in assertions {
+         let expected =
+             Primitive::Assertion(ast::Assertion { span: span(0..2), kind });
+         assert_eq!(parser(pat).parse_primitive(), Ok(expected));
+     }
+
+     // A lone backslash and an unknown escape letter are both errors.
+     let rejected = vec![
+         (r"\", 0..1, ast::ErrorKind::EscapeUnexpectedEof),
+         (r"\y", 0..2, ast::ErrorKind::EscapeUnrecognized),
+     ];
+     for (pat, range, kind) in rejected {
+         assert_eq!(
+             parser(pat).parse_escape().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ // With the default parser a backslash followed by a digit is rejected as an
+ // unsupported backreference; the error span covers both characters.
+ #[test]
+ fn parse_unsupported_backreference() {
+     for pat in vec![r"\0", r"\9"] {
+         let expected = TestError {
+             span: span(0..2),
+             kind: ast::ErrorKind::UnsupportedBackreference,
+         };
+         assert_eq!(parser(pat).parse_escape().unwrap_err(), expected);
+     }
+ }
+
+ // Octal escapes, parsed with `parser_octal`. At most three octal digits are
+ // consumed; anything after them is parsed as separate syntax.
+ #[test]
+ fn parse_octal() {
+     // Every value expressible in one to three octal digits, `\0` through
+     // `\777`. The range is inclusive so that the largest escape, `\777`
+     // (U+01FF), is covered too; the previous bound of `0..511` stopped one
+     // short of it.
+     for i in 0..=0o777 {
+         let pat = format!(r"\{:o}", i);
+         assert_eq!(
+             parser_octal(&pat).parse_escape(),
+             Ok(Primitive::Literal(ast::Literal {
+                 span: span(0..pat.len()),
+                 kind: ast::LiteralKind::Octal,
+                 c: ::std::char::from_u32(i).unwrap(),
+             }))
+         );
+     }
+
+     // `8` is not an octal digit, so only `\77` ('?') is consumed.
+     assert_eq!(
+         parser_octal(r"\778").parse_escape(),
+         Ok(Primitive::Literal(ast::Literal {
+             span: span(0..3),
+             kind: ast::LiteralKind::Octal,
+             c: '?',
+         }))
+     );
+     // A fourth octal digit is not part of the escape.
+     assert_eq!(
+         parser_octal(r"\7777").parse_escape(),
+         Ok(Primitive::Literal(ast::Literal {
+             span: span(0..4),
+             kind: ast::LiteralKind::Octal,
+             c: '\u{01FF}',
+         }))
+     );
+
+     // In a full parse the leftover character becomes a verbatim literal.
+     assert_eq!(
+         parser_octal(r"\778").parse(),
+         Ok(Ast::Concat(ast::Concat {
+             span: span(0..4),
+             asts: vec![
+                 Ast::Literal(ast::Literal {
+                     span: span(0..3),
+                     kind: ast::LiteralKind::Octal,
+                     c: '?',
+                 }),
+                 Ast::Literal(ast::Literal {
+                     span: span(3..4),
+                     kind: ast::LiteralKind::Verbatim,
+                     c: '8',
+                 }),
+             ],
+         }))
+     );
+     assert_eq!(
+         parser_octal(r"\7777").parse(),
+         Ok(Ast::Concat(ast::Concat {
+             span: span(0..5),
+             asts: vec![
+                 Ast::Literal(ast::Literal {
+                     span: span(0..4),
+                     kind: ast::LiteralKind::Octal,
+                     c: '\u{01FF}',
+                 }),
+                 Ast::Literal(ast::Literal {
+                     span: span(4..5),
+                     kind: ast::LiteralKind::Verbatim,
+                     c: '7',
+                 }),
+             ],
+         }))
+     );
+
+     // A non-octal digit directly after the backslash is not an escape.
+     assert_eq!(
+         parser_octal(r"\8").parse_escape().unwrap_err(),
+         TestError {
+             span: span(0..2),
+             kind: ast::ErrorKind::EscapeUnrecognized,
+         }
+     );
+ }
+
+ // Two-digit hex escapes (`\xNN`): every value from 0x00 to 0xFF, then
+ // truncated input and non-hex digits.
+ #[test]
+ fn parse_hex_two() {
+     for code in 0..256 {
+         let pat = format!(r"\x{:02x}", code);
+         let expected = Primitive::Literal(ast::Literal {
+             span: span(0..pat.len()),
+             kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
+             c: ::std::char::from_u32(code).unwrap(),
+         });
+         assert_eq!(parser(&pat).parse_escape(), Ok(expected));
+     }
+
+     // Each row is (pattern, error span, error kind).
+     let rejected = vec![
+         (r"\xF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
+         (r"\xG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
+         (r"\xFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
+     ];
+     for (pat, range, kind) in rejected {
+         assert_eq!(
+             parser(pat).parse_escape().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ // Four-digit hex escapes (`\uNNNN`): every value below 0x10000 that is a
+ // valid `char`, then truncated input, non-hex digits, and a value that is
+ // not a valid `char`.
+ #[test]
+ fn parse_hex_four() {
+     // Values for which `from_u32` returns `None` are skipped.
+     let scalars = (0..65536)
+         .filter_map(|code| ::std::char::from_u32(code).map(|c| (code, c)));
+     for (code, c) in scalars {
+         let pat = format!(r"\u{:04x}", code);
+         let expected = Primitive::Literal(ast::Literal {
+             span: span(0..pat.len()),
+             kind: ast::LiteralKind::HexFixed(
+                 ast::HexLiteralKind::UnicodeShort,
+             ),
+             c,
+         });
+         assert_eq!(parser(&pat).parse_escape(), Ok(expected));
+     }
+
+     // Each row is (pattern, error span, error kind).
+     let rejected = vec![
+         (r"\uF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
+         (r"\uG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
+         (r"\uFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
+         (r"\uFFG", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
+         (r"\uFFFG", 5..6, ast::ErrorKind::EscapeHexInvalidDigit),
+         (r"\uD800", 2..6, ast::ErrorKind::EscapeHexInvalid),
+     ];
+     for (pat, range, kind) in rejected {
+         assert_eq!(
+             parser(pat).parse_escape().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ /// Exercises the eight-digit `\UNNNNNNNN` escape on every value below 65536
+ /// that is a valid `char`, and on truncated or non-hex input.
+ #[test]
+ fn parse_hex_eight() {
+     for code in 0..65536 {
+         // Values `from_u32` rejects have no `char` to expect; skip them.
+         if let Some(c) = ::std::char::from_u32(code) {
+             let pattern = format!(r"\U{:08x}", code);
+             let expected = Primitive::Literal(ast::Literal {
+                 span: span(0..pattern.len()),
+                 kind: ast::LiteralKind::HexFixed(
+                     ast::HexLiteralKind::UnicodeLong
+                 ),
+                 c,
+             });
+             assert_eq!(parser(&pattern).parse_escape(), Ok(expected));
+         }
+     }
+
+     assert_eq!(
+         parser(r"\UF").parse_escape().unwrap_err(),
+         TestError {
+             span: span(3..3),
+             kind: ast::ErrorKind::EscapeUnexpectedEof,
+         }
+     );
+
+     // A `G` after zero through seven valid digits (`\UG` .. `\UFFFFFFFG`)
+     // is reported at its own one-byte span, right after `\U` and the digits.
+     for digits in 0..8 {
+         let pattern = format!(r"\U{}G", "F".repeat(digits));
+         assert_eq!(
+             parser(&pattern).parse_escape().unwrap_err(),
+             TestError {
+                 span: span(digits + 2..digits + 3),
+                 kind: ast::ErrorKind::EscapeHexInvalidDigit,
+             }
+         );
+     }
+ }
+
+ /// Exercises the braced hex escapes `\x{..}`, `\u{..}` and `\U{..}`: the
+ /// literal each one yields, and the error for unterminated, empty, non-hex
+ /// and out-of-range contents.
+ #[test]
+ fn parse_hex_brace() {
+     // (pattern, end of the literal's span, escape flavor, decoded char)
+     let accepted = vec![
+         (r"\u{26c4}", 8, ast::HexLiteralKind::UnicodeShort, '⛄'),
+         (r"\U{26c4}", 8, ast::HexLiteralKind::UnicodeLong, '⛄'),
+         (r"\x{26c4}", 8, ast::HexLiteralKind::X, '⛄'),
+         (r"\x{26C4}", 8, ast::HexLiteralKind::X, '⛄'),
+         (r"\x{10fFfF}", 10, ast::HexLiteralKind::X, '\u{10FFFF}'),
+     ];
+     for (pattern, end, flavor, c) in accepted {
+         assert_eq!(
+             parser(pattern).parse_escape(),
+             Ok(Primitive::Literal(ast::Literal {
+                 span: span(0..end),
+                 kind: ast::LiteralKind::HexBrace(flavor),
+                 c,
+             }))
+         );
+     }
+
+     // (pattern, expected error span, expected error kind)
+     let rejected = vec![
+         (r"\x", 2..2, ast::ErrorKind::EscapeUnexpectedEof),
+         (r"\x{", 2..3, ast::ErrorKind::EscapeUnexpectedEof),
+         (r"\x{FF", 2..5, ast::ErrorKind::EscapeUnexpectedEof),
+         (r"\x{}", 2..4, ast::ErrorKind::EscapeHexEmpty),
+         (r"\x{FGF}", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
+         (r"\x{FFFFFF}", 3..9, ast::ErrorKind::EscapeHexInvalid),
+         (r"\x{D800}", 3..7, ast::ErrorKind::EscapeHexInvalid),
+         (r"\x{FFFFFFFFF}", 3..12, ast::ErrorKind::EscapeHexInvalid),
+     ];
+     for (pattern, range, kind) in rejected {
+         assert_eq!(
+             parser(pattern).parse_escape().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ /// Exercises `parse_decimal` on plain digit runs (including a leading zero)
+ /// and on input with no leading digit or a value it reports as invalid.
+ #[test]
+ fn parse_decimal() {
+     for (input, value) in vec![("123", 123), ("0", 0), ("01", 1)] {
+         assert_eq!(parser(input).parse_decimal(), Ok(value));
+     }
+
+     // (input, expected error span, expected error kind)
+     let rejected = vec![
+         ("-1", 0..0, ast::ErrorKind::DecimalEmpty),
+         ("", 0..0, ast::ErrorKind::DecimalEmpty),
+         ("9999999999", 0..10, ast::ErrorKind::DecimalInvalid),
+     ];
+     for (input, range, kind) in rejected {
+         assert_eq!(
+             parser(input).parse_decimal().unwrap_err(),
+             TestError { span: span(range), kind }
+         );
+     }
+ }
+
+ #[test] // Parses whole bracketed classes and compares them with hand-built ASTs, spans included.
+ fn parse_set_class() {
+ fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet { // expected union, built via `ast::ClassSet::union`
+ ast::ClassSet::union(ast::ClassSetUnion { span, items })
+ }
+
+ fn intersection( // expected `lhs&&rhs` node
+ span: Span,
+ lhs: ast::ClassSet,
+ rhs: ast::ClassSet,
+ ) -> ast::ClassSet {
+ ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
+ span,
+ kind: ast::ClassSetBinaryOpKind::Intersection,
+ lhs: Box::new(lhs),
+ rhs: Box::new(rhs),
+ })
+ }
+
+ fn difference( // expected `lhs--rhs` node
+ span: Span,
+ lhs: ast::ClassSet,
+ rhs: ast::ClassSet,
+ ) -> ast::ClassSet {
+ ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
+ span,
+ kind: ast::ClassSetBinaryOpKind::Difference,
+ lhs: Box::new(lhs),
+ rhs: Box::new(rhs),
+ })
+ }
+
+ fn symdifference( // expected `lhs~~rhs` node
+ span: Span,
+ lhs: ast::ClassSet,
+ rhs: ast::ClassSet,
+ ) -> ast::ClassSet {
+ ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
+ span,
+ kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
+ lhs: Box::new(lhs),
+ rhs: Box::new(rhs),
+ })
+ }
+
+ fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { // a set holding exactly one item
+ ast::ClassSet::Item(item)
+ }
+
+ fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
+ ast::ClassSetItem::Ascii(cls)
+ }
+
+ fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
+ ast::ClassSetItem::Unicode(cls)
+ }
+
+ fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
+ ast::ClassSetItem::Perl(cls)
+ }
+
+ fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { // nested bracketed class, boxed as the variant requires
+ ast::ClassSetItem::Bracketed(Box::new(cls))
+ }
+
+ fn lit(span: Span, c: char) -> ast::ClassSetItem { // verbatim (unescaped) literal item
+ ast::ClassSetItem::Literal(ast::Literal {
+ span,
+ kind: ast::LiteralKind::Verbatim,
+ c,
+ })
+ }
+
+ fn empty(span: Span) -> ast::ClassSetItem {
+ ast::ClassSetItem::Empty(span)
+ }
+
+ fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { // expected `start-end` range of verbatim literals; `span` covers the whole range
+ let pos1 = Position { // end of the first literal: one column and `start`'s UTF-8 width past the range start
+ offset: span.start.offset + start.len_utf8(),
+ column: span.start.column + 1,
+ ..span.start
+ };
+ let pos2 = Position { // start of the last literal: one column and `end`'s UTF-8 width before the range end
+ offset: span.end.offset - end.len_utf8(),
+ column: span.end.column - 1,
+ ..span.end
+ };
+ ast::ClassSetItem::Range(ast::ClassSetRange {
+ span,
+ start: ast::Literal {
+ span: Span { end: pos1, ..span },
+ kind: ast::LiteralKind::Verbatim,
+ c: start,
+ },
+ end: ast::Literal {
+ span: Span { start: pos2, ..span },
+ kind: ast::LiteralKind::Verbatim,
+ c: end,
+ },
+ })
+ }
+
+ fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
+ ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated }
+ }
+
+ fn lower(span: Span, negated: bool) -> ast::ClassAscii {
+ ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated }
+ }
+ // ASCII classes: alone, nested in another bracket, and as operands of `&&`, `--` and `~~`.
+ assert_eq!(
+ parser("[[:alnum:]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..11),
+ negated: false,
+ kind: itemset(item_ascii(alnum(span(1..10), false))),
+ })))
+ );
+ assert_eq!(
+ parser("[[[:alnum:]]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..13),
+ negated: false,
+ kind: itemset(item_bracket(ast::ClassBracketed {
+ span: span(1..12),
+ negated: false,
+ kind: itemset(item_ascii(alnum(span(2..11), false))),
+ })),
+ })))
+ );
+ assert_eq!(
+ parser("[[:alnum:]&&[:lower:]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..22),
+ negated: false,
+ kind: intersection(
+ span(1..21),
+ itemset(item_ascii(alnum(span(1..10), false))),
+ itemset(item_ascii(lower(span(12..21), false))),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser("[[:alnum:]--[:lower:]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..22),
+ negated: false,
+ kind: difference(
+ span(1..21),
+ itemset(item_ascii(alnum(span(1..10), false))),
+ itemset(item_ascii(lower(span(12..21), false))),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser("[[:alnum:]~~[:lower:]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..22),
+ negated: false,
+ kind: symdifference(
+ span(1..21),
+ itemset(item_ascii(alnum(span(1..10), false))),
+ itemset(item_ascii(lower(span(12..21), false))),
+ ),
+ })))
+ );
+ // Literals, escaped punctuation, a dash at either edge, and Unicode/Perl classes as set items.
+ assert_eq!(
+ parser("[a]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..3),
+ negated: false,
+ kind: itemset(lit(span(1..2), 'a')),
+ })))
+ );
+ assert_eq!(
+ parser(r"[a\]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..5),
+ negated: false,
+ kind: union(
+ span(1..4),
+ vec![
+ lit(span(1..2), 'a'),
+ ast::ClassSetItem::Literal(ast::Literal {
+ span: span(2..4),
+ kind: ast::LiteralKind::Punctuation,
+ c: ']',
+ }),
+ ]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[a\-z]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..6),
+ negated: false,
+ kind: union(
+ span(1..5),
+ vec![
+ lit(span(1..2), 'a'),
+ ast::ClassSetItem::Literal(ast::Literal {
+ span: span(2..4),
+ kind: ast::LiteralKind::Punctuation,
+ c: '-',
+ }),
+ lit(span(4..5), 'z'),
+ ]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser("[ab]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..4),
+ negated: false,
+ kind: union(
+ span(1..3),
+ vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser("[a-]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..4),
+ negated: false,
+ kind: union(
+ span(1..3),
+ vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser("[-a]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..4),
+ negated: false,
+ kind: union(
+ span(1..3),
+ vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[\pL]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..5),
+ negated: false,
+ kind: itemset(item_unicode(ast::ClassUnicode {
+ span: span(1..4),
+ negated: false,
+ kind: ast::ClassUnicodeKind::OneLetter('L'),
+ })),
+ })))
+ );
+ assert_eq!(
+ parser(r"[\w]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..4),
+ negated: false,
+ kind: itemset(item_perl(ast::ClassPerl {
+ span: span(1..3),
+ kind: ast::ClassPerlKind::Word,
+ negated: false,
+ })),
+ })))
+ );
+ assert_eq!(
+ parser(r"[a\wz]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..6),
+ negated: false,
+ kind: union(
+ span(1..5),
+ vec![
+ lit(span(1..2), 'a'),
+ item_perl(ast::ClassPerl {
+ span: span(2..4),
+ kind: ast::ClassPerlKind::Word,
+ negated: false,
+ }),
+ lit(span(4..5), 'z'),
+ ]
+ ),
+ })))
+ );
+ // Ranges, and binary operators over unions; chained `--` and `~~` nest to the left.
+ assert_eq!(
+ parser("[a-z]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..5),
+ negated: false,
+ kind: itemset(range(span(1..4), 'a', 'z')),
+ })))
+ );
+ assert_eq!(
+ parser("[a-cx-z]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..8),
+ negated: false,
+ kind: union(
+ span(1..7),
+ vec![
+ range(span(1..4), 'a', 'c'),
+ range(span(4..7), 'x', 'z'),
+ ]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[\w&&a-cx-z]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..12),
+ negated: false,
+ kind: intersection(
+ span(1..11),
+ itemset(item_perl(ast::ClassPerl {
+ span: span(1..3),
+ kind: ast::ClassPerlKind::Word,
+ negated: false,
+ })),
+ union(
+ span(5..11),
+ vec![
+ range(span(5..8), 'a', 'c'),
+ range(span(8..11), 'x', 'z'),
+ ]
+ ),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[a-cx-z&&\w]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..12),
+ negated: false,
+ kind: intersection(
+ span(1..11),
+ union(
+ span(1..7),
+ vec![
+ range(span(1..4), 'a', 'c'),
+ range(span(4..7), 'x', 'z'),
+ ]
+ ),
+ itemset(item_perl(ast::ClassPerl {
+ span: span(9..11),
+ kind: ast::ClassPerlKind::Word,
+ negated: false,
+ })),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[a--b--c]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..9),
+ negated: false,
+ kind: difference(
+ span(1..8),
+ difference(
+ span(1..5),
+ itemset(lit(span(1..2), 'a')),
+ itemset(lit(span(4..5), 'b')),
+ ),
+ itemset(lit(span(7..8), 'c')),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[a~~b~~c]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..9),
+ negated: false,
+ kind: symdifference(
+ span(1..8),
+ symdifference(
+ span(1..5),
+ itemset(lit(span(1..2), 'a')),
+ itemset(lit(span(4..5), 'b')),
+ ),
+ itemset(lit(span(7..8), 'c')),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[\^&&^]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..7),
+ negated: false,
+ kind: intersection(
+ span(1..6),
+ itemset(ast::ClassSetItem::Literal(ast::Literal {
+ span: span(1..3),
+ kind: ast::LiteralKind::Punctuation,
+ c: '^',
+ })),
+ itemset(lit(span(5..6), '^')),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[\&&&&]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..7),
+ negated: false,
+ kind: intersection(
+ span(1..6),
+ itemset(ast::ClassSetItem::Literal(ast::Literal {
+ span: span(1..3),
+ kind: ast::LiteralKind::Punctuation,
+ c: '&',
+ })),
+ itemset(lit(span(5..6), '&')),
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[&&&&]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..6),
+ negated: false,
+ kind: intersection(
+ span(1..5),
+ intersection(
+ span(1..3),
+ itemset(empty(span(1..1))),
+ itemset(empty(span(3..3))),
+ ),
+ itemset(empty(span(5..5))),
+ ),
+ })))
+ );
+ // Multi-byte literals: spans are built with `span_range(pat, ..)`; each snowman occupies three bytes.
+ let pat = "[☃-⛄]";
+ assert_eq!(
+ parser(pat).parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span_range(pat, 0..9),
+ negated: false,
+ kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
+ span: span_range(pat, 1..8),
+ start: ast::Literal {
+ span: span_range(pat, 1..4),
+ kind: ast::LiteralKind::Verbatim,
+ c: '☃',
+ },
+ end: ast::Literal {
+ span: span_range(pat, 5..8),
+ kind: ast::LiteralKind::Verbatim,
+ c: '⛄',
+ },
+ })),
+ })))
+ );
+ // A `]` right after the opening bracket is a literal; an escaped `[` does not open a nested class.
+ assert_eq!(
+ parser(r"[]]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..3),
+ negated: false,
+ kind: itemset(lit(span(1..2), ']')),
+ })))
+ );
+ assert_eq!(
+ parser(r"[]\[]").parse(),
+ Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..5),
+ negated: false,
+ kind: union(
+ span(1..4),
+ vec![
+ lit(span(1..2), ']'),
+ ast::ClassSetItem::Literal(ast::Literal {
+ span: span(2..4),
+ kind: ast::LiteralKind::Punctuation,
+ c: '[',
+ }),
+ ]
+ ),
+ })))
+ );
+ assert_eq!(
+ parser(r"[\[]]").parse(),
+ Ok(concat(
+ 0..5,
+ vec![
+ Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ span: span(0..4),
+ negated: false,
+ kind: itemset(ast::ClassSetItem::Literal(
+ ast::Literal {
+ span: span(1..3),
+ kind: ast::LiteralKind::Punctuation,
+ c: '[',
+ }
+ )),
+ })),
+ Ast::Literal(ast::Literal {
+ span: span(4..5),
+ kind: ast::LiteralKind::Verbatim,
+ c: ']',
+ }),
+ ]
+ ))
+ );
+ // Malformed classes, and the span each error is reported at.
+ assert_eq!(
+ parser("[").parse().unwrap_err(),
+ TestError {
+ span: span(0..1),
+ kind: ast::ErrorKind::ClassUnclosed,
+ }
+ );
+ assert_eq!(
+ parser("[[").parse().unwrap_err(),
+ TestError {
+ span: span(1..2),
+ kind: ast::ErrorKind::ClassUnclosed,
+ }
+ );
+ assert_eq!(
+ parser("[[-]").parse().unwrap_err(),
+ TestError {
+ span: span(0..1),
+ kind: ast::ErrorKind::ClassUnclosed,
+ }
+ );
+ assert_eq!(
+ parser("[[[:alnum:]").parse().unwrap_err(),
+ TestError {
+ span: span(1..2),
+ kind: ast::ErrorKind::ClassUnclosed,
+ }
+ );
+ assert_eq!(
+ parser(r"[\b]").parse().unwrap_err(),
+ TestError {
+ span: span(1..3),
+ kind: ast::ErrorKind::ClassEscapeInvalid,
+ }
+ );
+ assert_eq!(
+ parser(r"[\w-a]").parse().unwrap_err(),
+ TestError {
+ span: span(1..3),
+ kind: ast::ErrorKind::ClassRangeLiteral,
+ }
+ );
+ assert_eq!(
+ parser(r"[a-\w]").parse().unwrap_err(),
+ TestError {
+ span: span(3..5),
+ kind: ast::ErrorKind::ClassRangeLiteral,
+ }
+ );
+ assert_eq!(
+ parser(r"[z-a]").parse().unwrap_err(),
+ TestError {
+ span: span(1..4),
+ kind: ast::ErrorKind::ClassRangeInvalid,
+ }
+ );
+ // With `parser_ignore_whitespace`, an unclosed class is still reported at the opening bracket.
+ assert_eq!(
+ parser_ignore_whitespace("[a ").parse().unwrap_err(),
+ TestError {
+ span: span(0..1),
+ kind: ast::ErrorKind::ClassUnclosed,
+ }
+ );
+ assert_eq!(
+ parser_ignore_whitespace("[a- ").parse().unwrap_err(),
+ TestError {
+ span: span(0..1),
+ kind: ast::ErrorKind::ClassUnclosed,
+ }
+ );
+ }
+
+ /// Checks `parse_set_class_open`: the bracketed-class header it returns, the
+ /// union it seeds when the class starts with `-` or `]`, and the error for
+ /// input that ends before any class content.
+ ///
+ /// The whitespace-mode inputs depend on their exact run of spaces. An
+ /// expected header span of `0..4` with the union at `4..4` puts `a` at
+ /// offset 4 (`[` plus three spaces), and an unclosed-class span of `0..5`
+ /// covers five bytes of input (`[` plus four spaces).
+ #[test]
+ fn parse_set_class_open() {
+     // The `(set, union)` pair expected from a successful open. `kind_span`
+     // is the span of the still-empty union stored inside the set itself;
+     // `union_span` and `items` describe the union returned next to it.
+     fn opened(
+         set_span: Span,
+         negated: bool,
+         kind_span: Span,
+         union_span: Span,
+         items: Vec<ast::ClassSetItem>,
+     ) -> (ast::ClassBracketed, ast::ClassSetUnion) {
+         let set = ast::ClassBracketed {
+             span: set_span,
+             negated,
+             kind: ast::ClassSet::union(ast::ClassSetUnion {
+                 span: kind_span,
+                 items: vec![],
+             }),
+         };
+         (set, ast::ClassSetUnion { span: union_span, items })
+     }
+
+     // A verbatim literal item, as seeded for a leading `-` or `]`.
+     fn lit(span: Span, c: char) -> ast::ClassSetItem {
+         ast::ClassSetItem::Literal(ast::Literal {
+             span,
+             kind: ast::LiteralKind::Verbatim,
+             c,
+         })
+     }
+
+     // Plain and negated opens: nothing is seeded into the union.
+     assert_eq!(
+         parser("[a]").parse_set_class_open(),
+         Ok(opened(span(0..1), false, span(1..1), span(1..1), vec![]))
+     );
+     assert_eq!(
+         parser_ignore_whitespace("[   a]").parse_set_class_open(),
+         Ok(opened(span(0..4), false, span(4..4), span(4..4), vec![]))
+     );
+     assert_eq!(
+         parser("[^a]").parse_set_class_open(),
+         Ok(opened(span(0..2), true, span(2..2), span(2..2), vec![]))
+     );
+     assert_eq!(
+         parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
+         Ok(opened(span(0..4), true, span(4..4), span(4..4), vec![]))
+     );
+
+     // Leading dashes are consumed as literals while opening.
+     assert_eq!(
+         parser("[-a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..2),
+             false,
+             span(1..1),
+             span(1..2),
+             vec![lit(span(1..2), '-')]
+         ))
+     );
+     assert_eq!(
+         parser_ignore_whitespace("[ - a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..4),
+             false,
+             span(2..2),
+             span(2..3),
+             vec![lit(span(2..3), '-')]
+         ))
+     );
+     assert_eq!(
+         parser("[^-a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..3),
+             true,
+             span(2..2),
+             span(2..3),
+             vec![lit(span(2..3), '-')]
+         ))
+     );
+     assert_eq!(
+         parser("[--a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..3),
+             false,
+             span(1..1),
+             span(1..3),
+             vec![lit(span(1..2), '-'), lit(span(2..3), '-')]
+         ))
+     );
+
+     // A `]` in first position is a literal, not the end of the class.
+     assert_eq!(
+         parser("[]a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..2),
+             false,
+             span(1..1),
+             span(1..2),
+             vec![lit(span(1..2), ']')]
+         ))
+     );
+     assert_eq!(
+         parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..4),
+             false,
+             span(2..2),
+             span(2..3),
+             vec![lit(span(2..3), ']')]
+         ))
+     );
+     assert_eq!(
+         parser("[^]a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..3),
+             true,
+             span(2..2),
+             span(2..3),
+             vec![lit(span(2..3), ']')]
+         ))
+     );
+     // After a leading dash, `]` is no longer in first position and is left
+     // for the caller: only the dash is seeded.
+     assert_eq!(
+         parser("[-]a]").parse_set_class_open(),
+         Ok(opened(
+             span(0..2),
+             false,
+             span(1..1),
+             span(1..2),
+             vec![lit(span(1..2), '-')]
+         ))
+     );
+
+     // Input that ends while the class is being opened.
+     // (pattern, expected error span)
+     let unclosed = vec![
+         ("[", 0..1),
+         ("[^", 0..2),
+         ("[]", 0..2),
+         ("[-", 0..0),
+         ("[--", 0..0),
+     ];
+     for (pattern, range) in unclosed {
+         assert_eq!(
+             parser(pattern).parse_set_class_open().unwrap_err(),
+             TestError {
+                 span: span(range),
+                 kind: ast::ErrorKind::ClassUnclosed,
+             }
+         );
+     }
+     assert_eq!(
+         parser_ignore_whitespace("[    ")
+             .parse_set_class_open()
+             .unwrap_err(),
+         TestError {
+             span: span(0..5),
+             kind: ast::ErrorKind::ClassUnclosed,
+         }
+     );
+
+     // See: https://github.com/rust-lang/regex/issues/792
+     assert_eq!(
+         parser("(?x)[-#]").parse_with_comments().unwrap_err(),
+         TestError {
+             span: span(4..4),
+             kind: ast::ErrorKind::ClassUnclosed,
+         }
+     );
+ }
+
+ /// Exercises `maybe_parse_ascii_class`: well-formed `[:name:]` classes are
+ /// returned with their span, and anything else yields `None` without
+ /// moving the parser.
+ #[test]
+ fn maybe_parse_ascii_class() {
+     // (pattern, end of the class span, negated)
+     let recognized = vec![
+         (r"[:alnum:]", 9, false),
+         (r"[:alnum:]A", 9, false),
+         (r"[:^alnum:]", 10, true),
+     ];
+     for (pattern, end, negated) in recognized {
+         assert_eq!(
+             parser(pattern).maybe_parse_ascii_class(),
+             Some(ast::ClassAscii {
+                 span: span(0..end),
+                 kind: ast::ClassAsciiKind::Alnum,
+                 negated,
+             })
+         );
+     }
+
+     // Truncated, misspelled or misordered input: no class, and the parser
+     // offset is still 0 afterwards.
+     let unrecognized = vec![
+         r"[:",
+         r"[:^",
+         r"[^:alnum:]",
+         r"[:alnnum:]",
+         r"[:alnum]",
+         r"[:alnum:",
+     ];
+     for pattern in unrecognized {
+         let p = parser(pattern);
+         assert_eq!(p.maybe_parse_ascii_class(), None);
+         assert_eq!(p.offset(), 0);
+     }
+ }
+
+ /// Exercises `\p`/`\P` Unicode class escapes: one-letter, braced-name and
+ /// `name<op>value` forms, unterminated input, use inside a larger pattern,
+ /// and an escaped brace where a class name is required.
+ #[test]
+ fn parse_unicode_class() {
+     // Expected kind for the `\p{name<op>value}` forms.
+     fn named_value(
+         op: ast::ClassUnicodeOpKind,
+         name: &'static str,
+         value: &'static str,
+     ) -> ast::ClassUnicodeKind {
+         ast::ClassUnicodeKind::NamedValue {
+             op,
+             name: s(name),
+             value: s(value),
+         }
+     }
+
+     // (pattern, end of the class span, negated, expected kind)
+     let accepted = vec![
+         (r"\pN", 3, false, ast::ClassUnicodeKind::OneLetter('N')),
+         (r"\PN", 3, true, ast::ClassUnicodeKind::OneLetter('N')),
+         (r"\p{N}", 5, false, ast::ClassUnicodeKind::Named(s("N"))),
+         (r"\P{N}", 5, true, ast::ClassUnicodeKind::Named(s("N"))),
+         (r"\p{Greek}", 9, false, ast::ClassUnicodeKind::Named(s("Greek"))),
+         (
+             r"\p{scx:Katakana}",
+             16,
+             false,
+             named_value(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana"),
+         ),
+         (
+             r"\p{scx=Katakana}",
+             16,
+             false,
+             named_value(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana"),
+         ),
+         (
+             r"\p{scx!=Katakana}",
+             17,
+             false,
+             named_value(ast::ClassUnicodeOpKind::NotEqual, "scx", "Katakana"),
+         ),
+         // An operator with nothing around it still parses, with an empty
+         // name and an empty value.
+         (r"\p{:}", 5, false, named_value(ast::ClassUnicodeOpKind::Colon, "", "")),
+         (r"\p{=}", 5, false, named_value(ast::ClassUnicodeOpKind::Equal, "", "")),
+         (
+             r"\p{!=}",
+             6,
+             false,
+             named_value(ast::ClassUnicodeOpKind::NotEqual, "", ""),
+         ),
+     ];
+     for (pattern, end, negated, kind) in accepted {
+         assert_eq!(
+             parser(pattern).parse_escape(),
+             Ok(Primitive::Unicode(ast::ClassUnicode {
+                 span: span(0..end),
+                 negated,
+                 kind,
+             }))
+         );
+     }
+
+     // Input that ends inside the escape is reported at the end of input.
+     for (pattern, at) in
+         vec![(r"\p", 2), (r"\p{", 3), (r"\p{N", 4), (r"\p{Greek", 8)]
+     {
+         assert_eq!(
+             parser(pattern).parse_escape().unwrap_err(),
+             TestError {
+                 span: span(at..at),
+                 kind: ast::ErrorKind::EscapeUnexpectedEof,
+             }
+         );
+     }
+
+     // A class followed by a literal becomes a two-element concatenation.
+     let one_letter = Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
+         span: span(0..3),
+         negated: false,
+         kind: ast::ClassUnicodeKind::OneLetter('N'),
+     }));
+     let z_after_one_letter = Ast::Literal(ast::Literal {
+         span: span(3..4),
+         kind: ast::LiteralKind::Verbatim,
+         c: 'z',
+     });
+     assert_eq!(
+         parser(r"\pNz").parse(),
+         Ok(Ast::Concat(ast::Concat {
+             span: span(0..4),
+             asts: vec![one_letter, z_after_one_letter],
+         }))
+     );
+
+     let greek = Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
+         span: span(0..9),
+         negated: false,
+         kind: ast::ClassUnicodeKind::Named(s("Greek")),
+     }));
+     let z_after_greek = Ast::Literal(ast::Literal {
+         span: span(9..10),
+         kind: ast::LiteralKind::Verbatim,
+         c: 'z',
+     });
+     assert_eq!(
+         parser(r"\p{Greek}z").parse(),
+         Ok(Ast::Concat(ast::Concat {
+             span: span(0..10),
+             asts: vec![greek, z_after_greek],
+         }))
+     );
+
+     // A backslash where the class name should be is rejected at that byte.
+     for pattern in vec![r"\p\{", r"\P\{"] {
+         assert_eq!(
+             parser(pattern).parse().unwrap_err(),
+             TestError {
+                 span: span(2..3),
+                 kind: ast::ErrorKind::UnicodeClassInvalid,
+             }
+         );
+     }
+ }
+
+ /// Exercises the Perl class escapes `\d`, `\s`, `\w` and their negated
+ /// upper-case forms, both as a bare escape and inside a full pattern.
+ #[test]
+ fn parse_perl_class() {
+     // (escape, class kind, negated); each escape spans bytes 0..2.
+     let escapes = vec![
+         (r"\d", ast::ClassPerlKind::Digit, false),
+         (r"\D", ast::ClassPerlKind::Digit, true),
+         (r"\s", ast::ClassPerlKind::Space, false),
+         (r"\S", ast::ClassPerlKind::Space, true),
+         (r"\w", ast::ClassPerlKind::Word, false),
+         (r"\W", ast::ClassPerlKind::Word, true),
+     ];
+     for (pattern, kind, negated) in escapes {
+         assert_eq!(
+             parser(pattern).parse_escape(),
+             Ok(Primitive::Perl(ast::ClassPerl {
+                 span: span(0..2),
+                 kind,
+                 negated,
+             }))
+         );
+     }
+
+     // The AST node expected for a leading `\d` in a full parse.
+     let digit = || {
+         Ast::Class(ast::Class::Perl(ast::ClassPerl {
+             span: span(0..2),
+             kind: ast::ClassPerlKind::Digit,
+             negated: false,
+         }))
+     };
+     assert_eq!(parser(r"\d").parse(), Ok(digit()));
+
+     let z = Ast::Literal(ast::Literal {
+         span: span(2..3),
+         kind: ast::LiteralKind::Verbatim,
+         c: 'z',
+     });
+     assert_eq!(
+         parser(r"\dz").parse(),
+         Ok(Ast::Concat(ast::Concat {
+             span: span(0..3),
+             asts: vec![digit(), z],
+         }))
+     );
+ }
+
+ // This tests a bug fix where the nest limit checker wasn't decrementing
+ // its depth during post-traversal, which caused long regexes to trip
+ // the default limit too aggressively.
+ #[test]
+ fn regression_454_nest_too_big() {
+ let pattern = r#"
+ 2(?:
+ [45]\d{3}|
+ 7(?:
+ 1[0-267]|
+ 2[0-289]|
+ 3[0-29]|
+ 4[01]|
+ 5[1-3]|
+ 6[013]|
+ 7[0178]|
+ 91
+ )|
+ 8(?:
+ 0[125]|
+ [139][1-6]|
+ 2[0157-9]|
+ 41|
+ 6[1-35]|
+ 7[1-5]|
+ 8[1-8]|
+ 90
+ )|
+ 9(?:
+ 0[0-2]|
+ 1[0-4]|
+ 2[568]|
+ 3[3-6]|
+ 5[5-7]|
+ 6[0167]|
+ 7[15]|
+ 8[0146-9]
+ )
+ )\d{4}
+ "#;
+ assert!(parser_nest_limit(pattern, 50).parse().is_ok()); // long but shallow: must parse with the nest limit set to 50
+ }
+
+ // This tests that we treat a trailing `-` in a character class as a
+ // literal `-` even when whitespace mode is enabled and there is whitespace
+ // after the trailing `-`.
+ #[test]
+ fn regression_455_trailing_dash_ignore_whitespace() {
+ assert!(parser("(?x)[ / - ]").parse().is_ok()); // whitespace mode is switched on by the `(?x)` prefix in each pattern
+ assert!(parser("(?x)[ a - ]").parse().is_ok());
+ assert!(parser(
+ "(?x)[
+ a
+ - ]
+ "
+ )
+ .parse()
+ .is_ok());
+ assert!(parser(
+ "(?x)[
+ a # wat
+ - ]
+ "
+ )
+ .parse()
+ .is_ok());
+ // The same classes without a closing `]` must still fail to parse.
+ assert!(parser("(?x)[ / -").parse().is_err());
+ assert!(parser("(?x)[ / - ").parse().is_err());
+ assert!(parser(
+ "(?x)[
+ / -
+ "
+ )
+ .parse()
+ .is_err());
+ assert!(parser(
+ "(?x)[
+ / - # wat
+ "
+ )
+ .parse()
+ .is_err());
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/ast/print.rs b/third_party/rust/regex-syntax/src/ast/print.rs
new file mode 100644
index 0000000000..045de2eaf2
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/ast/print.rs
@@ -0,0 +1,568 @@
+/*!
+This module provides a regular expression printer for `Ast`.
+*/
+
+use std::fmt;
+
+use crate::ast::visitor::{self, Visitor};
+use crate::ast::{self, Ast};
+
+/// A builder for constructing a printer.
+///
+/// Note that since a printer doesn't have any configuration knobs, this type
+/// remains unexported.
+#[derive(Clone, Debug)]
+struct PrinterBuilder {
+ _priv: (),
+}
+
+impl Default for PrinterBuilder {
+ fn default() -> PrinterBuilder {
+ PrinterBuilder::new()
+ }
+}
+
+impl PrinterBuilder {
+ fn new() -> PrinterBuilder {
+ PrinterBuilder { _priv: () }
+ }
+
+ fn build(&self) -> Printer {
+ Printer { _priv: () }
+ }
+}
+
+/// A printer for a regular expression abstract syntax tree.
+///
+/// A printer converts an abstract syntax tree (AST) to a regular expression
+/// pattern string. This particular printer uses constant stack space and heap
+/// space proportional to the size of the AST.
+///
+/// This printer will not necessarily preserve the original formatting of the
+/// regular expression pattern string. For example, all whitespace and comments
+/// are ignored.
+#[derive(Debug)]
+pub struct Printer {
+ _priv: (),
+}
+
+impl Printer {
+ /// Create a new printer.
+ pub fn new() -> Printer {
+ PrinterBuilder::new().build()
+ }
+
+ /// Print the given `Ast` to the given writer. The writer must implement
+ /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
+ /// here are a `fmt::Formatter` (which is available in `fmt::Display`
+ /// implementations) or a `&mut String`.
+ pub fn print<W: fmt::Write>(&mut self, ast: &Ast, wtr: W) -> fmt::Result {
+ visitor::visit(ast, Writer { wtr })
+ }
+}
+
+#[derive(Debug)]
+struct Writer<W> {
+ wtr: W,
+}
+
+impl<W: fmt::Write> Visitor for Writer<W> {
+ type Output = ();
+ type Err = fmt::Error;
+
+ fn finish(self) -> fmt::Result {
+ Ok(())
+ }
+
+ fn visit_pre(&mut self, ast: &Ast) -> fmt::Result {
+ match *ast {
+ Ast::Group(ref x) => self.fmt_group_pre(x),
+ Ast::Class(ast::Class::Bracketed(ref x)) => {
+ self.fmt_class_bracketed_pre(x)
+ }
+ _ => Ok(()),
+ }
+ }
+
+ fn visit_post(&mut self, ast: &Ast) -> fmt::Result {
+ use crate::ast::Class;
+
+ match *ast {
+ Ast::Empty(_) => Ok(()),
+ Ast::Flags(ref x) => self.fmt_set_flags(x),
+ Ast::Literal(ref x) => self.fmt_literal(x),
+ Ast::Dot(_) => self.wtr.write_str("."),
+ Ast::Assertion(ref x) => self.fmt_assertion(x),
+ Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x),
+ Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x),
+ Ast::Class(Class::Bracketed(ref x)) => {
+ self.fmt_class_bracketed_post(x)
+ }
+ Ast::Repetition(ref x) => self.fmt_repetition(x),
+ Ast::Group(ref x) => self.fmt_group_post(x),
+ Ast::Alternation(_) => Ok(()),
+ Ast::Concat(_) => Ok(()),
+ }
+ }
+
+ fn visit_alternation_in(&mut self) -> fmt::Result {
+ self.wtr.write_str("|")
+ }
+
+ fn visit_class_set_item_pre(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<(), Self::Err> {
+ match *ast {
+ ast::ClassSetItem::Bracketed(ref x) => {
+ self.fmt_class_bracketed_pre(x)
+ }
+ _ => Ok(()),
+ }
+ }
+
+ fn visit_class_set_item_post(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<(), Self::Err> {
+ use crate::ast::ClassSetItem::*;
+
+ match *ast {
+ Empty(_) => Ok(()),
+ Literal(ref x) => self.fmt_literal(x),
+ Range(ref x) => {
+ self.fmt_literal(&x.start)?;
+ self.wtr.write_str("-")?;
+ self.fmt_literal(&x.end)?;
+ Ok(())
+ }
+ Ascii(ref x) => self.fmt_class_ascii(x),
+ Unicode(ref x) => self.fmt_class_unicode(x),
+ Perl(ref x) => self.fmt_class_perl(x),
+ Bracketed(ref x) => self.fmt_class_bracketed_post(x),
+ Union(_) => Ok(()),
+ }
+ }
+
+ fn visit_class_set_binary_op_in(
+ &mut self,
+ ast: &ast::ClassSetBinaryOp,
+ ) -> Result<(), Self::Err> {
+ self.fmt_class_set_binary_op_kind(&ast.kind)
+ }
+}
+
+impl<W: fmt::Write> Writer<W> {
+ fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result {
+ use crate::ast::GroupKind::*;
+ match ast.kind {
+ CaptureIndex(_) => self.wtr.write_str("("),
+ CaptureName(ref x) => {
+ self.wtr.write_str("(?P<")?;
+ self.wtr.write_str(&x.name)?;
+ self.wtr.write_str(">")?;
+ Ok(())
+ }
+ NonCapturing(ref flags) => {
+ self.wtr.write_str("(?")?;
+ self.fmt_flags(flags)?;
+ self.wtr.write_str(":")?;
+ Ok(())
+ }
+ }
+ }
+
+ fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result {
+ self.wtr.write_str(")")
+ }
+
+ fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result {
+ use crate::ast::RepetitionKind::*;
+ match ast.op.kind {
+ ZeroOrOne if ast.greedy => self.wtr.write_str("?"),
+ ZeroOrOne => self.wtr.write_str("??"),
+ ZeroOrMore if ast.greedy => self.wtr.write_str("*"),
+ ZeroOrMore => self.wtr.write_str("*?"),
+ OneOrMore if ast.greedy => self.wtr.write_str("+"),
+ OneOrMore => self.wtr.write_str("+?"),
+ Range(ref x) => {
+ self.fmt_repetition_range(x)?;
+ if !ast.greedy {
+ self.wtr.write_str("?")?;
+ }
+ Ok(())
+ }
+ }
+ }
+
+ fn fmt_repetition_range(
+ &mut self,
+ ast: &ast::RepetitionRange,
+ ) -> fmt::Result {
+ use crate::ast::RepetitionRange::*;
+ match *ast {
+ Exactly(x) => write!(self.wtr, "{{{}}}", x),
+ AtLeast(x) => write!(self.wtr, "{{{},}}", x),
+ Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y),
+ }
+ }
+
+ fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result {
+ use crate::ast::LiteralKind::*;
+
+ match ast.kind {
+ Verbatim => self.wtr.write_char(ast.c),
+ Punctuation => write!(self.wtr, r"\{}", ast.c),
+ Octal => write!(self.wtr, r"\{:o}", ast.c as u32),
+ HexFixed(ast::HexLiteralKind::X) => {
+ write!(self.wtr, r"\x{:02X}", ast.c as u32)
+ }
+ HexFixed(ast::HexLiteralKind::UnicodeShort) => {
+ write!(self.wtr, r"\u{:04X}", ast.c as u32)
+ }
+ HexFixed(ast::HexLiteralKind::UnicodeLong) => {
+ write!(self.wtr, r"\U{:08X}", ast.c as u32)
+ }
+ HexBrace(ast::HexLiteralKind::X) => {
+ write!(self.wtr, r"\x{{{:X}}}", ast.c as u32)
+ }
+ HexBrace(ast::HexLiteralKind::UnicodeShort) => {
+ write!(self.wtr, r"\u{{{:X}}}", ast.c as u32)
+ }
+ HexBrace(ast::HexLiteralKind::UnicodeLong) => {
+ write!(self.wtr, r"\U{{{:X}}}", ast.c as u32)
+ }
+ Special(ast::SpecialLiteralKind::Bell) => {
+ self.wtr.write_str(r"\a")
+ }
+ Special(ast::SpecialLiteralKind::FormFeed) => {
+ self.wtr.write_str(r"\f")
+ }
+ Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"),
+ Special(ast::SpecialLiteralKind::LineFeed) => {
+ self.wtr.write_str(r"\n")
+ }
+ Special(ast::SpecialLiteralKind::CarriageReturn) => {
+ self.wtr.write_str(r"\r")
+ }
+ Special(ast::SpecialLiteralKind::VerticalTab) => {
+ self.wtr.write_str(r"\v")
+ }
+ Special(ast::SpecialLiteralKind::Space) => {
+ self.wtr.write_str(r"\ ")
+ }
+ }
+ }
+
+ fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result {
+ use crate::ast::AssertionKind::*;
+ match ast.kind {
+ StartLine => self.wtr.write_str("^"),
+ EndLine => self.wtr.write_str("$"),
+ StartText => self.wtr.write_str(r"\A"),
+ EndText => self.wtr.write_str(r"\z"),
+ WordBoundary => self.wtr.write_str(r"\b"),
+ NotWordBoundary => self.wtr.write_str(r"\B"),
+ }
+ }
+
+ fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result {
+ self.wtr.write_str("(?")?;
+ self.fmt_flags(&ast.flags)?;
+ self.wtr.write_str(")")?;
+ Ok(())
+ }
+
+ fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result {
+ use crate::ast::{Flag, FlagsItemKind};
+
+ for item in &ast.items {
+ match item.kind {
+ FlagsItemKind::Negation => self.wtr.write_str("-"),
+ FlagsItemKind::Flag(ref flag) => match *flag {
+ Flag::CaseInsensitive => self.wtr.write_str("i"),
+ Flag::MultiLine => self.wtr.write_str("m"),
+ Flag::DotMatchesNewLine => self.wtr.write_str("s"),
+ Flag::SwapGreed => self.wtr.write_str("U"),
+ Flag::Unicode => self.wtr.write_str("u"),
+ Flag::IgnoreWhitespace => self.wtr.write_str("x"),
+ },
+ }?;
+ }
+ Ok(())
+ }
+
+ fn fmt_class_bracketed_pre(
+ &mut self,
+ ast: &ast::ClassBracketed,
+ ) -> fmt::Result {
+ if ast.negated {
+ self.wtr.write_str("[^")
+ } else {
+ self.wtr.write_str("[")
+ }
+ }
+
+ fn fmt_class_bracketed_post(
+ &mut self,
+ _ast: &ast::ClassBracketed,
+ ) -> fmt::Result {
+ self.wtr.write_str("]")
+ }
+
+ fn fmt_class_set_binary_op_kind(
+ &mut self,
+ ast: &ast::ClassSetBinaryOpKind,
+ ) -> fmt::Result {
+ use crate::ast::ClassSetBinaryOpKind::*;
+ match *ast {
+ Intersection => self.wtr.write_str("&&"),
+ Difference => self.wtr.write_str("--"),
+ SymmetricDifference => self.wtr.write_str("~~"),
+ }
+ }
+
+ fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result {
+ use crate::ast::ClassPerlKind::*;
+ match ast.kind {
+ Digit if ast.negated => self.wtr.write_str(r"\D"),
+ Digit => self.wtr.write_str(r"\d"),
+ Space if ast.negated => self.wtr.write_str(r"\S"),
+ Space => self.wtr.write_str(r"\s"),
+ Word if ast.negated => self.wtr.write_str(r"\W"),
+ Word => self.wtr.write_str(r"\w"),
+ }
+ }
+
+ fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result {
+ use crate::ast::ClassAsciiKind::*;
+ match ast.kind {
+ Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"),
+ Alnum => self.wtr.write_str("[:alnum:]"),
+ Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"),
+ Alpha => self.wtr.write_str("[:alpha:]"),
+ Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"),
+ Ascii => self.wtr.write_str("[:ascii:]"),
+ Blank if ast.negated => self.wtr.write_str("[:^blank:]"),
+ Blank => self.wtr.write_str("[:blank:]"),
+ Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"),
+ Cntrl => self.wtr.write_str("[:cntrl:]"),
+ Digit if ast.negated => self.wtr.write_str("[:^digit:]"),
+ Digit => self.wtr.write_str("[:digit:]"),
+ Graph if ast.negated => self.wtr.write_str("[:^graph:]"),
+ Graph => self.wtr.write_str("[:graph:]"),
+ Lower if ast.negated => self.wtr.write_str("[:^lower:]"),
+ Lower => self.wtr.write_str("[:lower:]"),
+ Print if ast.negated => self.wtr.write_str("[:^print:]"),
+ Print => self.wtr.write_str("[:print:]"),
+ Punct if ast.negated => self.wtr.write_str("[:^punct:]"),
+ Punct => self.wtr.write_str("[:punct:]"),
+ Space if ast.negated => self.wtr.write_str("[:^space:]"),
+ Space => self.wtr.write_str("[:space:]"),
+ Upper if ast.negated => self.wtr.write_str("[:^upper:]"),
+ Upper => self.wtr.write_str("[:upper:]"),
+ Word if ast.negated => self.wtr.write_str("[:^word:]"),
+ Word => self.wtr.write_str("[:word:]"),
+ Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"),
+ Xdigit => self.wtr.write_str("[:xdigit:]"),
+ }
+ }
+
+ fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result {
+ use crate::ast::ClassUnicodeKind::*;
+ use crate::ast::ClassUnicodeOpKind::*;
+
+ if ast.negated {
+ self.wtr.write_str(r"\P")?;
+ } else {
+ self.wtr.write_str(r"\p")?;
+ }
+ match ast.kind {
+ OneLetter(c) => self.wtr.write_char(c),
+ Named(ref x) => write!(self.wtr, "{{{}}}", x),
+ NamedValue { op: Equal, ref name, ref value } => {
+ write!(self.wtr, "{{{}={}}}", name, value)
+ }
+ NamedValue { op: Colon, ref name, ref value } => {
+ write!(self.wtr, "{{{}:{}}}", name, value)
+ }
+ NamedValue { op: NotEqual, ref name, ref value } => {
+ write!(self.wtr, "{{{}!={}}}", name, value)
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Printer;
+ use crate::ast::parse::ParserBuilder;
+
+ fn roundtrip(given: &str) {
+ roundtrip_with(|b| b, given);
+ }
+
+ fn roundtrip_with<F>(mut f: F, given: &str)
+ where
+ F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
+ {
+ let mut builder = ParserBuilder::new();
+ f(&mut builder);
+ let ast = builder.build().parse(given).unwrap();
+
+ let mut printer = Printer::new();
+ let mut dst = String::new();
+ printer.print(&ast, &mut dst).unwrap();
+ assert_eq!(given, dst);
+ }
+
+ #[test]
+ fn print_literal() {
+ roundtrip("a");
+ roundtrip(r"\[");
+ roundtrip_with(|b| b.octal(true), r"\141");
+ roundtrip(r"\x61");
+ roundtrip(r"\x7F");
+ roundtrip(r"\u0061");
+ roundtrip(r"\U00000061");
+ roundtrip(r"\x{61}");
+ roundtrip(r"\x{7F}");
+ roundtrip(r"\u{61}");
+ roundtrip(r"\U{61}");
+
+ roundtrip(r"\a");
+ roundtrip(r"\f");
+ roundtrip(r"\t");
+ roundtrip(r"\n");
+ roundtrip(r"\r");
+ roundtrip(r"\v");
+ roundtrip(r"(?x)\ ");
+ }
+
+ #[test]
+ fn print_dot() {
+ roundtrip(".");
+ }
+
+ #[test]
+ fn print_concat() {
+ roundtrip("ab");
+ roundtrip("abcde");
+ roundtrip("a(bcd)ef");
+ }
+
+ #[test]
+ fn print_alternation() {
+ roundtrip("a|b");
+ roundtrip("a|b|c|d|e");
+ roundtrip("|a|b|c|d|e");
+ roundtrip("|a|b|c|d|e|");
+ roundtrip("a(b|c|d)|e|f");
+ }
+
+ #[test]
+ fn print_assertion() {
+ roundtrip(r"^");
+ roundtrip(r"$");
+ roundtrip(r"\A");
+ roundtrip(r"\z");
+ roundtrip(r"\b");
+ roundtrip(r"\B");
+ }
+
+ #[test]
+ fn print_repetition() {
+ roundtrip("a?");
+ roundtrip("a??");
+ roundtrip("a*");
+ roundtrip("a*?");
+ roundtrip("a+");
+ roundtrip("a+?");
+ roundtrip("a{5}");
+ roundtrip("a{5}?");
+ roundtrip("a{5,}");
+ roundtrip("a{5,}?");
+ roundtrip("a{5,10}");
+ roundtrip("a{5,10}?");
+ }
+
+ #[test]
+ fn print_flags() {
+ roundtrip("(?i)");
+ roundtrip("(?-i)");
+ roundtrip("(?s-i)");
+ roundtrip("(?-si)");
+ roundtrip("(?siUmux)");
+ }
+
+ #[test]
+ fn print_group() {
+ roundtrip("(?i:a)");
+ roundtrip("(?P<foo>a)");
+ roundtrip("(a)");
+ }
+
+ #[test]
+ fn print_class() {
+ roundtrip(r"[abc]");
+ roundtrip(r"[a-z]");
+ roundtrip(r"[^a-z]");
+ roundtrip(r"[a-z0-9]");
+ roundtrip(r"[-a-z0-9]");
+ roundtrip(r"[-a-z0-9]");
+ roundtrip(r"[a-z0-9---]");
+ roundtrip(r"[a-z&&m-n]");
+ roundtrip(r"[[a-z&&m-n]]");
+ roundtrip(r"[a-z--m-n]");
+ roundtrip(r"[a-z~~m-n]");
+ roundtrip(r"[a-z[0-9]]");
+ roundtrip(r"[a-z[^0-9]]");
+
+ roundtrip(r"\d");
+ roundtrip(r"\D");
+ roundtrip(r"\s");
+ roundtrip(r"\S");
+ roundtrip(r"\w");
+ roundtrip(r"\W");
+
+ roundtrip(r"[[:alnum:]]");
+ roundtrip(r"[[:^alnum:]]");
+ roundtrip(r"[[:alpha:]]");
+ roundtrip(r"[[:^alpha:]]");
+ roundtrip(r"[[:ascii:]]");
+ roundtrip(r"[[:^ascii:]]");
+ roundtrip(r"[[:blank:]]");
+ roundtrip(r"[[:^blank:]]");
+ roundtrip(r"[[:cntrl:]]");
+ roundtrip(r"[[:^cntrl:]]");
+ roundtrip(r"[[:digit:]]");
+ roundtrip(r"[[:^digit:]]");
+ roundtrip(r"[[:graph:]]");
+ roundtrip(r"[[:^graph:]]");
+ roundtrip(r"[[:lower:]]");
+ roundtrip(r"[[:^lower:]]");
+ roundtrip(r"[[:print:]]");
+ roundtrip(r"[[:^print:]]");
+ roundtrip(r"[[:punct:]]");
+ roundtrip(r"[[:^punct:]]");
+ roundtrip(r"[[:space:]]");
+ roundtrip(r"[[:^space:]]");
+ roundtrip(r"[[:upper:]]");
+ roundtrip(r"[[:^upper:]]");
+ roundtrip(r"[[:word:]]");
+ roundtrip(r"[[:^word:]]");
+ roundtrip(r"[[:xdigit:]]");
+ roundtrip(r"[[:^xdigit:]]");
+
+ roundtrip(r"\pL");
+ roundtrip(r"\PL");
+ roundtrip(r"\p{L}");
+ roundtrip(r"\P{L}");
+ roundtrip(r"\p{X=Y}");
+ roundtrip(r"\P{X=Y}");
+ roundtrip(r"\p{X:Y}");
+ roundtrip(r"\P{X:Y}");
+ roundtrip(r"\p{X!=Y}");
+ roundtrip(r"\P{X!=Y}");
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/ast/visitor.rs b/third_party/rust/regex-syntax/src/ast/visitor.rs
new file mode 100644
index 0000000000..78ee487cff
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/ast/visitor.rs
@@ -0,0 +1,517 @@
+use std::fmt;
+
+use crate::ast::{self, Ast};
+
+/// A trait for visiting an abstract syntax tree (AST) in depth first order.
+///
+/// The principal aim of this trait is to enable callers to perform case
+/// analysis on an abstract syntax tree without necessarily using recursion.
+/// In particular, this permits callers to do case analysis with constant stack
+/// usage, which can be important since the size of an abstract syntax tree
+/// may be proportional to end user input.
+///
+/// Typical usage of this trait involves providing an implementation and then
+/// running it using the [`visit`](fn.visit.html) function.
+///
+/// Note that the abstract syntax tree for a regular expression is quite
+/// complex. Unless you specifically need it, you might be able to use the
+/// much simpler
+/// [high-level intermediate representation](../hir/struct.Hir.html)
+/// and its
+/// [corresponding `Visitor` trait](../hir/trait.Visitor.html)
+/// instead.
+pub trait Visitor {
+ /// The result of visiting an AST.
+ type Output;
+ /// An error that visiting an AST might return.
+ type Err;
+
+ /// All implementors of `Visitor` must provide a `finish` method, which
+ /// yields the result of visiting the AST or an error.
+ fn finish(self) -> Result<Self::Output, Self::Err>;
+
+ /// This method is called before beginning traversal of the AST.
+ fn start(&mut self) {}
+
+ /// This method is called on an `Ast` before descending into child `Ast`
+ /// nodes.
+ fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on an `Ast` after descending into all of its
+ /// child `Ast` nodes.
+ fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called between child nodes of an
+ /// [`Alternation`](struct.Alternation.html).
+ fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on every
+ /// [`ClassSetItem`](enum.ClassSetItem.html)
+ /// before descending into child nodes.
+ fn visit_class_set_item_pre(
+ &mut self,
+ _ast: &ast::ClassSetItem,
+ ) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on every
+ /// [`ClassSetItem`](enum.ClassSetItem.html)
+ /// after descending into child nodes.
+ fn visit_class_set_item_post(
+ &mut self,
+ _ast: &ast::ClassSetItem,
+ ) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on every
+ /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html)
+ /// before descending into child nodes.
+ fn visit_class_set_binary_op_pre(
+ &mut self,
+ _ast: &ast::ClassSetBinaryOp,
+ ) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on every
+ /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html)
+ /// after descending into child nodes.
+ fn visit_class_set_binary_op_post(
+ &mut self,
+ _ast: &ast::ClassSetBinaryOp,
+ ) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called between the left hand and right hand child nodes
+ /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html).
+ fn visit_class_set_binary_op_in(
+ &mut self,
+ _ast: &ast::ClassSetBinaryOp,
+ ) -> Result<(), Self::Err> {
+ Ok(())
+ }
+}
+
+/// Executes an implementation of `Visitor` in constant stack space.
+///
+/// This function will visit every node in the given `Ast` while calling the
+/// appropriate methods provided by the
+/// [`Visitor`](trait.Visitor.html) trait.
+///
+/// The primary use case for this method is when one wants to perform case
+/// analysis over an `Ast` without using a stack size proportional to the depth
+/// of the `Ast`. Namely, this method will instead use constant stack size, but
+/// will use heap space proportional to the size of the `Ast`. This may be
+/// desirable in cases where the size of `Ast` is proportional to end user
+/// input.
+///
+/// If the visitor returns an error at any point, then visiting is stopped and
+/// the error is returned.
+pub fn visit<V: Visitor>(ast: &Ast, visitor: V) -> Result<V::Output, V::Err> {
+ HeapVisitor::new().visit(ast, visitor)
+}
+
+/// HeapVisitor visits every item in an `Ast` recursively using constant stack
+/// size and a heap size proportional to the size of the `Ast`.
+struct HeapVisitor<'a> {
+ /// A stack of `Ast` nodes. This is roughly analogous to the call stack
+ /// used in a typical recursive visitor.
+ stack: Vec<(&'a Ast, Frame<'a>)>,
+ /// Similar to the `Ast` stack above, but is used only for character
+ /// classes. In particular, character classes embed their own mini
+ /// recursive syntax.
+ stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>,
+}
+
+/// Represents a single stack frame while performing structural induction over
+/// an `Ast`.
+enum Frame<'a> {
+ /// A stack frame allocated just before descending into a repetition
+ /// operator's child node.
+ Repetition(&'a ast::Repetition),
+ /// A stack frame allocated just before descending into a group's child
+ /// node.
+ Group(&'a ast::Group),
+ /// The stack frame used while visiting every child node of a concatenation
+ /// of expressions.
+ Concat {
+ /// The child node we are currently visiting.
+ head: &'a Ast,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Ast],
+ },
+ /// The stack frame used while visiting every child node of an alternation
+ /// of expressions.
+ Alternation {
+ /// The child node we are currently visiting.
+ head: &'a Ast,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Ast],
+ },
+}
+
+/// Represents a single stack frame while performing structural induction over
+/// a character class.
+enum ClassFrame<'a> {
+ /// The stack frame used while visiting every child node of a union of
+ /// character class items.
+ Union {
+ /// The child node we are currently visiting.
+ head: &'a ast::ClassSetItem,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [ast::ClassSetItem],
+ },
+ /// The stack frame used while visiting a binary class operation.
+ Binary { op: &'a ast::ClassSetBinaryOp },
+ /// A stack frame allocated just before descending into a binary operator's
+ /// left hand child node.
+ BinaryLHS {
+ op: &'a ast::ClassSetBinaryOp,
+ lhs: &'a ast::ClassSet,
+ rhs: &'a ast::ClassSet,
+ },
+ /// A stack frame allocated just before descending into a binary operator's
+ /// right hand child node.
+ BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet },
+}
+
+/// A representation of the inductive step when performing structural induction
+/// over a character class.
+///
+/// Note that there is no analogous explicit type for the inductive step for
+/// `Ast` nodes because the inductive step is just an `Ast`. For character
+/// classes, the inductive step can produce one of two possible child nodes:
+/// an item or a binary operation. (An item cannot be a binary operation
+/// because that would imply binary operations can be unioned in the concrete
+/// syntax, which is not possible.)
+enum ClassInduct<'a> {
+ Item(&'a ast::ClassSetItem),
+ BinaryOp(&'a ast::ClassSetBinaryOp),
+}
+
+impl<'a> HeapVisitor<'a> {
+ fn new() -> HeapVisitor<'a> {
+ HeapVisitor { stack: vec![], stack_class: vec![] }
+ }
+
+ fn visit<V: Visitor>(
+ &mut self,
+ mut ast: &'a Ast,
+ mut visitor: V,
+ ) -> Result<V::Output, V::Err> {
+ self.stack.clear();
+ self.stack_class.clear();
+
+ visitor.start();
+ loop {
+ visitor.visit_pre(ast)?;
+ if let Some(x) = self.induct(ast, &mut visitor)? {
+ let child = x.child();
+ self.stack.push((ast, x));
+ ast = child;
+ continue;
+ }
+ // No induction means we have a base case, so we can post visit
+ // it now.
+ visitor.visit_post(ast)?;
+
+ // At this point, we now try to pop our call stack until it is
+ // either empty or we hit another inductive case.
+ loop {
+ let (post_ast, frame) = match self.stack.pop() {
+ None => return visitor.finish(),
+ Some((post_ast, frame)) => (post_ast, frame),
+ };
+ // If this is a concat/alternate, then we might have additional
+ // inductive steps to process.
+ if let Some(x) = self.pop(frame) {
+ if let Frame::Alternation { .. } = x {
+ visitor.visit_alternation_in()?;
+ }
+ ast = x.child();
+ self.stack.push((post_ast, x));
+ break;
+ }
+ // Otherwise, we've finished visiting all the child nodes for
+ // this AST, so we can post visit it now.
+ visitor.visit_post(post_ast)?;
+ }
+ }
+ }
+
+ /// Build a stack frame for the given AST if one is needed (which occurs if
+ /// and only if there are child nodes in the AST). Otherwise, return None.
+ ///
+ /// If this visits a class, then the underlying visitor implementation may
+ /// return an error which will be passed on here.
+ fn induct<V: Visitor>(
+ &mut self,
+ ast: &'a Ast,
+ visitor: &mut V,
+ ) -> Result<Option<Frame<'a>>, V::Err> {
+ Ok(match *ast {
+ Ast::Class(ast::Class::Bracketed(ref x)) => {
+ self.visit_class(x, visitor)?;
+ None
+ }
+ Ast::Repetition(ref x) => Some(Frame::Repetition(x)),
+ Ast::Group(ref x) => Some(Frame::Group(x)),
+ Ast::Concat(ref x) if x.asts.is_empty() => None,
+ Ast::Concat(ref x) => {
+ Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] })
+ }
+ Ast::Alternation(ref x) if x.asts.is_empty() => None,
+ Ast::Alternation(ref x) => Some(Frame::Alternation {
+ head: &x.asts[0],
+ tail: &x.asts[1..],
+ }),
+ _ => None,
+ })
+ }
+
+ /// Pops the given frame. If the frame has an additional inductive step,
+ /// then return it, otherwise return `None`.
+ fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
+ match induct {
+ Frame::Repetition(_) => None,
+ Frame::Group(_) => None,
+ Frame::Concat { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
+ }
+ }
+ Frame::Alternation { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(Frame::Alternation {
+ head: &tail[0],
+ tail: &tail[1..],
+ })
+ }
+ }
+ }
+ }
+
+ fn visit_class<V: Visitor>(
+ &mut self,
+ ast: &'a ast::ClassBracketed,
+ visitor: &mut V,
+ ) -> Result<(), V::Err> {
+ let mut ast = ClassInduct::from_bracketed(ast);
+ loop {
+ self.visit_class_pre(&ast, visitor)?;
+ if let Some(x) = self.induct_class(&ast) {
+ let child = x.child();
+ self.stack_class.push((ast, x));
+ ast = child;
+ continue;
+ }
+ self.visit_class_post(&ast, visitor)?;
+
+ // At this point, we now try to pop our call stack until it is
+ // either empty or we hit another inductive case.
+ loop {
+ let (post_ast, frame) = match self.stack_class.pop() {
+ None => return Ok(()),
+ Some((post_ast, frame)) => (post_ast, frame),
+ };
+ // If this is a union or a binary op, then we might have
+ // additional inductive steps to process.
+ if let Some(x) = self.pop_class(frame) {
+ if let ClassFrame::BinaryRHS { ref op, .. } = x {
+ visitor.visit_class_set_binary_op_in(op)?;
+ }
+ ast = x.child();
+ self.stack_class.push((post_ast, x));
+ break;
+ }
+ // Otherwise, we've finished visiting all the child nodes for
+ // this class node, so we can post visit it now.
+ self.visit_class_post(&post_ast, visitor)?;
+ }
+ }
+ }
+
+ /// Call the appropriate `Visitor` methods given an inductive step.
+ fn visit_class_pre<V: Visitor>(
+ &self,
+ ast: &ClassInduct<'a>,
+ visitor: &mut V,
+ ) -> Result<(), V::Err> {
+ match *ast {
+ ClassInduct::Item(item) => {
+ visitor.visit_class_set_item_pre(item)?;
+ }
+ ClassInduct::BinaryOp(op) => {
+ visitor.visit_class_set_binary_op_pre(op)?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Call the appropriate `Visitor` methods given an inductive step.
+ fn visit_class_post<V: Visitor>(
+ &self,
+ ast: &ClassInduct<'a>,
+ visitor: &mut V,
+ ) -> Result<(), V::Err> {
+ match *ast {
+ ClassInduct::Item(item) => {
+ visitor.visit_class_set_item_post(item)?;
+ }
+ ClassInduct::BinaryOp(op) => {
+ visitor.visit_class_set_binary_op_post(op)?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Build a stack frame for the given class node if one is needed (which
+ /// occurs if and only if there are child nodes). Otherwise, return None.
+ fn induct_class(&self, ast: &ClassInduct<'a>) -> Option<ClassFrame<'a>> {
+ match *ast {
+ ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => {
+ match x.kind {
+ ast::ClassSet::Item(ref item) => {
+ Some(ClassFrame::Union { head: item, tail: &[] })
+ }
+ ast::ClassSet::BinaryOp(ref op) => {
+ Some(ClassFrame::Binary { op })
+ }
+ }
+ }
+ ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => {
+ if x.items.is_empty() {
+ None
+ } else {
+ Some(ClassFrame::Union {
+ head: &x.items[0],
+ tail: &x.items[1..],
+ })
+ }
+ }
+ ClassInduct::BinaryOp(op) => {
+ Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs })
+ }
+ _ => None,
+ }
+ }
+
+ /// Pops the given frame. If the frame has an additional inductive step,
+ /// then return it, otherwise return `None`.
+ fn pop_class(&self, induct: ClassFrame<'a>) -> Option<ClassFrame<'a>> {
+ match induct {
+ ClassFrame::Union { tail, .. } => {
+ if tail.is_empty() {
+ None
+ } else {
+ Some(ClassFrame::Union {
+ head: &tail[0],
+ tail: &tail[1..],
+ })
+ }
+ }
+ ClassFrame::Binary { .. } => None,
+ ClassFrame::BinaryLHS { op, rhs, .. } => {
+ Some(ClassFrame::BinaryRHS { op, rhs })
+ }
+ ClassFrame::BinaryRHS { .. } => None,
+ }
+ }
+}
+
+impl<'a> Frame<'a> {
+ /// Perform the next inductive step on this frame and return the next
+ /// child AST node to visit.
+ fn child(&self) -> &'a Ast {
+ match *self {
+ Frame::Repetition(rep) => &rep.ast,
+ Frame::Group(group) => &group.ast,
+ Frame::Concat { head, .. } => head,
+ Frame::Alternation { head, .. } => head,
+ }
+ }
+}
+
+impl<'a> ClassFrame<'a> {
+ /// Perform the next inductive step on this frame and return the next
+ /// child class node to visit.
+ fn child(&self) -> ClassInduct<'a> {
+ match *self {
+ ClassFrame::Union { head, .. } => ClassInduct::Item(head),
+ ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op),
+ ClassFrame::BinaryLHS { ref lhs, .. } => {
+ ClassInduct::from_set(lhs)
+ }
+ ClassFrame::BinaryRHS { ref rhs, .. } => {
+ ClassInduct::from_set(rhs)
+ }
+ }
+ }
+}
+
+impl<'a> ClassInduct<'a> {
+ fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> {
+ ClassInduct::from_set(&ast.kind)
+ }
+
+ fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> {
+ match *ast {
+ ast::ClassSet::Item(ref item) => ClassInduct::Item(item),
+ ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op),
+ }
+ }
+}
+
+impl<'a> fmt::Debug for ClassFrame<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let x = match *self {
+ ClassFrame::Union { .. } => "Union",
+ ClassFrame::Binary { .. } => "Binary",
+ ClassFrame::BinaryLHS { .. } => "BinaryLHS",
+ ClassFrame::BinaryRHS { .. } => "BinaryRHS",
+ };
+ write!(f, "{}", x)
+ }
+}
+
+impl<'a> fmt::Debug for ClassInduct<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let x = match *self {
+ ClassInduct::Item(it) => match *it {
+ ast::ClassSetItem::Empty(_) => "Item(Empty)",
+ ast::ClassSetItem::Literal(_) => "Item(Literal)",
+ ast::ClassSetItem::Range(_) => "Item(Range)",
+ ast::ClassSetItem::Ascii(_) => "Item(Ascii)",
+ ast::ClassSetItem::Perl(_) => "Item(Perl)",
+ ast::ClassSetItem::Unicode(_) => "Item(Unicode)",
+ ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)",
+ ast::ClassSetItem::Union(_) => "Item(Union)",
+ },
+ ClassInduct::BinaryOp(it) => match it.kind {
+ ast::ClassSetBinaryOpKind::Intersection => {
+ "BinaryOp(Intersection)"
+ }
+ ast::ClassSetBinaryOpKind::Difference => {
+ "BinaryOp(Difference)"
+ }
+ ast::ClassSetBinaryOpKind::SymmetricDifference => {
+ "BinaryOp(SymmetricDifference)"
+ }
+ },
+ };
+ write!(f, "{}", x)
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/either.rs b/third_party/rust/regex-syntax/src/either.rs
new file mode 100644
index 0000000000..7ae41e4ced
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/either.rs
@@ -0,0 +1,8 @@
+/// A simple binary sum type.
+///
+/// This is occasionally useful in an ad hoc fashion.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Either<Left, Right> {
+ Left(Left),
+ Right(Right),
+}
diff --git a/third_party/rust/regex-syntax/src/error.rs b/third_party/rust/regex-syntax/src/error.rs
new file mode 100644
index 0000000000..1230d2fc5d
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/error.rs
@@ -0,0 +1,324 @@
+use std::cmp;
+use std::error;
+use std::fmt;
+use std::result;
+
+use crate::ast;
+use crate::hir;
+
+ /// A type alias for dealing with errors returned by this crate.
+ ///
+ /// This is `std::result::Result` with the error type fixed to this module's
+ /// `Error`.
+ pub type Result<T> = result::Result<T, Error>;
+
+ /// This error type encompasses any error that can be returned by this crate.
+ ///
+ /// Each real variant wraps the error produced by one translation phase. The
+ /// `Display` output of this type is delegated to the wrapped error.
+ #[derive(Clone, Debug, Eq, PartialEq)]
+ pub enum Error {
+ /// An error that occurred while translating concrete syntax into abstract
+ /// syntax (AST).
+ Parse(ast::Error),
+ /// An error that occurred while translating abstract syntax into a high
+ /// level intermediate representation (HIR).
+ Translate(hir::Error),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ ///
+ /// This variant is never meant to be constructed: the trait impls on this
+ /// type treat it as unreachable.
+ #[doc(hidden)]
+ __Nonexhaustive,
+ }
+
+impl From<ast::Error> for Error {
+ fn from(err: ast::Error) -> Error {
+ Error::Parse(err)
+ }
+}
+
+impl From<hir::Error> for Error {
+ fn from(err: hir::Error) -> Error {
+ Error::Translate(err)
+ }
+}
+
+impl error::Error for Error {
+ // TODO: Remove this method entirely on the next breaking semver release.
+ #[allow(deprecated)]
+ fn description(&self) -> &str {
+ match *self {
+ Error::Parse(ref x) => x.description(),
+ Error::Translate(ref x) => x.description(),
+ _ => unreachable!(),
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ Error::Parse(ref x) => x.fmt(f),
+ Error::Translate(ref x) => x.fmt(f),
+ _ => unreachable!(),
+ }
+ }
+}
+
+ /// A helper type for formatting nice error messages.
+ ///
+ /// This type is responsible for reporting regex parse errors in a nice human
+ /// readable format. Most of its complexity is from interspersing notational
+ /// markers pointing out the position where an error occurred.
+ ///
+ /// All fields are borrowed from the error being reported (lifetime `'e`), so a
+ /// `Formatter` never outlives that error. `E` is the error kind type; the
+ /// `Display` impl of `Formatter` requires `E: fmt::Display`.
+ #[derive(Debug)]
+ pub struct Formatter<'e, E> {
+ /// The original regex pattern in which the error occurred.
+ pattern: &'e str,
+ /// The error kind. It must impl fmt::Display.
+ err: &'e E,
+ /// The primary span of the error.
+ span: &'e ast::Span,
+ /// An auxiliary and optional span, in case the error needs to point to
+ /// two locations (e.g., when reporting a duplicate capture group name).
+ aux_span: Option<&'e ast::Span>,
+ }
+
+impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> {
+ fn from(err: &'e ast::Error) -> Self {
+ Formatter {
+ pattern: err.pattern(),
+ err: err.kind(),
+ span: err.span(),
+ aux_span: err.auxiliary_span(),
+ }
+ }
+}
+
+impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> {
+ fn from(err: &'e hir::Error) -> Self {
+ Formatter {
+ pattern: err.pattern(),
+ err: err.kind(),
+ span: err.span(),
+ aux_span: None,
+ }
+ }
+}
+
+ impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> {
+     /// Writes the full, human readable error report.
+     ///
+     /// The report always starts with a `regex parse error:` header, followed
+     /// by the notated pattern, and ends with `error: <kind>`. When the
+     /// pattern spans multiple lines, the notated pattern is framed by two
+     /// divider lines and every span that covers more than one line is
+     /// described by its line/column range (such spans cannot be underlined).
+     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+         // Computed once and shared by both layouts.
+         let spans = Spans::from_formatter(self);
+         writeln!(f, "regex parse error:")?;
+         if self.pattern.contains('\n') {
+             let divider = repeat_char('~', 79);
+             writeln!(f, "{}", divider)?;
+             write!(f, "{}", spans.notate())?;
+             writeln!(f, "{}", divider)?;
+             // If we have error spans that cover multiple lines, then we just
+             // note the line numbers, one note per line.
+             for span in &spans.multi_line {
+                 writeln!(
+                     f,
+                     "on line {} (column {}) through line {} (column {})",
+                     span.start.line,
+                     span.start.column,
+                     span.end.line,
+                     span.end.column - 1
+                 )?;
+             }
+         } else {
+             write!(f, "{}", spans.notate())?;
+         }
+         write!(f, "error: {}", self.err)
+     }
+ }
+
+ /// This type represents an arbitrary number of error spans in a way that makes
+ /// it convenient to notate the regex pattern. ("Notate" means "point out
+ /// exactly where the error occurred in the regex pattern.")
+ ///
+ /// Technically, we can only ever have two spans given our current error
+ /// structure. However, after toiling with a specific algorithm for handling
+ /// two spans, it became obvious that an algorithm to handle an arbitrary
+ /// number of spans was actually much simpler.
+ ///
+ /// Spans are split into two groups: those contained in a single line (which
+ /// can be underlined) and those covering several lines (which cannot).
+ struct Spans<'p> {
+ /// The original regex pattern string.
+ pattern: &'p str,
+ /// The total width that should be used for line numbers. The width is
+ /// used for left padding the line numbers for alignment.
+ ///
+ /// A value of `0` means line numbers should not be displayed. That is,
+ /// the pattern is itself only one line.
+ line_number_width: usize,
+ /// All error spans that occur on a single line. This sequence always has
+ /// length equivalent to the number of lines in `pattern`, where the index
+ /// of the sequence represents a line number, starting at `0`. The spans
+ /// in each line are sorted in ascending order.
+ by_line: Vec<Vec<ast::Span>>,
+ /// All error spans that occur over one or more lines. That is, the start
+ /// and end position of the span have different line numbers. The spans are
+ /// sorted in ascending order.
+ multi_line: Vec<ast::Span>,
+ }
+
+impl<'p> Spans<'p> {
+ /// Build a sequence of spans from a formatter.
+ ///
+ /// The result has one (initially empty) single-line bucket per pattern line
+ /// and contains the formatter's primary span plus its auxiliary span, if
+ /// there is one.
+ fn from_formatter<'e, E: fmt::Display>(
+     fmter: &'p Formatter<'e, E>,
+ ) -> Spans<'p> {
+     // `lines()` never yields an empty line after a final `\n`, but a span
+     // can occur immediately after that `\n`, which is considered to be an
+     // additional line. Account for it here.
+     let trailing_line = if fmter.pattern.ends_with('\n') { 1 } else { 0 };
+     let line_count = fmter.pattern.lines().count() + trailing_line;
+     // Line numbers are only shown for patterns with more than one line.
+     let line_number_width = match line_count {
+         0 | 1 => 0,
+         many => many.to_string().len(),
+     };
+     let mut spans = Spans {
+         pattern: &fmter.pattern,
+         line_number_width,
+         by_line: vec![vec![]; line_count],
+         multi_line: vec![],
+     };
+     spans.add(fmter.span.clone());
+     if let Some(aux) = fmter.aux_span {
+         spans.add(aux.clone());
+     }
+     spans
+ }
+
+ /// Add the given span to this sequence, putting it in the right place.
+ ///
+ /// Single-line spans go into the bucket of their line; all other spans go
+ /// into the multi-line bucket. The receiving bucket is kept sorted.
+ fn add(&mut self, span: ast::Span) {
+     let bucket = if span.is_one_line() {
+         // Lines are 1-indexed, buckets are 0-indexed.
+         let line_index = span.start.line - 1;
+         &mut self.by_line[line_index]
+     } else {
+         &mut self.multi_line
+     };
+     // This is grossly inefficient since we sort after each add, but right
+     // now, we only ever add two spans at most.
+     bucket.push(span);
+     bucket.sort();
+ }
+
+ /// Notate the pattern string with carets (`^`) pointing at each span
+ /// location. This only applies to spans that occur within a single line.
+ ///
+ /// Every pattern line is emitted with a prefix (a right-aligned line number
+ /// followed by `": "` for multi-line patterns, a fixed indent otherwise) and
+ /// is followed by a line of carets whenever that line has spans.
+ fn notate(&self) -> String {
+     let mut notated = String::new();
+     for (i, line) in self.pattern.lines().enumerate() {
+         if self.line_number_width > 0 {
+             notated.push_str(&self.left_pad_line_number(i + 1));
+             notated.push_str(": ");
+         } else {
+             // The indent must be exactly as wide as the padding that
+             // `line_number_padding` reports for single-line patterns (4
+             // columns); otherwise the carets produced by `notate_line`
+             // would not line up with the pattern text.
+             notated.push_str("    ");
+         }
+         notated.push_str(line);
+         notated.push('\n');
+         if let Some(notes) = self.notate_line(i) {
+             notated.push_str(&notes);
+             notated.push('\n');
+         }
+     }
+     notated
+ }
+
+ /// Return notes for the line indexed at `i` (zero-based). If there are no
+ /// spans for the given line, then `None` is returned. Otherwise, an
+ /// appropriately space padded string with correctly positioned `^` is
+ /// returned, accounting for line numbers.
+ fn notate_line(&self, i: usize) -> Option<String> {
+     let line_spans = &self.by_line[i];
+     if line_spans.is_empty() {
+         return None;
+     }
+     // Start with the padding that lines up with the pattern's prefix.
+     let mut notes = " ".repeat(self.line_number_padding());
+     // Number of columns already emitted after the prefix.
+     let mut pos = 0;
+     for span in line_spans {
+         // Columns are 1-indexed; fill with spaces up to the span start.
+         let start = span.start.column - 1;
+         while pos < start {
+             notes.push(' ');
+             pos += 1;
+         }
+         // Always emit at least one caret, even for empty spans.
+         let width = span.end.column.saturating_sub(span.start.column);
+         let carets = cmp::max(1, width);
+         for _ in 0..carets {
+             notes.push('^');
+         }
+         pos += carets;
+     }
+     Some(notes)
+ }
+
+ /// Left pad the given line number with spaces such that it is aligned with
+ /// other line numbers.
+ ///
+ /// Panics if the decimal form of `n` is wider than `line_number_width`.
+ fn left_pad_line_number(&self, n: usize) -> String {
+     let digits = n.to_string();
+     let pad = self.line_number_width.checked_sub(digits.len()).unwrap();
+     let mut padded = " ".repeat(pad);
+     padded += &digits;
+     padded
+ }
+
+ /// Return the line number padding beginning at the start of each line of
+ /// the pattern.
+ ///
+ /// If the pattern is only one line, then this returns a fixed padding
+ /// for visual indentation. Otherwise it is the line number width plus two
+ /// extra columns.
+ fn line_number_padding(&self) -> usize {
+     match self.line_number_width {
+         0 => 4,
+         width => 2 + width,
+     }
+ }
+}
+
+ /// Returns a string consisting of `c` repeated `count` times.
+ fn repeat_char(c: char, count: usize) -> String {
+     (0..count).map(|_| c).collect()
+ }
+
+ // Tests for the human readable rendering of parse errors.
+ #[cfg(test)]
+ mod tests {
+ use crate::ast::parse::Parser;
+
+ // Parses `pattern`, which must be rejected by the parser, and checks that
+ // the rendered error equals `expected_msg` with surrounding whitespace
+ // trimmed. Despite its name, this compares the error's `Display` output;
+ // it only panics when the pattern unexpectedly parses.
+ fn assert_panic_message(pattern: &str, expected_msg: &str) {
+ let result = Parser::new().parse(pattern);
+ match result {
+ Ok(_) => {
+ panic!("regex should not have parsed");
+ }
+ Err(err) => {
+ assert_eq!(err.to_string(), expected_msg.trim());
+ }
+ }
+ }
+
+ // See: https://github.com/rust-lang/regex/issues/464
+ #[test]
+ fn regression_464() {
+ let err = Parser::new().parse("a{\n").unwrap_err();
+ // This test checks that the error formatter doesn't panic.
+ assert!(!err.to_string().is_empty());
+ }
+
+ // See: https://github.com/rust-lang/regex/issues/545
+ #[test]
+ fn repetition_quantifier_expects_a_valid_decimal() {
+ // NOTE(review): the caret line of a single-line pattern is padded by
+ // `line_number_padding()` (4 columns) plus the span's column offset, so
+ // the single-space indentation inside the expected message below looks
+ // like collapsed whitespace -- confirm against the upstream sources.
+ assert_panic_message(
+ r"\\u{[^}]*}",
+ r#"
+ regex parse error:
+ \\u{[^}]*}
+ ^
+ error: repetition quantifier expects a valid decimal
+ "#,
+ );
+ }
+ }
diff --git a/third_party/rust/regex-syntax/src/hir/interval.rs b/third_party/rust/regex-syntax/src/hir/interval.rs
new file mode 100644
index 0000000000..56698c53af
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/interval.rs
@@ -0,0 +1,520 @@
+use std::char;
+use std::cmp;
+use std::fmt::Debug;
+use std::slice;
+use std::u8;
+
+use crate::unicode;
+
+// This module contains an *internal* implementation of interval sets.
+//
+ // The primary invariant that interval sets guard is canonical ordering. That
+// is, every interval set contains an ordered sequence of intervals where
+// no two intervals are overlapping or adjacent. While this invariant is
+// occasionally broken within the implementation, it should be impossible for
+// callers to observe it.
+//
+// Since case folding (as implemented below) breaks that invariant, we roll
+// that into this API even though it is a little out of place in an otherwise
+// generic interval set. (Hence the reason why the `unicode` module is imported
+// here.)
+//
+// Some of the implementation complexity here is a result of me wanting to
+// preserve the sequential representation without using additional memory.
+// In many cases, we do use linear extra memory, but it is at most 2x and it
+// is amortized. If we relaxed the memory requirements, this implementation
+// could become much simpler. The extra memory is honestly probably OK, but
+// character classes (especially of the Unicode variety) can become quite
+// large, and it would be nice to keep regex compilation snappy even in debug
+// builds. (In the past, I have been careless with this area of code and it has
+// caused slow regex compilations in debug mode, so this isn't entirely
+// unwarranted.)
+//
+// Tests on this are relegated to the public API of HIR in src/hir.rs.
+
+ /// A set of intervals of type `I`, stored as a vector of ranges.
+ ///
+ /// The methods of this type keep `ranges` in canonical form: sorted in
+ /// ascending order with no two ranges overlapping or adjacent.
+ #[derive(Clone, Debug, Eq, PartialEq)]
+ pub struct IntervalSet<I> {
+ /// The intervals making up this set.
+ ranges: Vec<I>,
+ }
+
+impl<I: Interval> IntervalSet<I> {
+ /// Create a new set from a sequence of intervals. Each interval is
+ /// specified as a pair of bounds, where both bounds are inclusive.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap. The resulting set is canonicalized before it is returned.
+ pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
+     let ranges: Vec<I> = intervals.into_iter().collect();
+     let mut set = IntervalSet { ranges };
+     set.canonicalize();
+     set
+ }
+
+ /// Add a new interval to this set.
+ ///
+ /// The interval is appended to the underlying vector and the whole set is
+ /// then re-canonicalized.
+ pub fn push(&mut self, interval: I) {
+ // TODO: This could be faster. e.g., Push the interval such that
+ // it preserves canonicalization.
+ self.ranges.push(interval);
+ self.canonicalize();
+ }
+
+ /// Return an iterator over all intervals in this set.
+ ///
+ /// The iterator yields intervals in ascending order. Items are borrowed
+ /// from this set.
+ pub fn iter(&self) -> IntervalSetIter<'_, I> {
+ IntervalSetIter(self.ranges.iter())
+ }
+
+ /// Return an immutable slice of intervals in this set.
+ ///
+ /// The sequence returned is in canonical ordering. It is a view of the
+ /// set's own storage; nothing is copied.
+ pub fn intervals(&self) -> &[I] {
+ &self.ranges
+ }
+
+ /// Expand this interval set such that it contains all case folded
+ /// characters. For example, if this class consists of the range `a-z`,
+ /// then applying case folding will result in the class containing both the
+ /// ranges `a-z` and `A-Z`.
+ ///
+ /// This returns an error if the necessary case mapping data is not
+ /// available. The set is re-canonicalized in either case.
+ pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+     // Only fold the ranges present on entry; folding appends to the same
+     // vector and those additions must not be folded again.
+     let original_len = self.ranges.len();
+     let mut outcome = Ok(());
+     for idx in 0..original_len {
+         let range = self.ranges[idx];
+         outcome = range.case_fold_simple(&mut self.ranges);
+         if outcome.is_err() {
+             break;
+         }
+     }
+     self.canonicalize();
+     outcome
+ }
+
+ /// Union this set with the given set, in place.
+ ///
+ /// All ranges of `other` are appended and the result is canonicalized.
+ pub fn union(&mut self, other: &IntervalSet<I>) {
+     // This could almost certainly be done more efficiently.
+     self.ranges.extend(other.ranges.iter().copied());
+     self.canonicalize();
+ }
+
+ /// Intersect this set with the given set, in place.
+ ///
+ /// Both sets are walked in lockstep. The overlap of the current pair of
+ /// ranges (if any) is recorded, and then whichever range ends first is
+ /// replaced by its successor.
+ pub fn intersect(&mut self, other: &IntervalSet<I>) {
+     if self.ranges.is_empty() {
+         return;
+     }
+     if other.ranges.is_empty() {
+         self.ranges.clear();
+         return;
+     }
+
+     // There should be a way to do this in-place with constant memory,
+     // but I couldn't figure out a simple way to do it. So just append
+     // the intersection to the end of this range, and then drain it before
+     // we're done.
+     let drain_end = self.ranges.len();
+     let other_len = other.ranges.len();
+
+     let (mut a, mut b) = (0, 0);
+     while a < drain_end && b < other_len {
+         let (ours, theirs) = (self.ranges[a], other.ranges[b]);
+         if let Some(common) = ours.intersect(&theirs) {
+             self.ranges.push(common);
+         }
+         // Advance the side whose current range ends first; on a tie the
+         // other set advances.
+         if ours.upper() < theirs.upper() {
+             a += 1;
+         } else {
+             b += 1;
+         }
+     }
+     self.ranges.drain(..drain_end);
+ }
+
+ /// Subtract the given set from this set, in place.
+ ///
+ /// If either set is empty, this set is left untouched. Otherwise the
+ /// result is appended after the existing ranges, and the original ranges
+ /// are drained at the end.
+ pub fn difference(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() || other.ranges.is_empty() {
+ return;
+ }
+
+ // This algorithm is (to me) surprisingly complex. A search of the
+ // interwebs indicate that this is a potentially interesting problem.
+ // Folks seem to suggest interval or segment trees, but I'd like to
+ // avoid the overhead (both runtime and conceptual) of that.
+ //
+ // The following is basically my Shitty First Draft. Therefore, in
+ // order to grok it, you probably need to read each line carefully.
+ // Simplifications are most welcome!
+ //
+ // Remember, we can assume the canonical format invariant here, which
+ // says that all ranges are sorted, not overlapping and not adjacent in
+ // each class.
+ let drain_end = self.ranges.len();
+ // `a` indexes the original ranges of this set, `b` those of `other`.
+ let (mut a, mut b) = (0, 0);
+ 'LOOP: while a < drain_end && b < other.ranges.len() {
+ // Basically, the easy cases are when neither range overlaps with
+ // each other. If the `b` range is less than our current `a`
+ // range, then we can skip it and move on.
+ if other.ranges[b].upper() < self.ranges[a].lower() {
+ b += 1;
+ continue;
+ }
+ // ... similarly for the `a` range. If it's less than the smallest
+ // `b` range, then we can add it as-is.
+ if self.ranges[a].upper() < other.ranges[b].lower() {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ continue;
+ }
+ // Otherwise, we have overlapping ranges.
+ assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
+
+ // This part is tricky and was non-obvious to me without looking
+ // at explicit examples (see the tests). The trickiness stems from
+ // two things: 1) subtracting a range from another range could
+ // yield two ranges and 2) after subtracting a range, it's possible
+ // that future ranges can have an impact. The loop below advances
+ // the `b` ranges until they can't possibly impact the current
+ // range.
+ //
+ // For example, if our `a` range is `a-t` and our next four `b`
+ // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
+ // subtraction three times before moving on to the next `a` range.
+ let mut range = self.ranges[a];
+ while b < other.ranges.len()
+ && !range.is_intersection_empty(&other.ranges[b])
+ {
+ let old_range = range;
+ range = match range.difference(&other.ranges[b]) {
+ (None, None) => {
+ // We lost the entire range, so move on to the next
+ // without adding this one.
+ a += 1;
+ continue 'LOOP;
+ }
+ (Some(range1), None) | (None, Some(range1)) => range1,
+ (Some(range1), Some(range2)) => {
+ // The lower piece is final; keep subtracting from the
+ // upper piece.
+ self.ranges.push(range1);
+ range2
+ }
+ };
+ // It's possible that the `b` range has more to contribute
+ // here. In particular, if it is greater than the original
+ // range, then it might impact the next `a` range *and* it
+ // has impacted the current `a` range as much as possible,
+ // so we can quit. We don't bump `b` so that the next `a`
+ // range can apply it.
+ if other.ranges[b].upper() > old_range.upper() {
+ break;
+ }
+ // Otherwise, the next `b` range might apply to the current
+ // `a` range.
+ b += 1;
+ }
+ self.ranges.push(range);
+ a += 1;
+ }
+ // Any remaining `a` ranges lie beyond every `b` range; keep them as-is.
+ while a < drain_end {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ }
+ // Drop the original ranges, leaving only the appended result.
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Compute the symmetric difference of the two sets, in place.
+ ///
+ /// This computes the symmetric difference of two interval sets. This
+ /// removes all elements in this set that are also in the given set,
+ /// but also adds all elements from the given set that aren't in this
+ /// set. That is, the set will contain all elements in either set,
+ /// but will not contain any elements that are in both sets.
+ pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
+     // TODO(burntsushi): Fix this so that it amortizes allocation.
+     //
+     // Computed as (self | other) - (self & other).
+     let common = {
+         let mut scratch = self.clone();
+         scratch.intersect(other);
+         scratch
+     };
+     self.union(other);
+     self.difference(&common);
+ }
+
+ /// Negate this interval set.
+ ///
+ /// For all `x` where `x` is any element, if `x` was in this set, then it
+ /// will not be in this set after negation.
+ pub fn negate(&mut self) {
+     let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
+     if self.ranges.is_empty() {
+         // The complement of the empty set is everything.
+         self.ranges.push(I::create(min, max));
+         return;
+     }
+
+     // There should be a way to do this in-place with constant memory,
+     // but I couldn't figure out a simple way to do it. So just append
+     // the negation to the end of this range, and then drain it before
+     // we're done.
+     let drain_end = self.ranges.len();
+
+     // The increments/decrements below are checked. They rely on the
+     // canonical ordering invariant: consecutive ranges neither overlap nor
+     // touch, so there is always a non-empty gap between them.
+
+     // Everything below the first range.
+     let first_lower = self.ranges[0].lower();
+     if first_lower > min {
+         self.ranges.push(I::create(min, first_lower.decrement()));
+     }
+     // The gaps between consecutive ranges.
+     for i in 1..drain_end {
+         let gap_start = self.ranges[i - 1].upper().increment();
+         let gap_end = self.ranges[i].lower().decrement();
+         self.ranges.push(I::create(gap_start, gap_end));
+     }
+     // Everything above the last range.
+     let last_upper = self.ranges[drain_end - 1].upper();
+     if last_upper < max {
+         self.ranges.push(I::create(last_upper.increment(), max));
+     }
+     self.ranges.drain(..drain_end);
+ }
+
+ /// Converts this set into a canonical ordering.
+ ///
+ /// This is a no-op if the set is already canonical. Otherwise the ranges
+ /// are sorted and every run of overlapping or adjacent ranges is merged
+ /// into a single range.
+ fn canonicalize(&mut self) {
+ if self.is_canonical() {
+ return;
+ }
+ self.ranges.sort();
+ // An empty set is always canonical, so we cannot get here with one.
+ assert!(!self.ranges.is_empty());
+
+ // Is there a way to do this in-place with constant memory? I couldn't
+ // figure out a way to do it. So just append the canonicalization to
+ // the end of this range, and then drain it before we're done.
+ let drain_end = self.ranges.len();
+ for oldi in 0..drain_end {
+ // If we've added at least one new range, then check if we can
+ // merge this range in the previously added range.
+ if self.ranges.len() > drain_end {
+ // `last` is the most recently appended (merged) range, while
+ // `rest` still holds every original range at its old index.
+ let (last, rest) = self.ranges.split_last_mut().unwrap();
+ if let Some(union) = last.union(&rest[oldi]) {
+ *last = union;
+ continue;
+ }
+ }
+ let range = self.ranges[oldi];
+ self.ranges.push(range);
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Returns true if and only if this class is in a canonical ordering.
+ ///
+ /// That is, every range is strictly less than its successor and no two
+ /// neighboring ranges overlap or are adjacent.
+ fn is_canonical(&self) -> bool {
+     let has_violation = self.ranges.windows(2).any(|pair| {
+         pair[0] >= pair[1] || pair[0].is_contiguous(&pair[1])
+     });
+     !has_violation
+ }
+}
+
+ /// An iterator over intervals.
+ ///
+ /// This is a thin wrapper around a slice iterator over the intervals of a
+ /// set; it yields references to the intervals in slice order.
+ #[derive(Debug)]
+ pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
+
+ impl<'a, I> Iterator for IntervalSetIter<'a, I> {
+ type Item = &'a I;
+
+ // Forwards directly to the wrapped slice iterator.
+ fn next(&mut self) -> Option<&'a I> {
+ self.0.next()
+ }
+ }
+
+pub trait Interval:
+ Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
+{
+ /// The type of this interval's two endpoints.
+ type Bound: Bound;
+
+ /// Returns the lower endpoint of this interval.
+ fn lower(&self) -> Self::Bound;
+ /// Returns the upper endpoint of this interval.
+ fn upper(&self) -> Self::Bound;
+ /// Sets the lower endpoint of this interval.
+ fn set_lower(&mut self, bound: Self::Bound);
+ /// Sets the upper endpoint of this interval.
+ fn set_upper(&mut self, bound: Self::Bound);
+ /// Applies simple case folding to this interval.
+ ///
+ /// NOTE(review): presumably any additional intervals produced by folding
+ /// are appended to `intervals` (the interval set passes its own vector
+ /// here and re-canonicalizes afterwards) -- confirm against implementors.
+ fn case_fold_simple(
+ &self,
+ intervals: &mut Vec<Self>,
+ ) -> Result<(), unicode::CaseFoldError>;
+
+ /// Create a new interval.
+ ///
+ /// The two bounds may be given in either order; the smaller one always
+ /// becomes the lower bound.
+ fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
+     let (low, high) =
+         if lower <= upper { (lower, upper) } else { (upper, lower) };
+     let mut interval = Self::default();
+     interval.set_lower(low);
+     interval.set_upper(high);
+     interval
+ }
+
+ /// Union the given overlapping range into this range.
+ ///
+ /// If the two ranges aren't contiguous, then this returns `None`.
+ /// Otherwise the result spans from the smaller lower bound to the larger
+ /// upper bound.
+ fn union(&self, other: &Self) -> Option<Self> {
+     if self.is_contiguous(other) {
+         let lower = cmp::min(self.lower(), other.lower());
+         let upper = cmp::max(self.upper(), other.upper());
+         Some(Self::create(lower, upper))
+     } else {
+         None
+     }
+ }
+
+ /// Intersect this range with the given range and return the result.
+ ///
+ /// If the intersection is empty, then this returns `None`.
+ fn intersect(&self, other: &Self) -> Option<Self> {
+     // The overlap starts at the larger lower bound and ends at the
+     // smaller upper bound.
+     let start = cmp::max(self.lower(), other.lower());
+     let end = cmp::min(self.upper(), other.upper());
+     match start <= end {
+         true => Some(Self::create(start, end)),
+         false => None,
+     }
+ }
+
+ /// Subtract the given range from this range and return the resulting
+ /// ranges.
+ ///
+ /// If subtraction would result in an empty range, then no ranges are
+ /// returned. When only one range remains, it is always returned in the
+ /// first position of the pair.
+ fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
+     if self.is_subset(other) {
+         return (None, None);
+     }
+     if self.is_intersection_empty(other) {
+         return (Some(self.clone()), None);
+     }
+     // The part of `self` that lies strictly below `other`, if any.
+     let below = if other.lower() > self.lower() {
+         Some(Self::create(self.lower(), other.lower().decrement()))
+     } else {
+         None
+     };
+     // The part of `self` that lies strictly above `other`, if any.
+     let above = if other.upper() < self.upper() {
+         Some(Self::create(other.upper().increment(), self.upper()))
+     } else {
+         None
+     };
+     // We know this because !self.is_subset(other) and the ranges have
+     // a non-empty intersection.
+     assert!(below.is_some() || above.is_some());
+     match below {
+         Some(_) => (below, above),
+         None => (above, None),
+     }
+ }
+
+ /// Compute the symmetric difference of the given range and this range.
+ /// This returns the union of the two ranges minus its intersection.
+ ///
+ /// If the ranges have no union (they are not contiguous) or no
+ /// intersection, both ranges are returned unchanged.
+ fn symmetric_difference(
+     &self,
+     other: &Self,
+ ) -> (Option<Self>, Option<Self>) {
+     match (self.union(other), self.intersect(other)) {
+         (Some(union), Some(intersection)) => {
+             union.difference(&intersection)
+         }
+         _ => (Some(self.clone()), Some(other.clone())),
+     }
+ }
+
+ /// Returns true if and only if the two ranges are contiguous. Two ranges
+ /// are contiguous if and only if the ranges are either overlapping or
+ /// adjacent.
+ fn is_contiguous(&self, other: &Self) -> bool {
+     // Compare as `u32` so that "adjacent" can be expressed as `+ 1`.
+     let highest_lower =
+         cmp::max(self.lower().as_u32(), other.lower().as_u32());
+     let lowest_upper =
+         cmp::min(self.upper().as_u32(), other.upper().as_u32());
+     highest_lower <= lowest_upper.saturating_add(1)
+ }
+
+ /// Returns true if and only if the intersection of this range and the
+ /// other range is empty.
+ fn is_intersection_empty(&self, other: &Self) -> bool {
+     let highest_lower = cmp::max(self.lower(), other.lower());
+     let lowest_upper = cmp::min(self.upper(), other.upper());
+     highest_lower > lowest_upper
+ }
+
+ /// Returns true if and only if this range is a subset of the other range.
+ ///
+ /// That is, both endpoints of this range lie within `other`.
+ fn is_subset(&self, other: &Self) -> bool {
+     let within_other = |bound: Self::Bound| {
+         other.lower() <= bound && bound <= other.upper()
+     };
+     within_other(self.lower()) && within_other(self.upper())
+ }
+}
+
+ /// An endpoint of an interval.
+ ///
+ /// Implemented in this module for `u8` and `char`.
+ pub trait Bound:
+ Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
+ {
+ /// Returns the smallest possible bound.
+ fn min_value() -> Self;
+ /// Returns the largest possible bound.
+ fn max_value() -> Self;
+ /// Returns this bound converted to a `u32`.
+ fn as_u32(self) -> u32;
+ /// Returns the bound immediately after this one. The implementations in
+ /// this module panic if there is no such bound.
+ fn increment(self) -> Self;
+ /// Returns the bound immediately before this one. The implementations in
+ /// this module panic if there is no such bound.
+ fn decrement(self) -> Self;
+ }
+
+ // Byte bounds cover the full `u8` range. Stepping past either end of the
+ // range panics.
+ impl Bound for u8 {
+     fn min_value() -> Self {
+         u8::MIN
+     }
+
+     fn max_value() -> Self {
+         u8::MAX
+     }
+
+     fn as_u32(self) -> u32 {
+         u32::from(self)
+     }
+
+     fn increment(self) -> Self {
+         let next = self.checked_add(1);
+         next.unwrap()
+     }
+
+     fn decrement(self) -> Self {
+         let prev = self.checked_sub(1);
+         prev.unwrap()
+     }
+ }
+
+ // Character bounds cover every Unicode scalar value. Stepping hops over the
+ // surrogate gap (U+D800 through U+DFFF), which contains no valid `char`.
+ // Stepping past either end of the range panics.
+ impl Bound for char {
+     fn min_value() -> Self {
+         '\x00'
+     }
+
+     fn max_value() -> Self {
+         '\u{10FFFF}'
+     }
+
+     fn as_u32(self) -> u32 {
+         u32::from(self)
+     }
+
+     fn increment(self) -> Self {
+         if self == '\u{D7FF}' {
+             // Last scalar value before the surrogate gap.
+             return '\u{E000}';
+         }
+         let next = (self as u32).checked_add(1).unwrap();
+         char::from_u32(next).unwrap()
+     }
+
+     fn decrement(self) -> Self {
+         if self == '\u{E000}' {
+             // First scalar value after the surrogate gap.
+             return '\u{D7FF}';
+         }
+         let prev = (self as u32).checked_sub(1).unwrap();
+         char::from_u32(prev).unwrap()
+     }
+ }
+
+// Tests for interval sets are written in src/hir.rs against the public API.
diff --git a/third_party/rust/regex-syntax/src/hir/literal/mod.rs b/third_party/rust/regex-syntax/src/hir/literal/mod.rs
new file mode 100644
index 0000000000..fbc5d3c975
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/literal/mod.rs
@@ -0,0 +1,1686 @@
+/*!
+Provides routines for extracting literal prefixes and suffixes from an `Hir`.
+*/
+
+use std::cmp;
+use std::fmt;
+use std::iter;
+use std::mem;
+use std::ops;
+
+use crate::hir::{self, Hir, HirKind};
+
+/// A set of literal byte strings extracted from a regular expression.
+///
+/// Every member of the set is a `Literal`, which is represented by a
+/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
+/// said to be either *complete* or *cut*. A complete literal means that
+/// it extends until the beginning (or end) of the regular expression. In
+/// some circumstances, this can be used to indicate a match in the regular
+/// expression.
+///
+/// A key aspect of literal extraction is knowing when to stop. It is not
+/// feasible to blindly extract all literals from a regular expression, even if
+/// there are finitely many. For example, the regular expression `[0-9]{10}`
+/// has `10^10` distinct literals. For this reason, literal extraction is
+/// bounded to some low number by default using heuristics, but the limits can
+/// be tweaked.
+///
+/// **WARNING**: Literal extraction uses stack space proportional to the size
+/// of the `Hir` expression. At some point, this drawback will be eliminated.
+/// To protect yourself, set a reasonable
+/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
+/// This is done for you by default.
+#[derive(Clone, Eq, PartialEq)]
+pub struct Literals {
+ // The members of the set; their order is unspecified.
+ lits: Vec<Literal>,
+ // Approximate limit, in bytes, on the total size of the set
+ // (`Literals::empty` uses 250).
+ limit_size: usize,
+ // Largest number of items a character or byte class may have and still
+ // be expanded into literals (`Literals::empty` uses 10).
+ limit_class: usize,
+}
+
+/// A single member of a set of literals extracted from a regular expression.
+///
+/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
+/// and `Vec` operations are available.
+///
+/// NOTE(review): `Ord` is derived here, so it compares `v` and then `cut`,
+/// whereas the hand-written `PartialEq` and `PartialOrd` impls in this file
+/// compare only the bytes in `v`. Confirm that this asymmetry is intended.
+#[derive(Clone, Eq, Ord)]
+pub struct Literal {
+ // The raw bytes of this literal (not necessarily valid UTF-8).
+ v: Vec<u8>,
+ // True once this literal has been cut.
+ cut: bool,
+}
+
+impl Literals {
+    /// Creates a set with no members and the default limits: a size limit
+    /// of 250 bytes and a class limit of 10 items.
+    pub fn empty() -> Literals {
+        let lits = Vec::new();
+        Literals { lits, limit_size: 250, limit_class: 10 }
+    }
+
+    /// Extracts literal prefixes from `expr` into a fresh set that uses the
+    /// default limits.
+    pub fn prefixes(expr: &Hir) -> Literals {
+        let mut extracted = Literals::empty();
+        // The success flag is deliberately ignored: when the union fails the
+        // set is returned without any members.
+        let _ = extracted.union_prefixes(expr);
+        extracted
+    }
+
+    /// Extracts literal suffixes from `expr` into a fresh set that uses the
+    /// default limits.
+    pub fn suffixes(expr: &Hir) -> Literals {
+        let mut extracted = Literals::empty();
+        // The success flag is deliberately ignored: when the union fails the
+        // set is returned without any members.
+        let _ = extracted.union_suffixes(expr);
+        extracted
+    }
+
+ /// Returns the approximate size limit (in bytes) of this set.
+ pub fn limit_size(&self) -> usize {
+ self.limit_size
+ }
+
+ /// Sets the approximate size limit (in bytes) of this set and returns
+ /// `self` so that calls can be chained.
+ ///
+ /// If extracting a literal would put the set over this limit, then
+ /// extraction stops.
+ ///
+ /// The new limit only applies to additions to this set. Existing
+ /// members remain unchanged, even if the set exceeds the new limit.
+ pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
+ self.limit_size = size;
+ self
+ }
+
+ /// Returns the character class size limit for this set.
+ pub fn limit_class(&self) -> usize {
+ self.limit_class
+ }
+
+ /// Limits the size of character (or byte) classes considered and returns
+ /// `self` so that calls can be chained.
+ ///
+ /// A value of `0` prevents all character classes from being considered.
+ ///
+ /// This limit also applies to case insensitive literals, since each
+ /// character in the case insensitive literal is converted to a class, and
+ /// then case folded.
+ ///
+ /// The new limit only applies to additions to this set. Existing
+ /// members remain unchanged, even if the set exceeds the new limit.
+ pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
+ self.limit_class = size;
+ self
+ }
+
+ /// Returns the members of this set as a slice. The order of the members
+ /// is unspecified.
+ pub fn literals(&self) -> &[Literal] {
+ &self.lits
+ }
+
+    /// Returns the length of the shortest literal in this set.
+    ///
+    /// Returns `None` if there are no literals in the set.
+    pub fn min_len(&self) -> Option<usize> {
+        self.lits.iter().map(|lit| lit.len()).min()
+    }
+
+    /// Returns true if this set has at least one member and none of its
+    /// members is cut.
+    pub fn all_complete(&self) -> bool {
+        if self.lits.is_empty() {
+            return false;
+        }
+        !self.lits.iter().any(|lit| lit.is_cut())
+    }
+
+    /// Returns true if at least one member of this set is complete (that
+    /// is, not cut).
+    pub fn any_complete(&self) -> bool {
+        let every_member_cut = self.lits.iter().all(|lit| lit.is_cut());
+        !every_member_cut
+    }
+
+    /// Returns true if at least one member of this set has zero bytes.
+    pub fn contains_empty(&self) -> bool {
+        self.lits.iter().map(|lit| lit.len()).any(|len| len == 0)
+    }
+
+    /// Returns true if this set has no members or if all of its members are
+    /// empty.
+    pub fn is_empty(&self) -> bool {
+        // `all` is vacuously true for a set without members, which covers
+        // the "no members" case as well.
+        self.lits.iter().all(|lit| lit.is_empty())
+    }
+
+    /// Creates a set without members that carries over this set's size and
+    /// class limits.
+    pub fn to_empty(&self) -> Literals {
+        Literals {
+            lits: vec![],
+            limit_size: self.limit_size,
+            limit_class: self.limit_class,
+        }
+    }
+
+    /// Returns the longest byte string that every member of this set starts
+    /// with.
+    ///
+    /// The result is empty when `is_empty()` is true for this set.
+    pub fn longest_common_prefix(&self) -> &[u8] {
+        if self.is_empty() {
+            return &[];
+        }
+        let first = &self.lits[0];
+        // Shrink the candidate length to the shortest agreement between the
+        // first member and each of the others.
+        let shared = self.lits[1..].iter().fold(first.len(), |shared, lit| {
+            let agree = lit
+                .iter()
+                .zip(first.iter())
+                .take_while(|&(a, b)| a == b)
+                .count();
+            cmp::min(shared, agree)
+        });
+        &first[..shared]
+    }
+
+    /// Returns the longest byte string that every member of this set ends
+    /// with.
+    ///
+    /// The result is empty when `is_empty()` is true for this set.
+    pub fn longest_common_suffix(&self) -> &[u8] {
+        if self.is_empty() {
+            return &[];
+        }
+        let first = &self.lits[0];
+        // Compare from the back: shrink the candidate length to the shortest
+        // agreement between the first member and each of the others.
+        let shared = self.lits[1..].iter().fold(first.len(), |shared, lit| {
+            let agree = lit
+                .iter()
+                .rev()
+                .zip(first.iter().rev())
+                .take_while(|&(a, b)| a == b)
+                .count();
+            cmp::min(shared, agree)
+        });
+        &first[first.len() - shared..]
+    }
+
+    /// Builds a new set in which `num_bytes` bytes have been removed from
+    /// the end of every literal.
+    ///
+    /// Returns `None` if the set has no members or if trimming would leave
+    /// any literal without bytes.
+    ///
+    /// Every literal in the result is cut, and the result is sorted with
+    /// duplicates removed.
+    pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
+        match self.min_len() {
+            Some(shortest) if shortest > num_bytes => {}
+            _ => return None,
+        }
+        let mut trimmed = self.to_empty();
+        trimmed.lits = self
+            .lits
+            .iter()
+            .map(|original| {
+                let mut lit = original.clone();
+                let keep = lit.len() - num_bytes;
+                lit.truncate(keep);
+                lit.cut();
+                lit
+            })
+            .collect();
+        trimmed.lits.sort();
+        trimmed.lits.dedup();
+        Some(trimmed)
+    }
+
+ /// Returns a new set of prefixes of this set of literals that are
+ /// guaranteed to be unambiguous.
+ ///
+ /// Any substring match with a member of the returned set is guaranteed
+ /// to never overlap with a substring match of another member of the set
+ /// at the same starting position.
+ ///
+ /// Given any two members of the returned set, neither is a substring of
+ /// the other.
+ pub fn unambiguous_prefixes(&self) -> Literals {
+ if self.lits.is_empty() {
+ return self.to_empty();
+ }
+ // `old` is a worklist of candidates; `new` collects the accepted ones.
+ let mut old = self.lits.to_vec();
+ let mut new = self.to_empty();
+ 'OUTER: while let Some(mut candidate) = old.pop() {
+ if candidate.is_empty() {
+ continue;
+ }
+ if new.lits.is_empty() {
+ new.lits.push(candidate);
+ continue;
+ }
+ for lit2 in &mut new.lits {
+ // Members that were blanked out earlier are skipped here and
+ // removed after the loop.
+ if lit2.is_empty() {
+ continue;
+ }
+ if &candidate == lit2 {
+ // If the literal is already in the set, then we can
+ // just drop it. But make sure that cut literals are
+ // infectious!
+ candidate.cut = candidate.cut || lit2.cut;
+ lit2.cut = candidate.cut;
+ continue 'OUTER;
+ }
+ if candidate.len() < lit2.len() {
+ // The candidate occurs inside the longer `lit2` at offset `i`:
+ // cut the candidate, re-queue the part of `lit2` before the
+ // occurrence as a cut literal, and blank out `lit2`.
+ if let Some(i) = position(&candidate, &lit2) {
+ candidate.cut();
+ let mut lit3 = lit2.clone();
+ lit3.truncate(i);
+ lit3.cut();
+ old.push(lit3);
+ lit2.clear();
+ }
+ } else if let Some(i) = position(&lit2, &candidate) {
+ // The mirror case: `lit2` occurs inside the candidate, so the
+ // part of the candidate before the occurrence is re-queued
+ // and the candidate itself is blanked out.
+ lit2.cut();
+ let mut new_candidate = candidate.clone();
+ new_candidate.truncate(i);
+ new_candidate.cut();
+ old.push(new_candidate);
+ candidate.clear();
+ }
+ // Oops, the candidate is already represented in the set.
+ if candidate.is_empty() {
+ continue 'OUTER;
+ }
+ }
+ new.lits.push(candidate);
+ }
+ // Drop the blanked-out members, then normalize the result.
+ new.lits.retain(|lit| !lit.is_empty());
+ new.lits.sort();
+ new.lits.dedup();
+ new
+ }
+
+    /// Returns a new set of suffixes of this set of literals that are
+    /// guaranteed to be unambiguous.
+    ///
+    /// Any substring match with a member of the returned set is guaranteed
+    /// to never overlap with a substring match of another member of the set
+    /// at the same ending position.
+    ///
+    /// Given any two members of the returned set, neither is a substring of
+    /// the other.
+    pub fn unambiguous_suffixes(&self) -> Literals {
+        // Reuse the prefix algorithm on a reversed copy of the set, then
+        // reverse the result back. This costs an extra clone.
+        let mut reversed = self.clone();
+        reversed.reverse();
+        let mut result = reversed.unambiguous_prefixes();
+        result.reverse();
+        result
+    }
+
+    /// Adds the prefixes extracted from `expr` to this set.
+    ///
+    /// Returns false, without adding anything, if no usable prefixes were
+    /// found (the extracted set is empty or contains the empty string) or if
+    /// the union would exceed this set's size limit.
+    ///
+    /// Note that prefix literals extracted from `expr` are said to be complete
+    /// if and only if the literal extends from the beginning of `expr` to the
+    /// end of `expr`.
+    pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
+        let mut extracted = self.to_empty();
+        prefixes(expr, &mut extracted);
+        if extracted.is_empty() || extracted.contains_empty() {
+            return false;
+        }
+        self.union(extracted)
+    }
+
+    /// Adds the suffixes extracted from `expr` to this set.
+    ///
+    /// Returns false, without adding anything, if no usable suffixes were
+    /// found (the extracted set is empty or contains the empty string) or if
+    /// the union would exceed this set's size limit.
+    ///
+    /// Note that suffix literals extracted from `expr` are said to be complete
+    /// if and only if the literal extends from the end of `expr` to the
+    /// beginning of `expr`.
+    pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
+        let mut extracted = self.to_empty();
+        suffixes(expr, &mut extracted);
+        // Extraction collects each suffix back to front, so flip every
+        // literal before it is inspected or stored.
+        extracted.reverse();
+        if extracted.is_empty() || extracted.contains_empty() {
+            return false;
+        }
+        self.union(extracted)
+    }
+
+    /// Merges the members of `lits` into this set.
+    ///
+    /// Returns false and leaves this set untouched if the combined byte
+    /// count would exceed this set's size limit; returns true otherwise.
+    ///
+    /// When `lits` is empty (in the sense of `is_empty`), a single empty
+    /// literal is added instead of its members.
+    pub fn union(&mut self, lits: Literals) -> bool {
+        let combined = self.num_bytes() + lits.num_bytes();
+        if combined > self.limit_size {
+            return false;
+        }
+        if !lits.is_empty() {
+            self.lits.extend(lits.lits);
+        } else {
+            self.lits.push(Literal::empty());
+        }
+        true
+    }
+
+ /// Extends this set with another set.
+ ///
+ /// The set of literals is extended via a cross product: every complete
+ /// member of this set is combined with every member of `lits`. Cut members
+ /// of this set are kept as they are.
+ ///
+ /// If a cross product would cause this set to exceed its limits, then the
+ /// cross product is skipped and it returns false. Otherwise, if the cross
+ /// product succeeds, it returns true.
+ pub fn cross_product(&mut self, lits: &Literals) -> bool {
+ if lits.is_empty() {
+ return true;
+ }
+ // Check that we make sure we stay in our limits.
+ let mut size_after;
+ if self.is_empty() || !self.any_complete() {
+ // Nothing here can be extended, so the members of `lits` are
+ // simply added on top of what is already stored.
+ size_after = self.num_bytes();
+ for lits_lit in lits.literals() {
+ size_after += lits_lit.len();
+ }
+ } else {
+ // Cut members keep their size; every complete member is combined
+ // with every member of `lits`.
+ size_after = self.lits.iter().fold(0, |accum, lit| {
+ accum + if lit.is_cut() { lit.len() } else { 0 }
+ });
+ for lits_lit in lits.literals() {
+ for self_lit in self.literals() {
+ if !self_lit.is_cut() {
+ size_after += self_lit.len() + lits_lit.len();
+ }
+ }
+ }
+ }
+ if size_after > self.limit_size {
+ return false;
+ }
+
+ // Take the complete members out of the set; with none available, start
+ // from a single empty literal.
+ let mut base = self.remove_complete();
+ if base.is_empty() {
+ base = vec![Literal::empty()];
+ }
+ for lits_lit in lits.literals() {
+ for mut self_lit in base.clone() {
+ self_lit.extend(&**lits_lit);
+ // The combined literal takes its cut status from `lits_lit`.
+ self_lit.cut = lits_lit.cut;
+ self.lits.push(self_lit);
+ }
+ }
+ true
+ }
+
+ /// Extends each complete literal in this set with the bytes given.
+ ///
+ /// If the set has no members, then the given literal is added to the set
+ /// (truncated to the size limit and cut if it does not fit entirely).
+ ///
+ /// If adding any number of bytes to all members of this set causes a limit
+ /// to be exceeded, then no bytes are added and false is returned. If only
+ /// a proper prefix of `bytes` is added, then all extended literals are
+ /// cut.
+ pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
+ // N.B. This could be implemented by simply calling cross_product with
+ // a literal set containing just `bytes`, but we can be smarter about
+ // taking shorter prefixes of `bytes` if they'll fit.
+ if bytes.is_empty() {
+ return true;
+ }
+ if self.lits.is_empty() {
+ // Store as much of `bytes` as the size limit allows; the literal is
+ // cut (and false is returned) when `bytes` had to be truncated.
+ let i = cmp::min(self.limit_size, bytes.len());
+ self.lits.push(Literal::new(bytes[..i].to_owned()));
+ self.lits[0].cut = i < bytes.len();
+ return !self.lits[0].is_cut();
+ }
+ let size = self.num_bytes();
+ if size + self.lits.len() >= self.limit_size {
+ return false;
+ }
+ // `i` is the number of leading bytes of `bytes` to append. It grows
+ // while `size + i * self.lits.len()` is within the limit and `bytes`
+ // has more to give.
+ let mut i = 1;
+ while size + (i * self.lits.len()) <= self.limit_size
+ && i < bytes.len()
+ {
+ i += 1;
+ }
+ for lit in &mut self.lits {
+ // Cut literals are never extended.
+ if !lit.is_cut() {
+ lit.extend(&bytes[..i]);
+ if i < bytes.len() {
+ lit.cut();
+ }
+ }
+ }
+ true
+ }
+
+    /// Appends `lit` to this set as a new member.
+    ///
+    /// Returns false, without adding it, if the set's total byte count would
+    /// then exceed its size limit.
+    pub fn add(&mut self, lit: Literal) -> bool {
+        let size_after = self.num_bytes() + lit.len();
+        if size_after > self.limit_size {
+            return false;
+        }
+        self.lits.push(lit);
+        true
+    }
+
+ /// Extends each complete literal in this set with the character class
+ /// given, appending the UTF-8 bytes of each character in forward order.
+ ///
+ /// Returns false if the character class was too big to add.
+ pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
+ self._add_char_class(cls, false)
+ }
+
+ /// Extends each complete literal in this set with the character class
+ /// given, writing the UTF-8 bytes of each character in reverse.
+ ///
+ /// Returns false if the character class was too big to add.
+ fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
+ self._add_char_class(cls, true)
+ }
+
+    /// Shared implementation of `add_char_class` and
+    /// `add_char_class_reverse`.
+    ///
+    /// Every complete literal is taken out of the set and replaced by one
+    /// extended copy per character in `cls`: the character's UTF-8 bytes are
+    /// appended, in reverse order when `reverse` is true. Cut literals stay
+    /// as they are. If the set has no complete literal, the characters are
+    /// appended to a single empty literal instead.
+    ///
+    /// Returns false, leaving the set unchanged, if the class exceeds the
+    /// limits of this set.
+    fn _add_char_class(
+        &mut self,
+        cls: &hir::ClassUnicode,
+        reverse: bool,
+    ) -> bool {
+        use std::char;
+
+        if self.class_exceeds_limits(cls_char_count(cls)) {
+            return false;
+        }
+        let mut base = self.remove_complete();
+        if base.is_empty() {
+            base = vec![Literal::empty()];
+        }
+        for r in cls.iter() {
+            let (s, e) = (r.start as u32, r.end as u32 + 1);
+            // `filter_map` drops the values in the range that are not valid
+            // `char`s.
+            for c in (s..e).filter_map(char::from_u32) {
+                // Encode the character once, on the stack, instead of
+                // building a fresh `String` for every base literal.
+                let mut buf = [0u8; 4];
+                let len = c.encode_utf8(&mut buf).len();
+                if reverse {
+                    buf[..len].reverse();
+                }
+                let encoded = &buf[..len];
+                for lit in &base {
+                    let mut lit = lit.clone();
+                    lit.extend_from_slice(encoded);
+                    self.lits.push(lit);
+                }
+            }
+        }
+        true
+    }
+
+    /// Replaces every complete literal in this set by one extended copy per
+    /// byte in `cls`. Cut literals stay as they are. If the set has no
+    /// complete literal, the bytes are appended to a single empty literal.
+    ///
+    /// Returns false, leaving the set unchanged, if the byte class was too
+    /// big to add.
+    pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
+        if self.class_exceeds_limits(cls_byte_count(cls)) {
+            return false;
+        }
+        let mut base = self.remove_complete();
+        if base.is_empty() {
+            base = vec![Literal::empty()];
+        }
+        for range in cls.iter() {
+            let (first, last) = (range.start as u32, range.end as u32);
+            for byte in (first..=last).map(|b| b as u8) {
+                let extended = base.iter().map(|lit| {
+                    let mut lit = lit.clone();
+                    lit.push(byte);
+                    lit
+                });
+                self.lits.extend(extended);
+            }
+        }
+        true
+    }
+
+    /// Marks every member of this set as cut. A cut member is never
+    /// extended again.
+    pub fn cut(&mut self) {
+        self.lits.iter_mut().for_each(|lit| lit.cut());
+    }
+
+    /// Reverses the bytes of every member in place. The order of the
+    /// members themselves is not changed.
+    pub fn reverse(&mut self) {
+        self.lits.iter_mut().for_each(|lit| lit.reverse());
+    }
+
+ /// Removes all members from this set. The limits are left unchanged.
+ pub fn clear(&mut self) {
+ self.lits.clear();
+ }
+
+    /// Takes every complete literal out of this set and returns them.
+    ///
+    /// The cut literals stay behind; both groups keep their relative order.
+    fn remove_complete(&mut self) -> Vec<Literal> {
+        let all = mem::replace(&mut self.lits, vec![]);
+        let (cut, complete): (Vec<Literal>, Vec<Literal>) =
+            all.into_iter().partition(|lit| lit.is_cut());
+        self.lits = cut;
+        complete
+    }
+
+    /// Returns the sum of the byte lengths of all members of this set.
+    fn num_bytes(&self) -> usize {
+        self.lits.iter().map(|lit| lit.len()).sum()
+    }
+
+    /// Returns true if adding a class with `size` items would push this set
+    /// past its class limit or its size limit.
+    ///
+    /// `size` should be the number of items in the class.
+    fn class_exceeds_limits(&self, size: usize) -> bool {
+        if size > self.limit_class {
+            return true;
+        }
+        // This is an approximation since codepoints in a char class can
+        // encode to 1-4 bytes.
+        let projected = if self.lits.is_empty() {
+            size
+        } else {
+            // Cut literals are never extended, so only complete ones
+            // contribute to the estimate.
+            self.lits
+                .iter()
+                .filter(|lit| !lit.is_cut())
+                .map(|lit| (lit.len() + 1) * size)
+                .sum::<usize>()
+        };
+        projected > self.limit_size
+    }
+}
+
+/// Extracts prefix literals from `expr` by recursing over its structure and
+/// extending the members of `lits` in place.
+///
+/// Literals and classes extend the current members; groups, repetitions,
+/// concatenations and alternations are handled by recursion (through the
+/// `repeat_*` and `alternate_literals` helpers). Every other kind of
+/// expression cuts all members of `lits`.
+fn prefixes(expr: &Hir, lits: &mut Literals) {
+ match *expr.kind() {
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ // Append the UTF-8 encoding of the character.
+ let mut buf = [0; 4];
+ lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ lits.cross_add(&[b]);
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ // A class that could not be added freezes what has been collected.
+ if !lits.add_char_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if !lits.add_byte_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Group(hir::Group { ref hir, .. }) => {
+ prefixes(&**hir, lits);
+ }
+ HirKind::Repetition(ref x) => match x.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ repeat_zero_or_one_literals(&x.hir, lits, prefixes);
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ repeat_zero_or_more_literals(&x.hir, lits, prefixes);
+ }
+ hir::RepetitionKind::OneOrMore => {
+ repeat_one_or_more_literals(&x.hir, lits, prefixes);
+ }
+ hir::RepetitionKind::Range(ref rng) => {
+ // Normalize the range into a minimum and an optional maximum.
+ let (min, max) = match *rng {
+ hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+ hir::RepetitionRange::AtLeast(m) => (m, None),
+ hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+ };
+ repeat_range_literals(
+ &x.hir, min, max, x.greedy, lits, prefixes,
+ )
+ }
+ },
+ HirKind::Concat(ref es) if es.is_empty() => {}
+ HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
+ HirKind::Concat(ref es) => {
+ for e in es {
+ // A start-of-text anchor is only stepped over while `lits` is
+ // still empty; otherwise extraction stops and freezes.
+ if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
+ if !lits.is_empty() {
+ lits.cut();
+ break;
+ }
+ lits.add(Literal::empty());
+ continue;
+ }
+ let mut lits2 = lits.to_empty();
+ prefixes(e, &mut lits2);
+ if !lits.cross_product(&lits2) || !lits2.any_complete() {
+ // If this expression couldn't yield any literal that
+ // could be extended, then we need to quit. Since we're
+ // short-circuiting, we also need to freeze every member.
+ lits.cut();
+ break;
+ }
+ }
+ }
+ HirKind::Alternation(ref es) => {
+ alternate_literals(es, lits, prefixes);
+ }
+ _ => lits.cut(),
+ }
+}
+
+/// Extracts suffix literals from `expr` by recursing over its structure and
+/// extending the members of `lits` in place.
+///
+/// This mirrors `prefixes`, but walks concatenations from their last element
+/// to their first and appends the bytes of each character in reverse order.
+/// `Literals::union_suffixes` reverses every literal once extraction is done.
+/// Every kind of expression that is not handled explicitly cuts all members
+/// of `lits`.
+fn suffixes(expr: &Hir, lits: &mut Literals) {
+ match *expr.kind() {
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ // Append the UTF-8 encoding of the character, last byte first.
+ let mut buf = [0u8; 4];
+ let i = c.encode_utf8(&mut buf).len();
+ let buf = &mut buf[..i];
+ buf.reverse();
+ lits.cross_add(buf);
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ lits.cross_add(&[b]);
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ // A class that could not be added freezes what has been collected.
+ if !lits.add_char_class_reverse(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if !lits.add_byte_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Group(hir::Group { ref hir, .. }) => {
+ suffixes(&**hir, lits);
+ }
+ HirKind::Repetition(ref x) => match x.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ repeat_zero_or_one_literals(&x.hir, lits, suffixes);
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ repeat_zero_or_more_literals(&x.hir, lits, suffixes);
+ }
+ hir::RepetitionKind::OneOrMore => {
+ repeat_one_or_more_literals(&x.hir, lits, suffixes);
+ }
+ hir::RepetitionKind::Range(ref rng) => {
+ // Normalize the range into a minimum and an optional maximum.
+ let (min, max) = match *rng {
+ hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+ hir::RepetitionRange::AtLeast(m) => (m, None),
+ hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+ };
+ repeat_range_literals(
+ &x.hir, min, max, x.greedy, lits, suffixes,
+ )
+ }
+ },
+ HirKind::Concat(ref es) if es.is_empty() => {}
+ HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
+ HirKind::Concat(ref es) => {
+ // Walk the concatenation from its last element to its first.
+ for e in es.iter().rev() {
+ // An end-of-text anchor is only stepped over while `lits` is
+ // still empty; otherwise extraction stops and freezes.
+ if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
+ if !lits.is_empty() {
+ lits.cut();
+ break;
+ }
+ lits.add(Literal::empty());
+ continue;
+ }
+ let mut lits2 = lits.to_empty();
+ suffixes(e, &mut lits2);
+ if !lits.cross_product(&lits2) || !lits2.any_complete() {
+ // If this expression couldn't yield any literal that
+ // could be extended, then we need to quit. Since we're
+ // short-circuiting, we also need to freeze every member.
+ lits.cut();
+ break;
+ }
+ }
+ }
+ HirKind::Alternation(ref es) => {
+ alternate_literals(es, lits, suffixes);
+ }
+ _ => lits.cut(),
+ }
+}
+
+/// Handles `e?` by handing the extraction callback `f` a greedy `e*` built
+/// from a clone of `e`.
+fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
+    lits: &mut Literals,
+    mut f: F,
+) {
+    let as_zero_or_more = Hir::repetition(hir::Repetition {
+        kind: hir::RepetitionKind::ZeroOrMore,
+        // FIXME: Our literal extraction doesn't care about greediness.
+        // Which is partially why we're treating 'e?' as 'e*'. Namely,
+        // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
+        // [Complete(a), Complete(ab)] because of the non-greediness.
+        greedy: true,
+        hir: Box::new(e.clone()),
+    });
+    f(&as_zero_or_more, lits);
+}
+
+/// Handles `e*`.
+///
+/// The literals of `e` are extracted with half of the size limit of `lits`
+/// and crossed with a copy of `lits`. That copy is then cut, given an extra
+/// empty literal, and merged back into `lits`. Whenever a step fails (no
+/// literals for `e`, cross product or union over the limit), `lits` is cut
+/// instead.
+fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
+    lits: &mut Literals,
+    mut f: F,
+) {
+    let mut inner = lits.to_empty();
+    inner.set_limit_size(lits.limit_size() / 2);
+    f(e, &mut inner);
+
+    let mut extended = lits.clone();
+    let usable = !inner.is_empty() && extended.cross_product(&inner);
+    if !usable {
+        lits.cut();
+        return;
+    }
+    extended.cut();
+    extended.add(Literal::empty());
+    let merged = lits.union(extended);
+    if !merged {
+        lits.cut();
+    }
+}
+
+/// Handles `e+`: extracts the literals of a single `e` into `lits` with the
+/// callback `f`, then cuts every member of `lits`.
+fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
+ e: &Hir,
+ lits: &mut Literals,
+ mut f: F,
+) {
+ f(e, lits);
+ lits.cut();
+}
+
+/// Handles counted repetitions of `e` with a minimum of `min` and an
+/// optional maximum of `max` (`None` means unbounded).
+///
+/// With `min == 0` the repetition is passed to `f` as `e*`. Otherwise `e` is
+/// concatenated with itself `min` times (capped at the size limit of `lits`)
+/// and passed to `f`. The result is cut if the cap applied, if an empty
+/// literal was produced, or if more than `min` repetitions are possible.
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
+    min: u32,
+    max: Option<u32>,
+    greedy: bool,
+    lits: &mut Literals,
+    mut f: F,
+) {
+    if min == 0 {
+        // This is a bit conservative. If `max` is set, then we could
+        // treat this as a finite set of alternations. For now, we
+        // just treat it as `e*`.
+        let as_zero_or_more = Hir::repetition(hir::Repetition {
+            kind: hir::RepetitionKind::ZeroOrMore,
+            greedy,
+            hir: Box::new(e.clone()),
+        });
+        f(&as_zero_or_more, lits);
+        return;
+    }
+
+    // From here on `min >= 1`, so no further check on `min` is needed.
+    let n = cmp::min(lits.limit_size, min as usize);
+    let es = iter::repeat(e.clone()).take(n).collect();
+    f(&Hir::concat(es), lits);
+    if n < min as usize || lits.contains_empty() {
+        lits.cut();
+    }
+    // More than `min` repetitions may follow, so the literals cannot be
+    // complete.
+    if max.map_or(true, |max| min < max) {
+        lits.cut();
+    }
+}
+
+/// Handles an alternation: extracts the literals of every alternate in `es`
+/// with the callback `f`, unions them, and crosses the union with `lits`.
+///
+/// Each alternate is extracted with one fifth of the size limit of `lits`.
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+ es: &[Hir],
+ lits: &mut Literals,
+ mut f: F,
+) {
+ let mut lits2 = lits.to_empty();
+ for e in es {
+ let mut lits3 = lits.to_empty();
+ lits3.set_limit_size(lits.limit_size() / 5);
+ f(e, &mut lits3);
+ if lits3.is_empty() || !lits2.union(lits3) {
+ // If we couldn't find literals for *any* of the
+ // alternates, then the entire alternation has to be thrown
+ // away and any existing members must be frozen. Similarly,
+ // if the union couldn't complete, stop and freeze.
+ lits.cut();
+ return;
+ }
+ }
+ if !lits.cross_product(&lits2) {
+ lits.cut();
+ }
+}
+
+// Debug output lists the members followed by both limits.
+impl fmt::Debug for Literals {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_struct("Literals");
+        out.field("lits", &self.lits);
+        out.field("limit_size", &self.limit_size);
+        out.field("limit_class", &self.limit_class);
+        out.finish()
+    }
+}
+
+impl Literal {
+    /// Creates a complete (not cut) literal from the given bytes.
+    pub fn new(bytes: Vec<u8>) -> Literal {
+        Literal { v: bytes, cut: false }
+    }
+
+    /// Creates a complete (not cut) literal without any bytes.
+    pub fn empty() -> Literal {
+        Literal::new(Vec::new())
+    }
+
+    /// Returns true if this literal has been cut.
+    pub fn is_cut(&self) -> bool {
+        self.cut
+    }
+
+    /// Marks this literal as cut.
+    pub fn cut(&mut self) {
+        self.cut = true;
+    }
+}
+
+// Equality looks at the bytes only; the `cut` flag is ignored.
+impl PartialEq for Literal {
+    fn eq(&self, other: &Literal) -> bool {
+        self.v.as_slice() == other.v.as_slice()
+    }
+}
+
+// Ordering looks at the bytes only; the `cut` flag is ignored.
+impl PartialOrd for Literal {
+    fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
+        // Byte vectors are totally ordered, so the result is always `Some`.
+        Some(self.v.cmp(&other.v))
+    }
+}
+
+// Debug output is `Cut(..)` or `Complete(..)` around the escaped bytes.
+impl fmt::Debug for Literal {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let shown = escape_unicode(&self.v);
+        match self.is_cut() {
+            true => write!(f, "Cut({})", shown),
+            false => write!(f, "Complete({})", shown),
+        }
+    }
+}
+
+// Gives borrowed access to the bytes of the literal.
+impl AsRef<[u8]> for Literal {
+ fn as_ref(&self) -> &[u8] {
+ &self.v
+ }
+}
+
+// Lets a `Literal` be used wherever a `&Vec<u8>` (or `&[u8]`) is expected.
+impl ops::Deref for Literal {
+ type Target = Vec<u8>;
+ fn deref(&self) -> &Vec<u8> {
+ &self.v
+ }
+}
+
+// Lets the bytes be modified through `Vec` methods; `cut` is not touched.
+impl ops::DerefMut for Literal {
+ fn deref_mut(&mut self) -> &mut Vec<u8> {
+ &mut self.v
+ }
+}
+
+/// Returns the offset of the first occurrence of `needle` in `haystack`, or
+/// `None` if `needle` does not occur.
+///
+/// An empty `needle` is found at offset 0 of every haystack, including an
+/// empty one.
+fn position(needle: &[u8], haystack: &[u8]) -> Option<usize> {
+    // `windows` panics on a window size of zero, so answer that case here.
+    if needle.is_empty() {
+        return Some(0);
+    }
+    haystack.windows(needle.len()).position(|window| window == needle)
+}
+
+/// Renders `bytes` for display.
+///
+/// Valid UTF-8 is shown as text and anything else is escaped byte by byte.
+/// In either case whitespace characters are replaced by an escape sequence:
+/// the ASCII byte escape, `\u{....}` up to U+FFFF, or `\U{........}` above.
+fn escape_unicode(bytes: &[u8]) -> String {
+    let shown = ::std::str::from_utf8(bytes)
+        .map(|text| text.to_string())
+        .unwrap_or_else(|_| escape_bytes(bytes));
+    let mut out = String::new();
+    for ch in shown.chars() {
+        if !ch.is_whitespace() {
+            out.push(ch);
+            continue;
+        }
+        let code = ch as u32;
+        let escaped = match code {
+            0..=0x7F => escape_byte(ch as u8),
+            0x80..=0xFFFF => format!(r"\u{{{:04x}}}", code),
+            _ => format!(r"\U{{{:08x}}}", code),
+        };
+        out.push_str(&escaped);
+    }
+    out
+}
+
+/// Escapes every byte of `bytes` with `escape_byte` and joins the results
+/// into one string.
+fn escape_bytes(bytes: &[u8]) -> String {
+    bytes.iter().map(|&byte| escape_byte(byte)).collect()
+}
+
+/// Returns the `std::ascii::escape_default` rendering of `byte` as a string.
+fn escape_byte(byte: u8) -> String {
+    use std::ascii::escape_default;
+
+    // The escape sequence consists of ASCII bytes only, so each byte maps
+    // directly to one character.
+    escape_default(byte).map(|b| b as char).collect()
+}
+
+/// Returns the number of code points covered by the ranges of `cls`, where
+/// each range contributes `end - start + 1`.
+fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
+    let total = cls.iter().fold(0u32, |total, r| {
+        total + 1 + (r.end as u32) - (r.start as u32)
+    });
+    total as usize
+}
+
+/// Returns the number of bytes covered by the ranges of `cls`, where each
+/// range contributes `end - start + 1`.
+fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
+    let total = cls.iter().fold(0u32, |total, r| {
+        total + 1 + (r.end as u32) - (r.start as u32)
+    });
+    total as usize
+}
+
+#[cfg(test)]
+mod tests {
+ use std::fmt;
+
+ use super::{escape_bytes, Literal, Literals};
+ use crate::hir::Hir;
+ use crate::ParserBuilder;
+
+ // To make test failures easier to read.
+ #[derive(Debug, Eq, PartialEq)]
+ struct Bytes(Vec<ULiteral>);
+ #[derive(Debug, Eq, PartialEq)]
+ struct Unicode(Vec<ULiteral>);
+
+ fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
+ let mut ulits = vec![];
+ for blit in blits {
+ ulits
+ .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
+ }
+ ulits
+ }
+
+ fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
+ Literals {
+ lits: it.into_iter().collect(),
+ limit_size: 0,
+ limit_class: 0,
+ }
+ }
+
+ // Needs to be pub for 1.3?
+ #[derive(Clone, Eq, PartialEq)]
+ pub struct ULiteral {
+ v: String,
+ cut: bool,
+ }
+
+ impl ULiteral {
+ fn is_cut(&self) -> bool {
+ self.cut
+ }
+ }
+
+ impl fmt::Debug for ULiteral {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if self.is_cut() {
+ write!(f, "Cut({})", self.v)
+ } else {
+ write!(f, "Complete({})", self.v)
+ }
+ }
+ }
+
+ impl PartialEq<Literal> for ULiteral {
+ fn eq(&self, other: &Literal) -> bool {
+ self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
+ }
+ }
+
+ impl PartialEq<ULiteral> for Literal {
+ fn eq(&self, other: &ULiteral) -> bool {
+ &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
+ }
+ }
+
+ #[allow(non_snake_case)]
+ fn C(s: &'static str) -> ULiteral {
+ ULiteral { v: s.to_owned(), cut: true }
+ }
+ #[allow(non_snake_case)]
+ fn M(s: &'static str) -> ULiteral {
+ ULiteral { v: s.to_owned(), cut: false }
+ }
+
+ fn prefixes(lits: &mut Literals, expr: &Hir) {
+ lits.union_prefixes(expr);
+ }
+
+ fn suffixes(lits: &mut Literals, expr: &Hir) {
+ lits.union_suffixes(expr);
+ }
+
+ macro_rules! assert_lit_eq {
+ ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
+ let expected: Vec<ULiteral> = vec![$($expected_lit),*];
+ let lits = $got_lits;
+ assert_eq!(
+ $which(expected.clone()),
+ $which(escape_lits(lits.literals())));
+ assert_eq!(
+ !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
+ lits.all_complete());
+ assert_eq!(
+ expected.iter().any(|l| !l.is_cut()),
+ lits.any_complete());
+ }};
+ }
+
+ macro_rules! test_lit {
+ ($name:ident, $which:ident, $re:expr) => {
+ test_lit!($name, $which, $re,);
+ };
+ ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
+ #[test]
+ fn $name() {
+ let expr = ParserBuilder::new()
+ .build()
+ .parse($re)
+ .unwrap();
+ let lits = Literals::$which(&expr);
+ assert_lit_eq!(Unicode, lits, $($lit),*);
+
+ let expr = ParserBuilder::new()
+ .allow_invalid_utf8(true)
+ .unicode(false)
+ .build()
+ .parse($re)
+ .unwrap();
+ let lits = Literals::$which(&expr);
+ assert_lit_eq!(Bytes, lits, $($lit),*);
+ }
+ };
+ }
+
+ // ************************************************************************
+ // Tests for prefix literal extraction.
+ // ************************************************************************
+
+ // Elementary tests.
+ test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
+ test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
+ test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
+ #[cfg(feature = "unicode-case")]
+ test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
+ test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
+ test_lit!(
+ pfx_class2,
+ prefixes,
+ "(?u)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x98\\x83")
+ );
+ #[cfg(feature = "unicode-case")]
+ test_lit!(
+ pfx_class3,
+ prefixes,
+ "(?ui)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x85\\xb0"),
+ M("\\xe2\\x98\\x83")
+ );
+ test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
+ test_lit!(
+ pfx_one_lit_casei2,
+ prefixes,
+ "(?i-u)abc",
+ M("ABC"),
+ M("aBC"),
+ M("AbC"),
+ M("abC"),
+ M("ABc"),
+ M("aBc"),
+ M("Abc"),
+ M("abc")
+ );
+ test_lit!(pfx_group1, prefixes, "(a)", M("a"));
+ test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
+ test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
+ test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
+ // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
+ // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
+ // a cut literal.
+ test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
+ test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
+ test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
+ test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
+ test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
+ test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
+ test_lit!(pfx_rep_range1, prefixes, "a{0}");
+ test_lit!(pfx_rep_range2, prefixes, "a{0,}");
+ test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
+ test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
+ test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
+ test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
+ test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
+
+ // Test regexes with concatenations.
+ test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
+ test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
+ test_lit!(
+ pfx_cat3,
+ prefixes,
+ "(?i-u)[ab]z",
+ M("AZ"),
+ M("BZ"),
+ M("aZ"),
+ M("bZ"),
+ M("Az"),
+ M("Bz"),
+ M("az"),
+ M("bz")
+ );
+ test_lit!(
+ pfx_cat4,
+ prefixes,
+ "[ab][yz]",
+ M("ay"),
+ M("by"),
+ M("az"),
+ M("bz")
+ );
+ test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
+ test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
+ test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
+ test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
+ test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
+ test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
+ test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
+ test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
+ test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
+ test_lit!(pfx_cat14, prefixes, "a^", C("a"));
+ test_lit!(pfx_cat15, prefixes, "$a");
+ test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
+ test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
+ test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
+ test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
+
+ // Test regexes with alternations.
+ test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
+ test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
+ test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
+ test_lit!(pfx_alt4, prefixes, "a|b*");
+ test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
+ test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
+ test_lit!(
+ pfx_alt7,
+ prefixes,
+ "(a|b)*c|(a|ab)*c",
+ C("a"),
+ C("b"),
+ M("c"),
+ C("a"),
+ C("ab"),
+ M("c")
+ );
+ test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
+
+ // Test regexes with empty assertions.
+ test_lit!(pfx_empty1, prefixes, "^a", M("a"));
+ test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
+ test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
+ test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
+
+ // Make sure some curious regexes have no prefixes.
+ test_lit!(pfx_nothing1, prefixes, ".");
+ test_lit!(pfx_nothing2, prefixes, "(?s).");
+ test_lit!(pfx_nothing3, prefixes, "^");
+ test_lit!(pfx_nothing4, prefixes, "$");
+ test_lit!(pfx_nothing6, prefixes, "(?m)$");
+ test_lit!(pfx_nothing7, prefixes, r"\b");
+ test_lit!(pfx_nothing8, prefixes, r"\B");
+
+ // Test a few regexes that defeat any prefix literal detection.
+ test_lit!(pfx_defeated1, prefixes, ".a");
+ test_lit!(pfx_defeated2, prefixes, "(?s).a");
+ test_lit!(pfx_defeated3, prefixes, "a*b*c*");
+ test_lit!(pfx_defeated4, prefixes, "a|.");
+ test_lit!(pfx_defeated5, prefixes, ".|a");
+ test_lit!(pfx_defeated6, prefixes, "a|^");
+ test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
+ test_lit!(pfx_defeated8, prefixes, "$a");
+ test_lit!(pfx_defeated9, prefixes, "(?m)$a");
+ test_lit!(pfx_defeated10, prefixes, r"\ba");
+ test_lit!(pfx_defeated11, prefixes, r"\Ba");
+ test_lit!(pfx_defeated12, prefixes, "^*a");
+ test_lit!(pfx_defeated13, prefixes, "^+a");
+
+ test_lit!(
+ pfx_crazy1,
+ prefixes,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ C("Mo\\'"),
+ C("Mu\\'"),
+ C("Moam"),
+ C("Muam")
+ );
+
+ // ************************************************************************
+ // Tests for quitting prefix literal search.
+ // ************************************************************************
+
+ macro_rules! test_exhausted {
+ ($name:ident, $which:ident, $re:expr) => {
+ test_exhausted!($name, $which, $re,);
+ };
+ ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
+ #[test]
+ fn $name() {
+ let expr = ParserBuilder::new()
+ .build()
+ .parse($re)
+ .unwrap();
+ let mut lits = Literals::empty();
+ lits.set_limit_size(20).set_limit_class(10);
+ $which(&mut lits, &expr);
+ assert_lit_eq!(Unicode, lits, $($lit),*);
+
+ let expr = ParserBuilder::new()
+ .allow_invalid_utf8(true)
+ .unicode(false)
+ .build()
+ .parse($re)
+ .unwrap();
+ let mut lits = Literals::empty();
+ lits.set_limit_size(20).set_limit_class(10);
+ $which(&mut lits, &expr);
+ assert_lit_eq!(Bytes, lits, $($lit),*);
+ }
+ };
+ }
+
+ // These tests use a much lower limit than the default so that we can
+ // write test cases of reasonable size.
+ test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
+ test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
+ test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
+ test_exhausted!(
+ pfx_exhausted4,
+ prefixes,
+ "(?i-u)foobar",
+ C("FO"),
+ C("fO"),
+ C("Fo"),
+ C("fo")
+ );
+ test_exhausted!(
+ pfx_exhausted5,
+ prefixes,
+ "(?:ab){100}",
+ C("abababababababababab")
+ );
+ test_exhausted!(
+ pfx_exhausted6,
+ prefixes,
+ "(?:(?:ab){100})*cd",
+ C("ababababab"),
+ M("cd")
+ );
+ test_exhausted!(
+ pfx_exhausted7,
+ prefixes,
+ "z(?:(?:ab){100})*cd",
+ C("zababababab"),
+ M("zcd")
+ );
+ test_exhausted!(
+ pfx_exhausted8,
+ prefixes,
+ "aaaaaaaaaaaaaaaaaaaaz",
+ C("aaaaaaaaaaaaaaaaaaaa")
+ );
+
+ // ************************************************************************
+ // Tests for suffix literal extraction.
+ // ************************************************************************
+
+ // Elementary tests.
+ test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
+ test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
+ test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
+ #[cfg(feature = "unicode-case")]
+ test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
+ test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
+ test_lit!(
+ sfx_class2,
+ suffixes,
+ "(?u)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x98\\x83")
+ );
+ #[cfg(feature = "unicode-case")]
+ test_lit!(
+ sfx_class3,
+ suffixes,
+ "(?ui)[☃Ⅰ]",
+ M("\\xe2\\x85\\xa0"),
+ M("\\xe2\\x85\\xb0"),
+ M("\\xe2\\x98\\x83")
+ );
+ test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
+ test_lit!(
+ sfx_one_lit_casei2,
+ suffixes,
+ "(?i-u)abc",
+ M("ABC"),
+ M("ABc"),
+ M("AbC"),
+ M("Abc"),
+ M("aBC"),
+ M("aBc"),
+ M("abC"),
+ M("abc")
+ );
+ test_lit!(sfx_group1, suffixes, "(a)", M("a"));
+ test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
+ test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
+ test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
+ test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
+ test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
+ test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
+ test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
+ test_lit!(sfx_rep_range1, suffixes, "a{0}");
+ test_lit!(sfx_rep_range2, suffixes, "a{0,}");
+ test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
+ test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
+ test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
+ test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
+ test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
+
+ // Test regexes with concatenations.
+ test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
+ test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
+ test_lit!(
+ sfx_cat3,
+ suffixes,
+ "(?i-u)[ab]z",
+ M("AZ"),
+ M("Az"),
+ M("BZ"),
+ M("Bz"),
+ M("aZ"),
+ M("az"),
+ M("bZ"),
+ M("bz")
+ );
+ test_lit!(
+ sfx_cat4,
+ suffixes,
+ "[ab][yz]",
+ M("ay"),
+ M("az"),
+ M("by"),
+ M("bz")
+ );
+ test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
+ test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
+ test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
+ test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
+ test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
+ test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
+ test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
+ test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
+ test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
+ test_lit!(sfx_cat14, suffixes, "a^");
+ test_lit!(sfx_cat15, suffixes, "$a", C("a"));
+ test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
+ test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
+ test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
+ test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
+
+ // Test regexes with alternations.
+ test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
+ test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
+ test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
+ test_lit!(sfx_alt4, suffixes, "a|b*");
+ test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
+ test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
+ test_lit!(
+ sfx_alt7,
+ suffixes,
+ "(a|b)*c|(a|ab)*c",
+ C("ac"),
+ C("bc"),
+ M("c"),
+ C("ac"),
+ C("abc"),
+ M("c")
+ );
+ test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
+
+ // Test regexes with empty assertions.
+ test_lit!(sfx_empty1, suffixes, "a$", M("a"));
+ test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
+
+ // Make sure some curious regexes have no suffixes.
+ test_lit!(sfx_nothing1, suffixes, ".");
+ test_lit!(sfx_nothing2, suffixes, "(?s).");
+ test_lit!(sfx_nothing3, suffixes, "^");
+ test_lit!(sfx_nothing4, suffixes, "$");
+ test_lit!(sfx_nothing6, suffixes, "(?m)$");
+ test_lit!(sfx_nothing7, suffixes, r"\b");
+ test_lit!(sfx_nothing8, suffixes, r"\B");
+
+ // Test a few regexes that defeat any suffix literal detection.
+ test_lit!(sfx_defeated1, suffixes, "a.");
+ test_lit!(sfx_defeated2, suffixes, "(?s)a.");
+ test_lit!(sfx_defeated3, suffixes, "a*b*c*");
+ test_lit!(sfx_defeated4, suffixes, "a|.");
+ test_lit!(sfx_defeated5, suffixes, ".|a");
+ test_lit!(sfx_defeated6, suffixes, "a|^");
+ test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
+ test_lit!(sfx_defeated8, suffixes, "a^");
+ test_lit!(sfx_defeated9, suffixes, "(?m)a$");
+ test_lit!(sfx_defeated10, suffixes, r"a\b");
+ test_lit!(sfx_defeated11, suffixes, r"a\B");
+ test_lit!(sfx_defeated12, suffixes, "a^*");
+ test_lit!(sfx_defeated13, suffixes, "a^+");
+
+ // These tests use a much lower limit than the default so that we can
+ // write test cases of reasonable size.
+ test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
+ test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
+ test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
+ test_exhausted!(
+ sfx_exhausted4,
+ suffixes,
+ "(?i-u)foobar",
+ C("AR"),
+ C("Ar"),
+ C("aR"),
+ C("ar")
+ );
+ test_exhausted!(
+ sfx_exhausted5,
+ suffixes,
+ "(?:ab){100}",
+ C("abababababababababab")
+ );
+ test_exhausted!(
+ sfx_exhausted6,
+ suffixes,
+ "cd(?:(?:ab){100})*",
+ C("ababababab"),
+ M("cd")
+ );
+ test_exhausted!(
+ sfx_exhausted7,
+ suffixes,
+ "cd(?:(?:ab){100})*z",
+ C("abababababz"),
+ M("cdz")
+ );
+ test_exhausted!(
+ sfx_exhausted8,
+ suffixes,
+ "zaaaaaaaaaaaaaaaaaaaa",
+ C("aaaaaaaaaaaaaaaaaaaa")
+ );
+
+ // ************************************************************************
+ // Tests for generating unambiguous literal sets.
+ // ************************************************************************
+
+ macro_rules! test_unamb {
+ ($name:ident, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let given: Vec<Literal> = $given
+ .into_iter()
+ .map(|ul| {
+ let cut = ul.is_cut();
+ Literal { v: ul.v.into_bytes(), cut: cut }
+ })
+ .collect();
+ let lits = create_lits(given);
+ let got = lits.unambiguous_prefixes();
+ assert_eq!($expected, escape_lits(got.literals()));
+ }
+ };
+ }
+
+ test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
+ test_unamb!(
+ unambiguous2,
+ vec![M("zaaaaaa"), M("aa")],
+ vec![C("aa"), C("z")]
+ );
+ test_unamb!(
+ unambiguous3,
+ vec![M("Sherlock"), M("Watson")],
+ vec![M("Sherlock"), M("Watson")]
+ );
+ test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
+ test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
+ test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
+ test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
+ test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
+ test_unamb!(
+ unambiguous9,
+ vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
+ vec![C("a"), C("b"), C("c")]
+ );
+ test_unamb!(
+ unambiguous10,
+ vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
+ vec![C("Mo"), C("Mu")]
+ );
+ test_unamb!(
+ unambiguous11,
+ vec![M("zazb"), M("azb")],
+ vec![C("a"), C("z")]
+ );
+ test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
+ test_unamb!(
+ unambiguous13,
+ vec![M("ABCX"), M("CDAX"), M("BCX")],
+ vec![C("A"), C("BCX"), C("CD")]
+ );
+ test_unamb!(
+ unambiguous14,
+ vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
+ vec![M("DSX"), C("I"), C("MGX"), C("MV")]
+ );
+ test_unamb!(
+ unambiguous15,
+ vec![M("IMG_"), M("MG_"), M("CIMG")],
+ vec![C("C"), C("I"), C("MG_")]
+ );
+
+ // ************************************************************************
+ // Tests for suffix trimming.
+ // ************************************************************************
+ macro_rules! test_trim {
+ ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let given: Vec<Literal> = $given
+ .into_iter()
+ .map(|ul| {
+ let cut = ul.is_cut();
+ Literal { v: ul.v.into_bytes(), cut: cut }
+ })
+ .collect();
+ let lits = create_lits(given);
+ let got = lits.trim_suffix($trim).unwrap();
+ assert_eq!($expected, escape_lits(got.literals()));
+ }
+ };
+ }
+
+ test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
+ test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
+ test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
+ test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
+
+ // ************************************************************************
+ // Tests for longest common prefix.
+ // ************************************************************************
+
+ // Generates a `#[test]` called `$name`: every string in `$given` becomes
+ // a literal that is not cut, and the escaped longest common prefix of the
+ // resulting set must equal `$expected`.
+ macro_rules! test_lcp {
+ ($name:ident, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let literals: Vec<Literal> = $given
+ .into_iter()
+ .map(|text: &str| Literal {
+ v: text.as_bytes().to_vec(),
+ cut: false,
+ })
+ .collect();
+ let set = create_lits(literals);
+ let prefix = set.longest_common_prefix();
+ assert_eq!($expected, escape_bytes(prefix));
+ }
+ };
+ }
+
+ test_lcp!(lcp1, vec!["a"], "a");
+ test_lcp!(lcp2, vec![], "");
+ test_lcp!(lcp3, vec!["a", "b"], "");
+ test_lcp!(lcp4, vec!["ab", "ab"], "ab");
+ test_lcp!(lcp5, vec!["ab", "a"], "a");
+ test_lcp!(lcp6, vec!["a", "ab"], "a");
+ test_lcp!(lcp7, vec!["ab", "b"], "");
+ test_lcp!(lcp8, vec!["b", "ab"], "");
+ test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
+ test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
+ test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
+ test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
+
+ // ************************************************************************
+ // Tests for longest common suffix.
+ // ************************************************************************
+
+ // Generates a `#[test]` called `$name`: every string in `$given` becomes
+ // a literal that is not cut, and the escaped longest common suffix of the
+ // resulting set must equal `$expected`.
+ macro_rules! test_lcs {
+ ($name:ident, $given:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let literals: Vec<Literal> = $given
+ .into_iter()
+ .map(|text: &str| Literal {
+ v: text.as_bytes().to_vec(),
+ cut: false,
+ })
+ .collect();
+ let set = create_lits(literals);
+ let suffix = set.longest_common_suffix();
+ assert_eq!($expected, escape_bytes(suffix));
+ }
+ };
+ }
+
+ test_lcs!(lcs1, vec!["a"], "a");
+ test_lcs!(lcs2, vec![], "");
+ test_lcs!(lcs3, vec!["a", "b"], "");
+ test_lcs!(lcs4, vec!["ab", "ab"], "ab");
+ test_lcs!(lcs5, vec!["ab", "a"], "");
+ test_lcs!(lcs6, vec!["a", "ab"], "");
+ test_lcs!(lcs7, vec!["ab", "b"], "b");
+ test_lcs!(lcs8, vec!["b", "ab"], "b");
+ test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
+ test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
+ test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
+ test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
+}
diff --git a/third_party/rust/regex-syntax/src/hir/mod.rs b/third_party/rust/regex-syntax/src/hir/mod.rs
new file mode 100644
index 0000000000..1096e9f05a
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/mod.rs
@@ -0,0 +1,2296 @@
+/*!
+Defines a high-level intermediate representation for regular expressions.
+*/
+use std::char;
+use std::cmp;
+use std::error;
+use std::fmt;
+use std::result;
+use std::u8;
+
+use crate::ast::Span;
+use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
+use crate::unicode;
+
+pub use crate::hir::visitor::{visit, Visitor};
+pub use crate::unicode::CaseFoldError;
+
+mod interval;
+pub mod literal;
+pub mod print;
+pub mod translate;
+mod visitor;
+
+/// An error that can occur while translating an `Ast` to a `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Error {
+ /// The kind of error.
+ kind: ErrorKind,
+ /// The original pattern that the translator's Ast was parsed from. Every
+ /// span in an error is a valid range into this string.
+ pattern: String,
+ /// The span of this error, derived from the Ast given to the translator.
+ span: Span,
+}
+
+impl Error {
+ /// Returns the kind of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ /// Returns the original pattern string in which this error occurred.
+ ///
+ /// Every span reported by this error is reported in terms of this string.
+ pub fn pattern(&self) -> &str {
+ &self.pattern
+ }
+
+ /// Returns the span at which this error occurred.
+ pub fn span(&self) -> &Span {
+ &self.span
+ }
+}
+
+/// The type of an error that occurred while building an `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ErrorKind {
+ /// This error occurs when a Unicode feature is used when Unicode
+ /// support is disabled. For example `(?-u:\pL)` would trigger this error.
+ UnicodeNotAllowed,
+ /// This error occurs when translating a pattern that could match a byte
+ /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
+ InvalidUtf8,
+ /// This occurs when an unrecognized Unicode property name could not
+ /// be found.
+ UnicodePropertyNotFound,
+ /// This occurs when an unrecognized Unicode property value could not
+ /// be found.
+ UnicodePropertyValueNotFound,
+ /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
+ /// `\d`) could not be found. This can occur when the `unicode-perl`
+ /// crate feature is not enabled.
+ UnicodePerlClassNotFound,
+ /// This occurs when the Unicode simple case mapping tables are not
+ /// available, and the regular expression required Unicode aware case
+ /// insensitivity.
+ UnicodeCaseUnavailable,
+ /// This occurs when the translator attempts to construct a character class
+ /// that is empty.
+ ///
+ /// Note that this restriction in the translator may be removed in the
+ /// future.
+ EmptyClassNotAllowed,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl ErrorKind {
+ // TODO: Remove this method entirely on the next breaking semver release.
+ #[allow(deprecated)]
+ fn description(&self) -> &str {
+ use self::ErrorKind::*;
+ match *self {
+ UnicodeNotAllowed => "Unicode not allowed here",
+ InvalidUtf8 => "pattern can match invalid UTF-8",
+ UnicodePropertyNotFound => "Unicode property not found",
+ UnicodePropertyValueNotFound => "Unicode property value not found",
+ UnicodePerlClassNotFound => {
+ "Unicode-aware Perl class not found \
+ (make sure the unicode-perl feature is enabled)"
+ }
+ UnicodeCaseUnavailable => {
+ "Unicode-aware case insensitivity matching is not available \
+ (make sure the unicode-case feature is enabled)"
+ }
+ EmptyClassNotAllowed => "empty character classes are not allowed",
+ __Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+impl error::Error for Error {
+ // TODO: Remove this method entirely on the next breaking semver release.
+ #[allow(deprecated)]
+ fn description(&self) -> &str {
+ self.kind.description()
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ crate::error::Formatter::from(self).fmt(f)
+ }
+}
+
+impl fmt::Display for ErrorKind {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ // TODO: Remove this on the next breaking semver release.
+ #[allow(deprecated)]
+ f.write_str(self.description())
+ }
+}
+
+/// A high-level intermediate representation (HIR) for a regular expression.
+///
+/// The HIR of a regular expression represents an intermediate step between its
+/// abstract syntax (a structured description of the concrete syntax) and
+/// compiled byte codes. The purpose of HIR is to make regular expressions
+/// easier to analyze. In particular, the AST is much more complex than the
+/// HIR. For example, while an AST supports arbitrarily nested character
+/// classes, the HIR will flatten all nested classes into a single set. The HIR
+/// will also "compile away" every flag present in the concrete syntax. For
+/// example, users of HIR expressions never need to worry about case folding;
+/// it is handled automatically by the translator (e.g., by translating `(?i)A`
+/// to `[aA]`).
+///
+/// If the HIR was produced by a translator that disallows invalid UTF-8, then
+/// the HIR is guaranteed to match UTF-8 exclusively.
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the HIR.
+///
+/// The specific type of an HIR expression can be accessed via its `kind`
+/// or `into_kind` methods. This extra level of indirection exists for two
+/// reasons:
+///
+/// 1. Construction of an HIR expression *must* use the constructor methods
+/// on this `Hir` type instead of building the `HirKind` values directly.
+/// This permits construction to enforce invariants like "concatenations
+/// always consist of two or more sub-expressions."
+/// 2. Every HIR expression contains attributes that are defined inductively,
+/// and can be computed cheaply during the construction process. For
+/// example, one such attribute is whether the expression must match at the
+/// beginning of the text.
+///
+/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
+/// expression pattern string, and uses constant stack space and heap space
+/// proportional to the size of the `Hir`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Hir {
+ /// The underlying HIR kind.
+ kind: HirKind,
+ /// Analysis info about this HIR, computed during construction.
+ info: HirInfo,
+}
+
+/// The kind of an arbitrary `Hir` expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HirKind {
+ /// The empty regular expression, which matches everything, including the
+ /// empty string.
+ Empty,
+ /// A single literal character that matches exactly this character.
+ Literal(Literal),
+ /// A single character class that matches any of the characters in the
+ /// class. A class can either consist of Unicode scalar values as
+ /// characters, or it can use bytes.
+ Class(Class),
+ /// An anchor assertion. An anchor assertion match always has zero length.
+ Anchor(Anchor),
+ /// A word boundary assertion, which may or may not be Unicode aware. A
+ /// word boundary assertion match always has zero length.
+ WordBoundary(WordBoundary),
+ /// A repetition operation applied to a child expression.
+ Repetition(Repetition),
+ /// A possibly capturing group, which contains a child expression.
+ Group(Group),
+ /// A concatenation of expressions. A concatenation always has at least two
+ /// child expressions.
+ ///
+ /// A concatenation matches only if each of its child expression matches
+ /// one after the other.
+ Concat(Vec<Hir>),
+ /// An alternation of expressions. An alternation always has at least two
+ /// child expressions.
+ ///
+ /// An alternation matches only if at least one of its child expression
+ /// matches. If multiple expressions match, then the leftmost is preferred.
+ Alternation(Vec<Hir>),
+}
+
+impl Hir {
+ /// Returns a reference to the underlying HIR kind.
+ pub fn kind(&self) -> &HirKind {
+ &self.kind
+ }
+
+ /// Takes this HIR expression by value and hands back the `HirKind` it
+ /// wraps.
+ ///
+ /// The kind is swapped out for `HirKind::Empty` rather than being moved
+ /// out of the field directly.
+ pub fn into_kind(mut self) -> HirKind {
+ std::mem::replace(&mut self.kind, HirKind::Empty)
+ }
+
+ /// Returns an empty HIR expression.
+ ///
+ /// An empty HIR expression always matches, including the empty string.
+ pub fn empty() -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(true);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Empty, info }
+ }
+
+ /// Creates a literal HIR expression.
+ ///
+ /// If the given literal has a `Byte` variant with an ASCII byte, then this
+ /// method panics. This enforces the invariant that `Byte` variants are
+ /// only used to express matching of invalid UTF-8.
+ pub fn literal(lit: Literal) -> Hir {
+ if let Literal::Byte(b) = lit {
+ assert!(b > 0x7F);
+ }
+
+ let mut info = HirInfo::new();
+ info.set_always_utf8(lit.is_unicode());
+ info.set_all_assertions(false);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(false);
+ info.set_literal(true);
+ info.set_alternation_literal(true);
+ Hir { kind: HirKind::Literal(lit), info }
+ }
+
+ /// Creates a class HIR expression.
+ pub fn class(class: Class) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(class.is_always_utf8());
+ info.set_all_assertions(false);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(false);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Class(class), info }
+ }
+
+ /// Creates an anchor assertion HIR expression.
+ ///
+ /// Every anchoring attribute starts out as `false`; the ones implied by
+ /// the given anchor are then switched on. A text anchor sets the text,
+ /// line and "any" attributes for its side, while a line anchor sets only
+ /// the line attribute for its side.
+ pub fn anchor(anchor: Anchor) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(true);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ // The anchor kinds are mutually exclusive, so at most one branch runs.
+ if let Anchor::StartText = anchor {
+ info.set_anchored_start(true);
+ info.set_line_anchored_start(true);
+ info.set_any_anchored_start(true);
+ } else if let Anchor::EndText = anchor {
+ info.set_anchored_end(true);
+ info.set_line_anchored_end(true);
+ info.set_any_anchored_end(true);
+ } else if let Anchor::StartLine = anchor {
+ info.set_line_anchored_start(true);
+ } else if let Anchor::EndLine = anchor {
+ info.set_line_anchored_end(true);
+ }
+ Hir { kind: HirKind::Anchor(anchor), info }
+ }
+
+ /// Creates a word boundary assertion HIR expression.
+ pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(false);
+ info.set_anchored_end(false);
+ info.set_line_anchored_start(false);
+ info.set_line_anchored_end(false);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ // A negated word boundary matches '', so that's fine. But \b does not
+ // match '', so why do we say it can match the empty string? Well,
+ // because, if you search for \b against 'a', it will report [0, 0) and
+ // [1, 1) as matches, and both of those matches correspond to the empty
+ // string. Thus, only *certain* empty strings match \b, which similarly
+ // applies to \B.
+ info.set_match_empty(true);
+ // Negated ASCII word boundaries can match invalid UTF-8.
+ if let WordBoundary::AsciiNegate = word_boundary {
+ info.set_always_utf8(false);
+ }
+ Hir { kind: HirKind::WordBoundary(word_boundary), info }
+ }
+
+ /// Creates a repetition HIR expression.
+ ///
+ /// The attributes of the result are derived from `rep` itself and from
+ /// the sub-expression being repeated (`rep.hir`).
+ pub fn repetition(rep: Repetition) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(rep.hir.is_always_utf8());
+ info.set_all_assertions(rep.hir.is_all_assertions());
+ // If this operator can match the empty string, then it can never
+ // be anchored.
+ info.set_anchored_start(
+ !rep.is_match_empty() && rep.hir.is_anchored_start(),
+ );
+ info.set_anchored_end(
+ !rep.is_match_empty() && rep.hir.is_anchored_end(),
+ );
+ // Line anchoring is inherited from the sub-expression's *line*
+ // anchoring attributes, exactly as `Hir::group` does. Consulting the
+ // text anchoring attributes here instead would report a repeated line
+ // anchor such as `(?m:^)+` as not being line anchored.
+ info.set_line_anchored_start(
+ !rep.is_match_empty() && rep.hir.is_line_anchored_start(),
+ );
+ info.set_line_anchored_end(
+ !rep.is_match_empty() && rep.hir.is_line_anchored_end(),
+ );
+ info.set_any_anchored_start(rep.hir.is_any_anchored_start());
+ info.set_any_anchored_end(rep.hir.is_any_anchored_end());
+ info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Repetition(rep), info }
+ }
+
+ /// Creates a group HIR expression.
+ pub fn group(group: Group) -> Hir {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(group.hir.is_always_utf8());
+ info.set_all_assertions(group.hir.is_all_assertions());
+ info.set_anchored_start(group.hir.is_anchored_start());
+ info.set_anchored_end(group.hir.is_anchored_end());
+ info.set_line_anchored_start(group.hir.is_line_anchored_start());
+ info.set_line_anchored_end(group.hir.is_line_anchored_end());
+ info.set_any_anchored_start(group.hir.is_any_anchored_start());
+ info.set_any_anchored_end(group.hir.is_any_anchored_end());
+ info.set_match_empty(group.hir.is_match_empty());
+ info.set_literal(false);
+ info.set_alternation_literal(false);
+ Hir { kind: HirKind::Group(group), info }
+ }
+
+ /// Returns the concatenation of the given expressions.
+ ///
+ /// Degenerate inputs are collapsed: an empty list yields `Hir::empty()`
+ /// and a single expression is returned as is.
+ pub fn concat(mut exprs: Vec<Hir>) -> Hir {
+ match exprs.len() {
+ 0 => Hir::empty(),
+ 1 => exprs.pop().unwrap(),
+ _ => {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(true);
+ info.set_literal(true);
+ info.set_alternation_literal(true);
+
+ // Some attributes require analyzing all sub-expressions.
+ for e in &exprs {
+ let x = info.is_always_utf8() && e.is_always_utf8();
+ info.set_always_utf8(x);
+
+ let x = info.is_all_assertions() && e.is_all_assertions();
+ info.set_all_assertions(x);
+
+ let x = info.is_any_anchored_start()
+ || e.is_any_anchored_start();
+ info.set_any_anchored_start(x);
+
+ let x =
+ info.is_any_anchored_end() || e.is_any_anchored_end();
+ info.set_any_anchored_end(x);
+
+ let x = info.is_match_empty() && e.is_match_empty();
+ info.set_match_empty(x);
+
+ let x = info.is_literal() && e.is_literal();
+ info.set_literal(x);
+
+ let x = info.is_alternation_literal()
+ && e.is_alternation_literal();
+ info.set_alternation_literal(x);
+ }
+ // Anchored attributes require something slightly more
+ // sophisticated. Normally, WLOG, to determine whether an
+ // expression is anchored to the start, we'd only need to check
+ // the first expression of a concatenation. However,
+ // expressions like `$\b^` are still anchored to the start,
+ // but the first expression in the concatenation *isn't*
+ // anchored to the start. So the "first" expression to look at
+ // is actually one that is either not an assertion or is
+ // specifically the StartText assertion.
+ info.set_anchored_start(
+ exprs
+ .iter()
+ .take_while(|e| {
+ e.is_anchored_start() || e.is_all_assertions()
+ })
+ .any(|e| e.is_anchored_start()),
+ );
+ // Similarly for the end anchor, but in reverse.
+ info.set_anchored_end(
+ exprs
+ .iter()
+ .rev()
+ .take_while(|e| {
+ e.is_anchored_end() || e.is_all_assertions()
+ })
+ .any(|e| e.is_anchored_end()),
+ );
+ // Repeat the process for line anchors.
+ info.set_line_anchored_start(
+ exprs
+ .iter()
+ .take_while(|e| {
+ e.is_line_anchored_start() || e.is_all_assertions()
+ })
+ .any(|e| e.is_line_anchored_start()),
+ );
+ info.set_line_anchored_end(
+ exprs
+ .iter()
+ .rev()
+ .take_while(|e| {
+ e.is_line_anchored_end() || e.is_all_assertions()
+ })
+ .any(|e| e.is_line_anchored_end()),
+ );
+ Hir { kind: HirKind::Concat(exprs), info }
+ }
+ }
+ }
+
+ /// Returns the alternation of the given expressions.
+ ///
+ /// Degenerate inputs are collapsed: an empty list yields `Hir::empty()`
+ /// and a single expression is returned as is.
+ pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
+ match exprs.len() {
+ 0 => Hir::empty(),
+ 1 => exprs.pop().unwrap(),
+ _ => {
+ let mut info = HirInfo::new();
+ info.set_always_utf8(true);
+ info.set_all_assertions(true);
+ info.set_anchored_start(true);
+ info.set_anchored_end(true);
+ info.set_line_anchored_start(true);
+ info.set_line_anchored_end(true);
+ info.set_any_anchored_start(false);
+ info.set_any_anchored_end(false);
+ info.set_match_empty(false);
+ info.set_literal(false);
+ info.set_alternation_literal(true);
+
+ // Some attributes require analyzing all sub-expressions.
+ for e in &exprs {
+ let x = info.is_always_utf8() && e.is_always_utf8();
+ info.set_always_utf8(x);
+
+ let x = info.is_all_assertions() && e.is_all_assertions();
+ info.set_all_assertions(x);
+
+ let x = info.is_anchored_start() && e.is_anchored_start();
+ info.set_anchored_start(x);
+
+ let x = info.is_anchored_end() && e.is_anchored_end();
+ info.set_anchored_end(x);
+
+ let x = info.is_line_anchored_start()
+ && e.is_line_anchored_start();
+ info.set_line_anchored_start(x);
+
+ let x = info.is_line_anchored_end()
+ && e.is_line_anchored_end();
+ info.set_line_anchored_end(x);
+
+ let x = info.is_any_anchored_start()
+ || e.is_any_anchored_start();
+ info.set_any_anchored_start(x);
+
+ let x =
+ info.is_any_anchored_end() || e.is_any_anchored_end();
+ info.set_any_anchored_end(x);
+
+ let x = info.is_match_empty() || e.is_match_empty();
+ info.set_match_empty(x);
+
+ let x = info.is_alternation_literal() && e.is_literal();
+ info.set_alternation_literal(x);
+ }
+ Hir { kind: HirKind::Alternation(exprs), info }
+ }
+ }
+ }
+
+    /// Build the HIR expression corresponding to `.`.
+    ///
+    /// The resulting class matches every character other than `\n`. Use the
+    /// `any` method instead to obtain a class that also matches `\n`.
+    ///
+    /// When `bytes` is `true`, a byte class is produced, i.e., each character
+    /// is assumed to occupy a single byte. Otherwise a Unicode class is
+    /// produced.
+    pub fn dot(bytes: bool) -> Hir {
+        // "Everything but line feed" is written as the two ranges on either
+        // side of `\x0A`.
+        let class = if bytes {
+            let mut set = ClassBytes::empty();
+            set.push(ClassBytesRange::new(b'\0', b'\x09'));
+            set.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
+            Class::Bytes(set)
+        } else {
+            let mut set = ClassUnicode::empty();
+            set.push(ClassUnicodeRange::new('\0', '\x09'));
+            set.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
+            Class::Unicode(set)
+        };
+        Hir::class(class)
+    }
+
+    /// Build the HIR expression corresponding to `(?s).`.
+    ///
+    /// The resulting class matches every character, `\n` included. Use the
+    /// `dot` method instead to obtain a class that excludes `\n`.
+    ///
+    /// When `bytes` is `true`, a byte class is produced, i.e., each character
+    /// is assumed to occupy a single byte. Otherwise a Unicode class is
+    /// produced.
+    pub fn any(bytes: bool) -> Hir {
+        let class = if bytes {
+            let mut set = ClassBytes::empty();
+            set.push(ClassBytesRange::new(b'\0', b'\xFF'));
+            Class::Bytes(set)
+        } else {
+            let mut set = ClassUnicode::empty();
+            set.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
+            Class::Unicode(set)
+        };
+        Hir::class(class)
+    }
+
+ /// Return true if and only if this HIR will always match valid UTF-8.
+ ///
+ /// When this returns false, then it is possible for this HIR expression
+ /// to match invalid UTF-8.
+ ///
+ /// The answer is read from the attribute flags stored with this expression;
+ /// the expression tree is not traversed by this call.
+ pub fn is_always_utf8(&self) -> bool {
+ self.info.is_always_utf8()
+ }
+
+ /// Returns true if and only if this entire HIR expression is made up of
+ /// zero-width assertions.
+ ///
+ /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
+ /// not `^a`.
+ ///
+ /// The answer is read from the attribute flags stored with this expression.
+ pub fn is_all_assertions(&self) -> bool {
+ self.info.is_all_assertions()
+ }
+
+ /// Return true if and only if this HIR is required to match from the
+ /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
+ /// `^foo|^bar` but not `^foo|bar`.
+ ///
+ /// Contrast this with `is_any_anchored_start`, which only asks whether a
+ /// start-of-text anchor appears somewhere inside the expression.
+ pub fn is_anchored_start(&self) -> bool {
+ self.info.is_anchored_start()
+ }
+
+ /// Return true if and only if this HIR is required to match at the end
+ /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
+ /// `foo$|bar$` but not `foo$|bar`.
+ ///
+ /// Contrast this with `is_any_anchored_end`, which only asks whether an
+ /// end-of-text anchor appears somewhere inside the expression.
+ pub fn is_anchored_end(&self) -> bool {
+ self.info.is_anchored_end()
+ }
+
+ /// Return true if and only if this HIR is required to match from the
+ /// beginning of text or the beginning of a line. This includes expressions
+ /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `(?m)^foo|^bar`
+ /// but not `^foo|bar` or `(?m)^foo|bar`.
+ ///
+ /// Note that if `is_anchored_start` is `true`, then
+ /// `is_line_anchored_start` will also be `true`. The reverse implication
+ /// is not true. For example, `(?m)^foo` is line anchored, but not
+ /// `is_anchored_start`.
+ pub fn is_line_anchored_start(&self) -> bool {
+ self.info.is_line_anchored_start()
+ }
+
+ /// Return true if and only if this HIR is required to match at the
+ /// end of text or the end of a line. This includes expressions like
+ /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
+ /// but not `foo$|bar` or `(?m)foo$|bar`.
+ ///
+ /// Note that if `is_anchored_end` is `true`, then
+ /// `is_line_anchored_end` will also be `true`. The reverse implication
+ /// is not true. For example, `(?m)foo$` is line anchored, but not
+ /// `is_anchored_end`.
+ pub fn is_line_anchored_end(&self) -> bool {
+ self.info.is_line_anchored_end()
+ }
+
+ /// Return true if and only if this HIR contains any sub-expression that
+ /// is required to match at the beginning of text. Specifically, this
+ /// returns true if the `^` symbol (when multiline mode is disabled) or the
+ /// `\A` escape appear anywhere in the regex.
+ ///
+ /// Unlike `is_anchored_start`, a true result here does not mean that the
+ /// expression as a whole must match at the beginning of text.
+ pub fn is_any_anchored_start(&self) -> bool {
+ self.info.is_any_anchored_start()
+ }
+
+ /// Return true if and only if this HIR contains any sub-expression that is
+ /// required to match at the end of text. Specifically, this returns true
+ /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
+ /// appear anywhere in the regex.
+ ///
+ /// Unlike `is_anchored_end`, a true result here does not mean that the
+ /// expression as a whole must match at the end of text.
+ pub fn is_any_anchored_end(&self) -> bool {
+ self.info.is_any_anchored_end()
+ }
+
+ /// Return true if and only if the empty string is part of the language
+ /// matched by this regular expression.
+ ///
+ /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
+ /// and `\B`, but not `a` or `a+`.
+ ///
+ /// This is the inductive definition; `Repetition::is_match_empty` only
+ /// inspects the repetition operator itself.
+ pub fn is_match_empty(&self) -> bool {
+ self.info.is_match_empty()
+ }
+
+ /// Return true if and only if this HIR is a simple literal. This is only
+ /// true when this HIR expression is either itself a `Literal` or a
+ /// concatenation of only `Literal`s.
+ ///
+ /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
+ /// the empty pattern are not (even though some of them contain
+ /// sub-expressions that are literals).
+ pub fn is_literal(&self) -> bool {
+ self.info.is_literal()
+ }
+
+ /// Return true if and only if this HIR is either a simple literal or an
+ /// alternation of simple literals. This is only
+ /// true when this HIR expression is either itself a `Literal` or a
+ /// concatenation of only `Literal`s or an alternation of only `Literal`s.
+ ///
+ /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
+ /// literals, but `f+`, `(foo)`, `foo()` and the empty pattern
+ /// are not (even though some of them contain sub-expressions that are
+ /// literals).
+ pub fn is_alternation_literal(&self) -> bool {
+ self.info.is_alternation_literal()
+ }
+}
+
+impl HirKind {
+    /// Reports whether this kind is exactly the `Empty` variant.
+    ///
+    /// This is a shallow test and is not defined inductively: a kind that
+    /// merely *matches* the empty string is not "empty" in this sense. For
+    /// the inductive notion, use the `is_match_empty` method on
+    /// [`Hir`](struct.Hir.html).
+    pub fn is_empty(&self) -> bool {
+        if let HirKind::Empty = *self {
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Reports whether this kind of expression carries sub-expressions.
+    ///
+    /// The answer depends only on the variant, so it is `true` even when the
+    /// contained sub-expressions happen to be empty.
+    pub fn has_subexprs(&self) -> bool {
+        match self {
+            // Variants that wrap one or more nested expressions.
+            HirKind::Group(_)
+            | HirKind::Repetition(_)
+            | HirKind::Concat(_)
+            | HirKind::Alternation(_) => true,
+            // Leaf variants.
+            HirKind::Empty
+            | HirKind::Literal(_)
+            | HirKind::Class(_)
+            | HirKind::Anchor(_)
+            | HirKind::WordBoundary(_) => false,
+        }
+    }
+}
+
+/// Formats this `Hir` as a regular expression pattern.
+///
+/// The text produced is itself a valid regular expression pattern string.
+///
+/// Printing is delegated to `hir::print::Printer`; this implementation uses
+/// constant stack space and heap space proportional to the size of the `Hir`.
+impl fmt::Display for Hir {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        crate::hir::print::Printer::new().print(self, f)
+    }
+}
+
+/// The high-level intermediate representation of a literal.
+///
+/// A literal corresponds to a single character, where a character is either
+/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
+/// are preferred whenever possible. In particular, a `Byte` variant is only
+/// ever produced when it could match invalid UTF-8.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Literal {
+ /// A single character represented by a Unicode scalar value.
+ Unicode(char),
+ /// A single character represented by an arbitrary byte.
+ ///
+ /// Note that `is_unicode` still reports `true` for a `Byte` whose value
+ /// is at most `0x7F`.
+ Byte(u8),
+}
+
+impl Literal {
+    /// Reports whether this literal denotes a Unicode scalar value.
+    ///
+    /// A `Unicode` literal always qualifies. A `Byte` literal qualifies only
+    /// when the byte is in the ASCII range (`0x00` through `0x7F`).
+    pub fn is_unicode(&self) -> bool {
+        match *self {
+            Literal::Unicode(_) => true,
+            Literal::Byte(byte) => byte.is_ascii(),
+        }
+    }
+}
+
+/// The high-level intermediate representation of a character class.
+///
+/// A character class corresponds to a set of characters. A character is either
+/// defined by a Unicode scalar value or a byte. Unicode characters are used
+/// by default, while bytes are used when Unicode mode (via the `u` flag) is
+/// disabled.
+///
+/// A character class, regardless of its character type, is represented by a
+/// sequence of non-overlapping non-adjacent ranges of characters.
+///
+/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
+/// be produced even when it exclusively matches valid UTF-8. This is because
+/// a `Bytes` variant represents an intention by the author of the regular
+/// expression to disable Unicode mode, which in turn impacts the semantics of
+/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
+/// match the same set of strings.
+///
+/// Use `is_always_utf8` to find out whether a particular class can only match
+/// valid UTF-8.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Class {
+ /// A set of characters represented by Unicode scalar values.
+ Unicode(ClassUnicode),
+ /// A set of characters represented by arbitrary bytes (one byte per
+ /// character).
+ Bytes(ClassBytes),
+}
+
+impl Class {
+    /// Expands this class, in place, so that it also contains every simple
+    /// case folded variant of the characters already in it.
+    ///
+    /// For a Unicode class this uses Unicode simple case folding. For a byte
+    /// oriented class, folding is restricted to the ASCII ranges `A-Z` and
+    /// `a-z`.
+    pub fn case_fold_simple(&mut self) {
+        match self {
+            Class::Unicode(cls) => cls.case_fold_simple(),
+            Class::Bytes(cls) => cls.case_fold_simple(),
+        }
+    }
+
+    /// Replaces this class, in place, with its complement.
+    ///
+    /// Once this returns, the class contains exactly those characters that
+    /// it did not contain beforehand.
+    pub fn negate(&mut self) {
+        match self {
+            Class::Unicode(cls) => cls.negate(),
+            Class::Bytes(cls) => cls.negate(),
+        }
+    }
+
+    /// Reports whether this class can only ever match valid UTF-8.
+    ///
+    /// A Unicode class always can. A byte class can if and only if all of
+    /// its bytes are ASCII.
+    ///
+    /// A character class can match invalid UTF-8 only when the following
+    /// conditions are met:
+    ///
+    /// 1. The translator was configured to permit generating an expression
+    ///    that can match invalid UTF-8. (By default, this is disabled.)
+    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
+    ///    syntax or in the parser builder. By default, Unicode mode is
+    ///    enabled.
+    pub fn is_always_utf8(&self) -> bool {
+        match self {
+            Class::Unicode(_) => true,
+            Class::Bytes(cls) => cls.is_all_ascii(),
+        }
+    }
+}
+
+/// A set of characters represented by Unicode scalar values.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassUnicode {
+ // The ranges making up this class; all set operations are forwarded to it.
+ set: IntervalSet<ClassUnicodeRange>,
+}
+
+impl ClassUnicode {
+    /// Builds a class out of the given ranges.
+    ///
+    /// Callers may supply the ranges in any order, and the ranges are
+    /// permitted to overlap one another.
+    pub fn new<I>(ranges: I) -> ClassUnicode
+    where
+        I: IntoIterator<Item = ClassUnicodeRange>,
+    {
+        let set = IntervalSet::new(ranges);
+        ClassUnicode { set }
+    }
+
+    /// Builds a class that contains no ranges.
+    pub fn empty() -> ClassUnicode {
+        ClassUnicode::new(Vec::new())
+    }
+
+    /// Inserts one more range into this class.
+    pub fn push(&mut self, range: ClassUnicodeRange) {
+        self.set.push(range);
+    }
+
+    /// Returns an iterator that visits every range of this class.
+    ///
+    /// Ranges are yielded in ascending order.
+    pub fn iter(&self) -> ClassUnicodeIter<'_> {
+        let inner = self.set.iter();
+        ClassUnicodeIter(inner)
+    }
+
+    /// Borrows the ranges of this class as a slice.
+    pub fn ranges(&self) -> &[ClassUnicodeRange] {
+        self.set.intervals()
+    }
+
+    /// Adds to this class every character reachable through Unicode's
+    /// "simple" case folding. For instance, a class made up of the range
+    /// `a-z` will contain both `a-z` and `A-Z` afterwards.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the case mapping data this routine needs is unavailable,
+    /// which is the case when the `unicode-case` feature is not enabled.
+    ///
+    /// Prefer `try_case_fold_simple`, which reports that situation as an
+    /// error instead of panicking.
+    pub fn case_fold_simple(&mut self) {
+        self.try_case_fold_simple()
+            .expect("unicode-case feature must be enabled");
+    }
+
+    /// Adds to this class every character reachable through Unicode's
+    /// "simple" case folding. For instance, a class made up of the range
+    /// `a-z` will contain both `a-z` and `A-Z` afterwards.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the case mapping data this routine needs is
+    /// unavailable, which is the case when the `unicode-case` feature is not
+    /// enabled.
+    pub fn try_case_fold_simple(
+        &mut self,
+    ) -> result::Result<(), CaseFoldError> {
+        self.set.case_fold_simple()
+    }
+
+    /// Replaces this class with its complement.
+    ///
+    /// Every Unicode scalar value that was a member before the call is not a
+    /// member afterwards.
+    pub fn negate(&mut self) {
+        self.set.negate();
+    }
+
+    /// Replaces this class with its union with `other`.
+    pub fn union(&mut self, other: &ClassUnicode) {
+        self.set.union(&other.set);
+    }
+
+    /// Replaces this class with its intersection with `other`.
+    pub fn intersect(&mut self, other: &ClassUnicode) {
+        self.set.intersect(&other.set);
+    }
+
+    /// Removes from this class everything that is contained in `other`.
+    pub fn difference(&mut self, other: &ClassUnicode) {
+        self.set.difference(&other.set);
+    }
+
+    /// Replaces this class with its symmetric difference with `other`.
+    ///
+    /// Elements of this class that also occur in `other` are removed, while
+    /// elements of `other` that are missing from this class are also added.
+    /// Put differently, the result holds every element found in either
+    /// class, except for those found in both.
+    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
+        self.set.symmetric_difference(&other.set);
+    }
+
+    /// Reports whether this class matches nothing at all or ASCII only.
+    /// Equivalently, the result is `false` if and only if the class contains
+    /// a non-ASCII codepoint.
+    pub fn is_all_ascii(&self) -> bool {
+        // Only the upper bound of the final range is inspected.
+        match self.set.intervals().last() {
+            None => true,
+            Some(last) => last.end <= '\x7F',
+        }
+    }
+}
+
+/// An iterator over all ranges in a Unicode character class.
+///
+/// The lifetime `'a` refers to the lifetime of the underlying class.
+///
+/// This is a thin wrapper around the iterator of the underlying interval set.
+#[derive(Debug)]
+pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
+
+impl<'a> Iterator for ClassUnicodeIter<'a> {
+    type Item = &'a ClassUnicodeRange;
+
+    /// Advances the wrapped interval set iterator by one range.
+    fn next(&mut self) -> Option<Self::Item> {
+        let ClassUnicodeIter(ref mut inner) = *self;
+        inner.next()
+    }
+}
+
+/// A single range of characters represented by Unicode scalar values.
+///
+/// The range is closed. That is, the start and end of the range are included
+/// in the range.
+#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct ClassUnicodeRange {
+ // Inclusive lower bound of the range.
+ start: char,
+ // Inclusive upper bound of the range.
+ end: char,
+}
+
+impl fmt::Debug for ClassUnicodeRange {
+    /// Formats both bounds of the range. Whitespace and control characters
+    /// are rendered as an upper-case hexadecimal codepoint (e.g. `0xA`); any
+    /// other character is rendered as itself.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let render = |c: char| -> String {
+            if c.is_whitespace() || c.is_control() {
+                format!("0x{:X}", c as u32)
+            } else {
+                c.to_string()
+            }
+        };
+        let (start, end) = (render(self.start), render(self.end));
+        f.debug_struct("ClassUnicodeRange")
+            .field("start", &start)
+            .field("end", &end)
+            .finish()
+    }
+}
+
+impl Interval for ClassUnicodeRange {
+ type Bound = char;
+
+ // Returns the inclusive lower bound of this range.
+ #[inline]
+ fn lower(&self) -> char {
+ self.start
+ }
+ // Returns the inclusive upper bound of this range.
+ #[inline]
+ fn upper(&self) -> char {
+ self.end
+ }
+ // Overwrites the lower bound; no ordering check is performed here.
+ #[inline]
+ fn set_lower(&mut self, bound: char) {
+ self.start = bound;
+ }
+ // Overwrites the upper bound; no ordering check is performed here.
+ #[inline]
+ fn set_upper(&mut self, bound: char) {
+ self.end = bound;
+ }
+
+ /// Apply simple case folding to this Unicode scalar value range.
+ ///
+ /// Additional ranges are appended to the given vector. Canonical ordering
+ /// is *not* maintained in the given vector.
+ ///
+ /// Each folded character is appended as its own single-character range.
+ /// Errors from the `unicode` helpers are propagated to the caller.
+ fn case_fold_simple(
+ &self,
+ ranges: &mut Vec<ClassUnicodeRange>,
+ ) -> Result<(), unicode::CaseFoldError> {
+ // Fast path: nothing to do when the helper reports that this range
+ // contains no simple case mapping.
+ if !unicode::contains_simple_case_mapping(self.start, self.end)? {
+ return Ok(());
+ }
+ // Iterate over the half-open u32 range `start..end + 1`; values that
+ // are not valid `char`s are dropped by `char::from_u32` below.
+ let start = self.start as u32;
+ let end = (self.end as u32).saturating_add(1);
+ // NOTE(review): presumably the next codepoint that has a simple case
+ // mapping, as reported by `simple_fold` -- confirm against `unicode`.
+ let mut next_simple_cp = None;
+ for cp in (start..end).filter_map(char::from_u32) {
+ // Skip codepoints below the most recently reported `next` value.
+ if next_simple_cp.map_or(false, |next| cp < next) {
+ continue;
+ }
+ let it = match unicode::simple_fold(cp)? {
+ Ok(it) => it,
+ Err(next) => {
+ // No folding iterator for `cp`; remember the hint and move on.
+ next_simple_cp = next;
+ continue;
+ }
+ };
+ for cp_folded in it {
+ ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
+ }
+ }
+ Ok(())
+ }
+}
+
+impl ClassUnicodeRange {
+ /// Create a new Unicode scalar value range for a character class.
+ ///
+ /// The returned range is always in a canonical form. That is, the range
+ /// returned always satisfies the invariant that `start <= end`.
+ ///
+ /// Construction is delegated to `ClassUnicodeRange::create`.
+ pub fn new(start: char, end: char) -> ClassUnicodeRange {
+ ClassUnicodeRange::create(start, end)
+ }
+
+ /// Return the start of this range.
+ ///
+ /// The start of a range is always less than or equal to the end of the
+ /// range. The start is included in the range.
+ pub fn start(&self) -> char {
+ self.start
+ }
+
+ /// Return the end of this range.
+ ///
+ /// The end of a range is always greater than or equal to the start of the
+ /// range. The end is included in the range.
+ pub fn end(&self) -> char {
+ self.end
+ }
+}
+
+/// A set of characters represented by arbitrary bytes (where one byte
+/// corresponds to one character).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ClassBytes {
+ // The ranges making up this class; all set operations are forwarded to it.
+ set: IntervalSet<ClassBytesRange>,
+}
+
+impl ClassBytes {
+    /// Builds a class out of the given ranges.
+    ///
+    /// Callers may supply the ranges in any order, and the ranges are
+    /// permitted to overlap one another.
+    pub fn new<I>(ranges: I) -> ClassBytes
+    where
+        I: IntoIterator<Item = ClassBytesRange>,
+    {
+        let set = IntervalSet::new(ranges);
+        ClassBytes { set }
+    }
+
+    /// Builds a class that contains no ranges.
+    pub fn empty() -> ClassBytes {
+        ClassBytes::new(Vec::new())
+    }
+
+    /// Inserts one more range into this class.
+    pub fn push(&mut self, range: ClassBytesRange) {
+        self.set.push(range);
+    }
+
+    /// Returns an iterator that visits every range of this class.
+    ///
+    /// Ranges are yielded in ascending order.
+    pub fn iter(&self) -> ClassBytesIter<'_> {
+        let inner = self.set.iter();
+        ClassBytesIter(inner)
+    }
+
+    /// Borrows the ranges of this class as a slice.
+    pub fn ranges(&self) -> &[ClassBytesRange] {
+        self.set.intervals()
+    }
+
+    /// Adds to this class every case folded variant of its members. For
+    /// instance, a class made up of the range `a-z` will contain both `a-z`
+    /// and `A-Z` afterwards.
+    ///
+    /// Only ASCII case folding is applied, so only the characters `a-z` and
+    /// `A-Z` are affected.
+    pub fn case_fold_simple(&mut self) {
+        let folded = self.set.case_fold_simple();
+        folded.expect("ASCII case folding never fails");
+    }
+
+    /// Replaces this byte class with its complement.
+    ///
+    /// Every byte `b` that was a member before the call is not a member
+    /// afterwards.
+    pub fn negate(&mut self) {
+        self.set.negate();
+    }
+
+    /// Replaces this byte class with its union with `other`.
+    pub fn union(&mut self, other: &ClassBytes) {
+        self.set.union(&other.set);
+    }
+
+    /// Replaces this byte class with its intersection with `other`.
+    pub fn intersect(&mut self, other: &ClassBytes) {
+        self.set.intersect(&other.set);
+    }
+
+    /// Removes from this byte class everything that is contained in `other`.
+    pub fn difference(&mut self, other: &ClassBytes) {
+        self.set.difference(&other.set);
+    }
+
+    /// Replaces this byte class with its symmetric difference with `other`.
+    ///
+    /// Elements of this class that also occur in `other` are removed, while
+    /// elements of `other` that are missing from this class are also added.
+    /// Put differently, the result holds every element found in either
+    /// class, except for those found in both.
+    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
+        self.set.symmetric_difference(&other.set);
+    }
+
+    /// Reports whether this class matches nothing at all or ASCII only.
+    /// Equivalently, the result is `false` if and only if the class contains
+    /// a non-ASCII byte.
+    pub fn is_all_ascii(&self) -> bool {
+        // Only the upper bound of the final range is inspected.
+        match self.set.intervals().last() {
+            None => true,
+            Some(last) => last.end <= 0x7F,
+        }
+    }
+}
+
+/// An iterator over all ranges in a byte character class.
+///
+/// The lifetime `'a` refers to the lifetime of the underlying class.
+///
+/// This is a thin wrapper around the iterator of the underlying interval set.
+#[derive(Debug)]
+pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
+
+impl<'a> Iterator for ClassBytesIter<'a> {
+    type Item = &'a ClassBytesRange;
+
+    /// Advances the wrapped interval set iterator by one range.
+    fn next(&mut self) -> Option<Self::Item> {
+        let ClassBytesIter(ref mut inner) = *self;
+        inner.next()
+    }
+}
+
+/// A single range of characters represented by arbitrary bytes.
+///
+/// The range is closed. That is, the start and end of the range are included
+/// in the range.
+#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct ClassBytesRange {
+ // Inclusive lower bound of the range.
+ start: u8,
+ // Inclusive upper bound of the range.
+ end: u8,
+}
+
+impl Interval for ClassBytesRange {
+    type Bound = u8;
+
+    #[inline]
+    fn lower(&self) -> u8 {
+        self.start
+    }
+    #[inline]
+    fn upper(&self) -> u8 {
+        self.end
+    }
+    #[inline]
+    fn set_lower(&mut self, bound: u8) {
+        self.start = bound;
+    }
+    #[inline]
+    fn set_upper(&mut self, bound: u8) {
+        self.end = bound;
+    }
+
+    /// Apply simple case folding to this byte range. Only the ASCII case
+    /// mappings (`a-z` to `A-Z` and `A-Z` to `a-z`) are applied.
+    ///
+    /// The folded counterparts are appended to the given vector, lower case
+    /// overlap first. Canonical ordering is *not* maintained in the given
+    /// vector. This never returns an error.
+    fn case_fold_simple(
+        &self,
+        ranges: &mut Vec<ClassBytesRange>,
+    ) -> Result<(), unicode::CaseFoldError> {
+        // ASCII upper and lower case letters differ only in this bit, so
+        // toggling it maps `a-z` onto `A-Z` and vice versa.
+        const CASE_BIT: u8 = 0x20;
+
+        for &(first, last) in &[(b'a', b'z'), (b'A', b'Z')] {
+            if ClassBytesRange::new(first, last).is_intersection_empty(self) {
+                continue;
+            }
+            // Clamp this range to the letters it shares with `first-last`.
+            let lower = cmp::max(self.start, first);
+            let upper = cmp::min(self.end, last);
+            ranges.push(ClassBytesRange::new(
+                lower ^ CASE_BIT,
+                upper ^ CASE_BIT,
+            ));
+        }
+        Ok(())
+    }
+}
+
+impl ClassBytesRange {
+ /// Create a new byte range for a character class.
+ ///
+ /// The returned range is always in a canonical form. That is, the range
+ /// returned always satisfies the invariant that `start <= end`.
+ ///
+ /// Construction is delegated to `ClassBytesRange::create`.
+ pub fn new(start: u8, end: u8) -> ClassBytesRange {
+ ClassBytesRange::create(start, end)
+ }
+
+ /// Return the start of this range.
+ ///
+ /// The start of a range is always less than or equal to the end of the
+ /// range. The start is included in the range.
+ pub fn start(&self) -> u8 {
+ self.start
+ }
+
+ /// Return the end of this range.
+ ///
+ /// The end of a range is always greater than or equal to the start of the
+ /// range. The end is included in the range.
+ pub fn end(&self) -> u8 {
+ self.end
+ }
+}
+
+impl fmt::Debug for ClassBytesRange {
+    /// Formats both bounds of the range. An ASCII bound is shown as a
+    /// `char`, while any other bound is shown as the raw `u8` value.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_struct("ClassBytesRange");
+        for &(label, byte) in &[("start", self.start), ("end", self.end)] {
+            if byte.is_ascii() {
+                out.field(label, &(byte as char));
+            } else {
+                out.field(label, &byte);
+            }
+        }
+        out.finish()
+    }
+}
+
+/// The high-level intermediate representation for an anchor assertion.
+///
+/// A matching anchor assertion is always zero-length.
+///
+/// The `*Line` variants also match next to a `\n` character, whereas the
+/// `*Text` variants only match at the very start or end of the input.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Anchor {
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following a `\n` character.
+ StartLine,
+ /// Match the end of a line or the end of text. Specifically,
+ /// this matches at the end position of the input, or at the position
+ /// immediately preceding a `\n` character.
+ EndLine,
+ /// Match the beginning of text. Specifically, this matches at the starting
+ /// position of the input.
+ StartText,
+ /// Match the end of text. Specifically, this matches at the ending
+ /// position of the input.
+ EndText,
+}
+
+/// The high-level intermediate representation for a word-boundary assertion.
+///
+/// A matching word boundary assertion is always zero-length.
+///
+/// Use `is_negated` to tell the negated variants apart from the others.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum WordBoundary {
+ /// Match a Unicode-aware word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ Unicode,
+ /// Match a Unicode-aware negation of a word boundary.
+ UnicodeNegate,
+ /// Match an ASCII-only word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ Ascii,
+ /// Match an ASCII-only negation of a word boundary.
+ AsciiNegate,
+}
+
+impl WordBoundary {
+    /// Reports whether this assertion is one of the negated word boundary
+    /// variants (`UnicodeNegate` or `AsciiNegate`).
+    pub fn is_negated(&self) -> bool {
+        match *self {
+            WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
+            WordBoundary::Unicode | WordBoundary::Ascii => false,
+        }
+    }
+}
+
+/// The high-level intermediate representation for a group.
+///
+/// This represents one of three possible group types:
+///
+/// 1. A non-capturing group (e.g., `(?:expr)`).
+/// 2. A capturing group (e.g., `(expr)`).
+/// 3. A named capturing group (e.g., `(?P<name>expr)`).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Group {
+ /// The kind of this group. If it is a capturing group, then the kind
+ /// contains the capture group index (and the name, if it is a named
+ /// group).
+ pub kind: GroupKind,
+ /// The expression inside the group (capturing or not), which may be
+ /// empty.
+ pub hir: Box<Hir>,
+}
+
+/// The kind of group.
+///
+/// Both capturing variants carry a capture index; only `CaptureName`
+/// additionally carries a name.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum GroupKind {
+ /// A normal unnamed capturing group.
+ ///
+ /// The value is the capture index of the group.
+ CaptureIndex(u32),
+ /// A named capturing group.
+ CaptureName {
+ /// The name of the group.
+ name: String,
+ /// The capture index of the group.
+ index: u32,
+ },
+ /// A non-capturing group.
+ NonCapturing,
+}
+
+/// The high-level intermediate representation of a repetition operator.
+///
+/// A repetition operator permits the repetition of an arbitrary
+/// sub-expression.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Repetition {
+ /// The kind of this repetition operator, i.e., how many times the
+ /// sub-expression may be repeated.
+ pub kind: RepetitionKind,
+ /// Whether this repetition operator is greedy or not. A greedy operator
+ /// will match as much as it can. A non-greedy operator will match as
+ /// little as it can.
+ ///
+ /// Typically, operators are greedy by default and are only non-greedy when
+ /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
+ /// not. However, this can be inverted via the `U` "ungreedy" flag.
+ pub greedy: bool,
+ /// The expression being repeated.
+ pub hir: Box<Hir>,
+}
+
+impl Repetition {
+    /// Reports whether the repetition operator by itself permits matching
+    /// the empty string, i.e., whether zero repetitions are allowed.
+    ///
+    /// This is deliberately not defined inductively: only the operator is
+    /// inspected, never the expression being repeated. So `a*` reports
+    /// `true`, whereas `()+` reports `false` even though one or more
+    /// repetitions of `()` always match the empty string. The inductive
+    /// definition is available through the corresponding method on
+    /// [`Hir`](struct.Hir.html).
+    pub fn is_match_empty(&self) -> bool {
+        match self.kind {
+            RepetitionKind::ZeroOrOne | RepetitionKind::ZeroOrMore => true,
+            RepetitionKind::OneOrMore => false,
+            // Counted repetitions permit an empty match exactly when their
+            // minimum count is zero.
+            RepetitionKind::Range(ref range) => match *range {
+                RepetitionRange::Exactly(min)
+                | RepetitionRange::AtLeast(min)
+                | RepetitionRange::Bounded(min, _) => min == 0,
+            },
+        }
+    }
+}
+
+/// The kind of a repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionKind {
+ /// Matches a sub-expression zero or one times.
+ ZeroOrOne,
+ /// Matches a sub-expression zero or more times.
+ ZeroOrMore,
+ /// Matches a sub-expression one or more times.
+ OneOrMore,
+ /// Matches a sub-expression a counted number of times. The count may be
+ /// exact, bounded only from below, or bounded on both sides; see
+ /// `RepetitionRange`.
+ Range(RepetitionRange),
+}
+
+/// The kind of a counted repetition operator.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum RepetitionRange {
+ /// Matches a sub-expression exactly this many times.
+ Exactly(u32),
+ /// Matches a sub-expression at least this many times.
+ AtLeast(u32),
+ /// Matches a sub-expression at least `m` times and at most `n` times,
+ /// where `m` is the first field and `n` is the second.
+ Bounded(u32, u32),
+}
+
+/// A custom `Drop` impl is used for `Hir` such that it uses constant stack
+/// space but heap space proportional to the depth of the total `Hir`.
+///
+/// Instead of relying on recursive drops of nested expressions, children are
+/// moved onto an explicit heap-allocated stack and dropped one at a time.
+impl Drop for Hir {
+ fn drop(&mut self) {
+ use std::mem;
+
+ // Fast path: expressions without nested sub-expressions need no
+ // explicit stack, so return before allocating one.
+ match *self.kind() {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_) => return,
+ HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
+ HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
+ HirKind::Concat(ref x) if x.is_empty() => return,
+ HirKind::Alternation(ref x) if x.is_empty() => return,
+ _ => {}
+ }
+
+ // Move the whole expression out of `self`, leaving an empty expression
+ // behind, and take it apart iteratively.
+ let mut stack = vec![mem::replace(self, Hir::empty())];
+ while let Some(mut expr) = stack.pop() {
+ match expr.kind {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_) => {}
+ // Detach the child (an empty expression takes its place) so that
+ // dropping `expr` at the end of this iteration hits the fast path
+ // above.
+ HirKind::Group(ref mut x) => {
+ stack.push(mem::replace(&mut x.hir, Hir::empty()));
+ }
+ HirKind::Repetition(ref mut x) => {
+ stack.push(mem::replace(&mut x.hir, Hir::empty()));
+ }
+ // Draining leaves an empty vector behind, which is also handled by
+ // the fast path above.
+ HirKind::Concat(ref mut x) => {
+ stack.extend(x.drain(..));
+ }
+ HirKind::Alternation(ref mut x) => {
+ stack.extend(x.drain(..));
+ }
+ }
+ }
+ }
+}
+
+/// A type that documents various attributes of an HIR expression.
+///
+/// These attributes are typically defined inductively on the HIR.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct HirInfo {
+ /// Represent yes/no questions by a bitfield to conserve space, since
+ /// this is included in every HIR expression.
+ ///
+ /// If more attributes need to be added, it is OK to increase the size of
+ /// this as appropriate.
+ ///
+ /// The bit position of each attribute is fixed by the `define_bool!`
+ /// invocations in `impl HirInfo`.
+ bools: u16,
+}
+
+// Defines a getter `$is_fn_name` and a setter `$set_fn_name` for bit `$bit` of `self.bools`.
+macro_rules! define_bool {
+ ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
+ fn $is_fn_name(&self) -> bool {
+ self.bools & (0b1 << $bit) > 0 // true iff bit `$bit` is set
+ }
+
+ fn $set_fn_name(&mut self, yes: bool) {
+ if yes {
+ self.bools |= 1 << $bit; // set the bit, leaving all others untouched
+ } else {
+ self.bools &= !(1 << $bit); // clear the bit, leaving all others untouched
+ }
+ }
+ };
+}
+
+impl HirInfo {
+ fn new() -> HirInfo {
+ HirInfo { bools: 0 } // every attribute starts out false (all bits cleared)
+ }
+
+ define_bool!(0, is_always_utf8, set_always_utf8); // first argument is the bit position; one bit per attribute
+ define_bool!(1, is_all_assertions, set_all_assertions);
+ define_bool!(2, is_anchored_start, set_anchored_start);
+ define_bool!(3, is_anchored_end, set_anchored_end);
+ define_bool!(4, is_line_anchored_start, set_line_anchored_start);
+ define_bool!(5, is_line_anchored_end, set_line_anchored_end);
+ define_bool!(6, is_any_anchored_start, set_any_anchored_start);
+ define_bool!(7, is_any_anchored_end, set_any_anchored_end);
+ define_bool!(8, is_match_empty, set_match_empty);
+ define_bool!(9, is_literal, set_literal);
+ define_bool!(10, is_alternation_literal, set_alternation_literal); // highest bit in use; `bools` is a u16
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
+ let ranges: Vec<ClassUnicodeRange> = ranges
+ .iter()
+ .map(|&(s, e)| ClassUnicodeRange::new(s, e))
+ .collect();
+ ClassUnicode::new(ranges)
+ }
+
+ fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
+ let ranges: Vec<ClassBytesRange> =
+ ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
+ ClassBytes::new(ranges)
+ }
+
+ fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
+ cls.iter().map(|x| (x.start(), x.end())).collect()
+ }
+
+ #[cfg(feature = "unicode-case")]
+ fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls.clone();
+ cls_.case_fold_simple();
+ cls_
+ }
+
+ fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.union(cls2);
+ cls_
+ }
+
+ fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.intersect(cls2);
+ cls_
+ }
+
+ fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.difference(cls2);
+ cls_
+ }
+
+ fn usymdifference(
+ cls1: &ClassUnicode,
+ cls2: &ClassUnicode,
+ ) -> ClassUnicode {
+ let mut cls_ = cls1.clone();
+ cls_.symmetric_difference(cls2);
+ cls_
+ }
+
+ fn unegate(cls: &ClassUnicode) -> ClassUnicode {
+ let mut cls_ = cls.clone();
+ cls_.negate();
+ cls_
+ }
+
+ fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
+ cls.iter().map(|x| (x.start(), x.end())).collect()
+ }
+
+ fn bcasefold(cls: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls.clone();
+ cls_.case_fold_simple();
+ cls_
+ }
+
+ fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.union(cls2);
+ cls_
+ }
+
+ fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.intersect(cls2);
+ cls_
+ }
+
+ fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.difference(cls2);
+ cls_
+ }
+
+ fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls1.clone();
+ cls_.symmetric_difference(cls2);
+ cls_
+ }
+
+ fn bnegate(cls: &ClassBytes) -> ClassBytes {
+ let mut cls_ = cls.clone();
+ cls_.negate();
+ cls_
+ }
+
+ #[test]
+ fn class_range_canonical_unicode() {
+ let range = ClassUnicodeRange::new('\u{00FF}', '\0');
+ assert_eq!('\0', range.start());
+ assert_eq!('\u{00FF}', range.end());
+ }
+
+ #[test]
+ fn class_range_canonical_bytes() {
+ let range = ClassBytesRange::new(b'\xFF', b'\0');
+ assert_eq!(b'\0', range.start());
+ assert_eq!(b'\xFF', range.end());
+ }
+
+ #[test]
+ fn class_canonicalize_unicode() {
+ let cls = uclass(&[('a', 'c'), ('x', 'z')]);
+ let expected = vec![('a', 'c'), ('x', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('x', 'z'), ('a', 'c')]);
+ let expected = vec![('a', 'c'), ('x', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('x', 'z'), ('w', 'y')]);
+ let expected = vec![('w', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[
+ ('c', 'f'),
+ ('a', 'g'),
+ ('d', 'j'),
+ ('a', 'c'),
+ ('m', 'p'),
+ ('l', 's'),
+ ]);
+ let expected = vec![('a', 'j'), ('l', 's')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('x', 'z'), ('u', 'w')]);
+ let expected = vec![('u', 'z')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
+ let expected = vec![('\x00', '\u{10FFFF}')];
+ assert_eq!(expected, uranges(&cls));
+
+ let cls = uclass(&[('a', 'a'), ('b', 'b')]);
+ let expected = vec![('a', 'b')];
+ assert_eq!(expected, uranges(&cls));
+ }
+
+ #[test]
+ fn class_canonicalize_bytes() {
+ let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
+ let expected = vec![(b'a', b'c'), (b'x', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
+ let expected = vec![(b'a', b'c'), (b'x', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
+ let expected = vec![(b'w', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[
+ (b'c', b'f'),
+ (b'a', b'g'),
+ (b'd', b'j'),
+ (b'a', b'c'),
+ (b'm', b'p'),
+ (b'l', b's'),
+ ]);
+ let expected = vec![(b'a', b'j'), (b'l', b's')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
+ let expected = vec![(b'u', b'z')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
+ let expected = vec![(b'\x00', b'\xFF')];
+ assert_eq!(expected, branges(&cls));
+
+ let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
+ let expected = vec![(b'a', b'b')];
+ assert_eq!(expected, branges(&cls));
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-case")]
+ fn class_case_fold_unicode() {
+ let cls = uclass(&[
+ ('C', 'F'),
+ ('A', 'G'),
+ ('D', 'J'),
+ ('A', 'C'),
+ ('M', 'P'),
+ ('L', 'S'),
+ ('c', 'f'),
+ ]);
+ let expected = uclass(&[
+ ('A', 'J'),
+ ('L', 'S'),
+ ('a', 'j'),
+ ('l', 's'),
+ ('\u{17F}', '\u{17F}'),
+ ]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('A', 'Z')]);
+ let expected = uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('a', 'z')]);
+ let expected = uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('A', 'A'), ('_', '_')]);
+ let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('A', 'A'), ('=', '=')]);
+ let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('\x00', '\x10')]);
+ assert_eq!(cls, ucasefold(&cls));
+
+ let cls = uclass(&[('k', 'k')]);
+ let expected =
+ uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
+ assert_eq!(expected, ucasefold(&cls));
+
+ let cls = uclass(&[('@', '@')]);
+ assert_eq!(cls, ucasefold(&cls));
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-case"))]
+ fn class_case_fold_unicode_disabled() {
+ let mut cls = uclass(&[
+ ('C', 'F'),
+ ('A', 'G'),
+ ('D', 'J'),
+ ('A', 'C'),
+ ('M', 'P'),
+ ('L', 'S'),
+ ('c', 'f'),
+ ]);
+ assert!(cls.try_case_fold_simple().is_err());
+ }
+
+ #[test]
+ #[should_panic]
+ #[cfg(not(feature = "unicode-case"))]
+ fn class_case_fold_unicode_disabled_panics() {
+ let mut cls = uclass(&[
+ ('C', 'F'),
+ ('A', 'G'),
+ ('D', 'J'),
+ ('A', 'C'),
+ ('M', 'P'),
+ ('L', 'S'),
+ ('c', 'f'),
+ ]);
+ cls.case_fold_simple();
+ }
+
+ #[test]
+ fn class_case_fold_bytes() {
+ let cls = bclass(&[
+ (b'C', b'F'),
+ (b'A', b'G'),
+ (b'D', b'J'),
+ (b'A', b'C'),
+ (b'M', b'P'),
+ (b'L', b'S'),
+ (b'c', b'f'),
+ ]);
+ let expected =
+ bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'A', b'Z')]);
+ let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'a', b'z')]);
+ let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
+ let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
+ let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\x10')]);
+ assert_eq!(cls, bcasefold(&cls));
+
+ let cls = bclass(&[(b'k', b'k')]);
+ let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
+ assert_eq!(expected, bcasefold(&cls));
+
+ let cls = bclass(&[(b'@', b'@')]);
+ assert_eq!(cls, bcasefold(&cls));
+ }
+
+ #[test]
+ fn class_negate_unicode() {
+ let cls = uclass(&[('a', 'a')]);
+ let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('a', 'a'), ('b', 'b')]);
+ let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('a', 'c'), ('x', 'z')]);
+ let expected = uclass(&[
+ ('\x00', '\x60'),
+ ('\x64', '\x77'),
+ ('\x7B', '\u{10FFFF}'),
+ ]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', 'a')]);
+ let expected = uclass(&[('\x62', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('a', '\u{10FFFF}')]);
+ let expected = uclass(&[('\x00', '\x60')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', '\u{10FFFF}')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[]);
+ let expected = uclass(&[('\x00', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls =
+ uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
+ let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', '\u{D7FF}')]);
+ let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\x00', '\u{D7FE}')]);
+ let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
+ let expected = uclass(&[('\x00', '\u{D7FF}')]);
+ assert_eq!(expected, unegate(&cls));
+
+ let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
+ let expected = uclass(&[('\x00', '\u{E000}')]);
+ assert_eq!(expected, unegate(&cls));
+ }
+
+ #[test]
+ fn class_negate_bytes() {
+ let cls = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
+ let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
+ let expected = bclass(&[
+ (b'\x00', b'\x60'),
+ (b'\x64', b'\x77'),
+ (b'\x7B', b'\xFF'),
+ ]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'\x00', b'a')]);
+ let expected = bclass(&[(b'\x62', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'a', b'\xFF')]);
+ let expected = bclass(&[(b'\x00', b'\x60')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\xFF')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[]);
+ let expected = bclass(&[(b'\x00', b'\xFF')]);
+ assert_eq!(expected, bnegate(&cls));
+
+ let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
+ let expected = bclass(&[(b'\xFE', b'\xFE')]);
+ assert_eq!(expected, bnegate(&cls));
+ }
+
+ #[test]
+ fn class_union_unicode() {
+ let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
+ let cls2 = uclass(&[('a', 'z')]);
+ let expected = uclass(&[('a', 'z'), ('A', 'C')]);
+ assert_eq!(expected, uunion(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_union_bytes() {
+ let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
+ let cls2 = bclass(&[(b'a', b'z')]);
+ let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
+ assert_eq!(expected, bunion(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_intersect_unicode() {
+ let cls1 = uclass(&[]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[('a', 'a')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('b', 'b')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('a', 'c')]);
+ let expected = uclass(&[('a', 'a')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b')]);
+ let cls2 = uclass(&[('a', 'c')]);
+ let expected = uclass(&[('a', 'b')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b')]);
+ let cls2 = uclass(&[('b', 'c')]);
+ let expected = uclass(&[('b', 'b')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b')]);
+ let cls2 = uclass(&[('c', 'd')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('b', 'c')]);
+ let cls2 = uclass(&[('a', 'd')]);
+ let expected = uclass(&[('b', 'c')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let cls2 = uclass(&[('a', 'h')]);
+ let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
+ let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+ let cls2 = uclass(&[('h', 'h')]);
+ let expected = uclass(&[('h', 'h')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
+ let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
+ let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
+ let expected = uclass(&[('b', 'f')]);
+ assert_eq!(expected, uintersect(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_intersect_bytes() {
+ let cls1 = bclass(&[]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[(b'a', b'a')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'b', b'b')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'a', b'c')]);
+ let expected = bclass(&[(b'a', b'a')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b')]);
+ let cls2 = bclass(&[(b'a', b'c')]);
+ let expected = bclass(&[(b'a', b'b')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b')]);
+ let cls2 = bclass(&[(b'b', b'c')]);
+ let expected = bclass(&[(b'b', b'b')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b')]);
+ let cls2 = bclass(&[(b'c', b'd')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'b', b'c')]);
+ let cls2 = bclass(&[(b'a', b'd')]);
+ let expected = bclass(&[(b'b', b'c')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'a', b'h')]);
+ let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
+ let cls2 = bclass(&[(b'h', b'h')]);
+ let expected = bclass(&[(b'h', b'h')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
+ let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
+ let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
+ let expected = bclass(&[(b'b', b'f')]);
+ assert_eq!(expected, bintersect(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_difference_unicode() {
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'a')]);
+ let cls2 = uclass(&[]);
+ let expected = uclass(&[('a', 'a')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('a', 'a')]);
+ let expected = uclass(&[('b', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('z', 'z')]);
+ let expected = uclass(&[('a', 'y')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('m', 'm')]);
+ let expected = uclass(&[('a', 'l'), ('n', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('a', 'z')]);
+ let expected = uclass(&[]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('d', 'v')]);
+ let expected = uclass(&[('a', 'c')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
+ let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('x', 'z')]);
+ let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('x', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+
+ let cls1 = uclass(&[('a', 'z')]);
+ let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
+ let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
+ assert_eq!(expected, udifference(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_difference_bytes() {
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'a')]);
+ let cls2 = bclass(&[]);
+ let expected = bclass(&[(b'a', b'a')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'a', b'a')]);
+ let expected = bclass(&[(b'b', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'z', b'z')]);
+ let expected = bclass(&[(b'a', b'y')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'm', b'm')]);
+ let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'a', b'z')]);
+ let expected = bclass(&[]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'd', b'v')]);
+ let expected = bclass(&[(b'a', b'c')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
+ let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'x', b'z')]);
+ let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'x', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+
+ let cls1 = bclass(&[(b'a', b'z')]);
+ let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
+ let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
+ assert_eq!(expected, bdifference(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_symmetric_difference_unicode() {
+ let cls1 = uclass(&[('a', 'm')]);
+ let cls2 = uclass(&[('g', 't')]);
+ let expected = uclass(&[('a', 'f'), ('n', 't')]);
+ assert_eq!(expected, usymdifference(&cls1, &cls2));
+ }
+
+ #[test]
+ fn class_symmetric_difference_bytes() {
+ let cls1 = bclass(&[(b'a', b'm')]);
+ let cls2 = bclass(&[(b'g', b't')]);
+ let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
+ assert_eq!(expected, bsymdifference(&cls1, &cls2));
+ }
+
+ #[test]
+ #[should_panic]
+ fn hir_byte_literal_non_ascii() {
+ Hir::literal(Literal::Byte(b'a')); // NOTE(review): passes an ASCII byte and expects a panic; presumably `Hir::literal` requires `Literal::Byte` to be non-ASCII — confirm against its definition
+ }
+
+ // We use a thread with an explicit stack size to test that our destructor
+ // for Hir can handle deeply nested expressions in constant stack
+ // space. In case we run on a platform without threads (WASM?), we limit
+ // this test to Windows/Unix.
+ #[test]
+ #[cfg(any(unix, windows))]
+ fn no_stack_overflow_on_drop() {
+ use std::thread;
+
+ let run = || {
+ let mut expr = Hir::empty();
+ for _ in 0..100 { // each pass wraps `expr` in four more levels: group, repetition, concat, alternation
+ expr = Hir::group(Group {
+ kind: GroupKind::NonCapturing,
+ hir: Box::new(expr),
+ });
+ expr = Hir::repetition(Repetition {
+ kind: RepetitionKind::ZeroOrOne,
+ greedy: true,
+ hir: Box::new(expr),
+ });
+
+ expr = Hir {
+ kind: HirKind::Concat(vec![expr]),
+ info: HirInfo::new(),
+ };
+ expr = Hir {
+ kind: HirKind::Alternation(vec![expr]),
+ info: HirInfo::new(),
+ };
+ }
+ assert!(!expr.kind.is_empty()); // `expr` goes out of scope right after this, so its drop runs inside the closure
+ };
+
+ // We run the closure on a thread with a small stack size so that a
+ // recursive drop would overflow it.
+ thread::Builder::new()
+ .stack_size(1 << 10) // requests 1024 bytes
+ .spawn(run)
+ .unwrap()
+ .join()
+ .unwrap();
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/hir/print.rs b/third_party/rust/regex-syntax/src/hir/print.rs
new file mode 100644
index 0000000000..b71f3897cf
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/print.rs
@@ -0,0 +1,367 @@
+/*!
+This module provides a regular expression printer for `Hir`.
+*/
+
+use std::fmt;
+
+use crate::hir::visitor::{self, Visitor};
+use crate::hir::{self, Hir, HirKind};
+use crate::is_meta_character;
+
+/// A builder for constructing a printer.
+///
+/// Note that since a printer doesn't have any configuration knobs yet, this
+/// type is not `pub` and only `Printer::new` uses it.
+#[derive(Clone, Debug)]
+struct PrinterBuilder {
+ _priv: (),
+}
+
+impl Default for PrinterBuilder {
+ fn default() -> PrinterBuilder {
+ PrinterBuilder::new() // delegate so `default()` and `new()` always agree
+ }
+}
+
+impl PrinterBuilder {
+ fn new() -> PrinterBuilder {
+ PrinterBuilder { _priv: () }
+ }
+
+ fn build(&self) -> Printer {
+ Printer { _priv: () } // the builder holds no options, so every printer is identical
+ }
+}
+
+/// A printer for a regular expression's high-level intermediate
+/// representation.
+///
+/// A printer converts a high-level intermediate representation (HIR) to a
+/// regular expression pattern string. This particular printer uses constant
+/// stack space and heap space proportional to the size of the HIR.
+///
+/// Since this printer only sees the HIR, the pattern it prints will likely
+/// not resemble the original pattern at all. For example, a pattern like
+/// `\pL` has its class written out range by range, and `^` prints as `\A`.
+///
+/// The purpose of this printer is to provide a means to mutate an HIR and then
+/// build a regular expression from the result of that mutation. (A regex
+/// library could provide a constructor from this HIR explicitly, but that
+/// creates an unnecessary public coupling between the regex library and this
+/// specific HIR representation.)
+#[derive(Debug)]
+pub struct Printer {
+ _priv: (),
+}
+
+impl Printer {
+ /// Create a new printer.
+ pub fn new() -> Printer {
+ PrinterBuilder::new().build()
+ }
+
+ /// Print the given `Hir` to the given writer. The writer must implement
+ /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
+ /// here are a `fmt::Formatter` (which is available in `fmt::Display`
+ /// implementations) or a `&mut String`.
+ pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
+ visitor::visit(hir, Writer { wtr }) // all output is produced by the `Visitor` impl on `Writer`
+ }
+}
+
+#[derive(Debug)]
+struct Writer<W> { // visitor state for one `print` call
+ wtr: W, // destination that pattern text is written to
+}
+
+impl<W: fmt::Write> Visitor for Writer<W> {
+ type Output = ();
+ type Err = fmt::Error;
+
+ fn finish(self) -> fmt::Result {
+ Ok(()) // nothing is buffered here; text was written to `wtr` as nodes were visited
+ }
+
+ fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { // writes leaf nodes in full and the opening syntax of groups
+ match *hir.kind() {
+ HirKind::Empty
+ | HirKind::Repetition(_)
+ | HirKind::Concat(_)
+ | HirKind::Alternation(_) => {} // no prefix text; a repetition's operator is written in `visit_post`
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ self.write_literal_char(c)?;
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ self.write_literal_byte(b)?;
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ self.wtr.write_str("[")?;
+ for range in cls.iter() {
+ if range.start() == range.end() { // single-character range: print it once, without a `-`
+ self.write_literal_char(range.start())?;
+ } else {
+ self.write_literal_char(range.start())?;
+ self.wtr.write_str("-")?;
+ self.write_literal_char(range.end())?;
+ }
+ }
+ self.wtr.write_str("]")?;
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ self.wtr.write_str("(?-u:[")?; // byte classes are wrapped in a group with the `u` flag turned off
+ for range in cls.iter() {
+ if range.start() == range.end() { // single-byte range: print it once, without a `-`
+ self.write_literal_class_byte(range.start())?;
+ } else {
+ self.write_literal_class_byte(range.start())?;
+ self.wtr.write_str("-")?;
+ self.write_literal_class_byte(range.end())?;
+ }
+ }
+ self.wtr.write_str("])")?;
+ }
+ HirKind::Anchor(hir::Anchor::StartLine) => {
+ self.wtr.write_str("(?m:^)")?; // line anchors carry the `m` flag explicitly
+ }
+ HirKind::Anchor(hir::Anchor::EndLine) => {
+ self.wtr.write_str("(?m:$)")?;
+ }
+ HirKind::Anchor(hir::Anchor::StartText) => {
+ self.wtr.write_str(r"\A")?;
+ }
+ HirKind::Anchor(hir::Anchor::EndText) => {
+ self.wtr.write_str(r"\z")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
+ self.wtr.write_str(r"\b")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
+ self.wtr.write_str(r"\B")?;
+ }
+ HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
+ self.wtr.write_str(r"(?-u:\b)")?; // ASCII variants are written with the `u` flag turned off
+ }
+ HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
+ self.wtr.write_str(r"(?-u:\B)")?;
+ }
+ HirKind::Group(ref x) => match x.kind { // only the opener is written here; `visit_post` writes the `)`
+ hir::GroupKind::CaptureIndex(_) => {
+ self.wtr.write_str("(")?;
+ }
+ hir::GroupKind::CaptureName { ref name, .. } => {
+ write!(self.wtr, "(?P<{}>", name)?;
+ }
+ hir::GroupKind::NonCapturing => {
+ self.wtr.write_str("(?:")?;
+ }
+ },
+ }
+ Ok(())
+ }
+
+ fn visit_post(&mut self, hir: &Hir) -> fmt::Result { // writes repetition operators and group closers
+ match *hir.kind() {
+ // Fully written in visit_pre, or nothing to write at all.
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Anchor(_)
+ | HirKind::WordBoundary(_)
+ | HirKind::Concat(_)
+ | HirKind::Alternation(_) => {}
+ HirKind::Repetition(ref x) => {
+ match x.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ self.wtr.write_str("?")?;
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ self.wtr.write_str("*")?;
+ }
+ hir::RepetitionKind::OneOrMore => {
+ self.wtr.write_str("+")?;
+ }
+ hir::RepetitionKind::Range(ref x) => match *x {
+ hir::RepetitionRange::Exactly(m) => {
+ write!(self.wtr, "{{{}}}", m)?; // `{{` and `}}` are escaped braces: prints `{m}`
+ }
+ hir::RepetitionRange::AtLeast(m) => {
+ write!(self.wtr, "{{{},}}", m)?; // prints `{m,}`
+ }
+ hir::RepetitionRange::Bounded(m, n) => {
+ write!(self.wtr, "{{{},{}}}", m, n)?; // prints `{m,n}`
+ }
+ },
+ }
+ if !x.greedy {
+ self.wtr.write_str("?")?; // a trailing `?` marks the operator as non-greedy
+ }
+ }
+ HirKind::Group(_) => {
+ self.wtr.write_str(")")?;
+ }
+ }
+ Ok(())
+ }
+
+ fn visit_alternation_in(&mut self) -> fmt::Result { // presumably called by the visitor between alternation branches — confirm in hir/visitor.rs
+ self.wtr.write_str("|")
+ }
+}
+
+impl<W: fmt::Write> Writer<W> {
+ fn write_literal_char(&mut self, c: char) -> fmt::Result { // writes `c`, preceded by a backslash when `is_meta_character(c)`
+ if is_meta_character(c) {
+ self.wtr.write_str("\\")?;
+ }
+ self.wtr.write_char(c)
+ }
+
+ fn write_literal_byte(&mut self, b: u8) -> fmt::Result { // literal byte outside of a class
+ let c = b as char;
+ if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { // ASCII that is neither control nor whitespace
+ self.write_literal_char(c)
+ } else {
+ write!(self.wtr, "(?-u:\\x{:02X})", b) // any other byte: two-digit hex escape with the `u` flag off
+ }
+ }
+
+ fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { // byte inside a class; the caller already wrote `(?-u:[`
+ let c = b as char;
+ if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { // ASCII that is neither control nor whitespace
+ self.write_literal_char(c)
+ } else {
+ write!(self.wtr, "\\x{:02X}", b) // same hex escape as above, without the flag group
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Printer;
+ use crate::ParserBuilder;
+
+ fn roundtrip(given: &str, expected: &str) { // parse `given` with default options, print it, expect `expected`
+ roundtrip_with(|b| b, given, expected);
+ }
+
+ fn roundtrip_bytes(given: &str, expected: &str) { // like `roundtrip`, with `allow_invalid_utf8(true)` set on the parser
+ roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
+ }
+
+ fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
+ where
+ F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
+ {
+ let mut builder = ParserBuilder::new();
+ f(&mut builder); // let the caller adjust parser options before anything is parsed
+ let hir = builder.build().parse(given).unwrap();
+
+ let mut printer = Printer::new();
+ let mut dst = String::new();
+ printer.print(&hir, &mut dst).unwrap();
+
+ // Check that the printed pattern parses under the same configuration.
+ builder.build().parse(&dst).unwrap();
+
+ assert_eq!(expected, dst);
+ }
+
+ #[test]
+ fn print_literal() {
+ roundtrip("a", "a");
+ roundtrip(r"\xff", "\u{FF}");
+ roundtrip_bytes(r"\xff", "\u{FF}");
+ roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
+ roundtrip("☃", "☃");
+ }
+
+ #[test]
+ fn print_class() {
+ roundtrip(r"[a]", r"[a]");
+ roundtrip(r"[a-z]", r"[a-z]");
+ roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
+ roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
+ roundtrip(r"[-]", r"[\-]");
+ roundtrip(r"[☃-⛄]", r"[☃-⛄]");
+
+ roundtrip(r"(?-u)[a]", r"(?-u:[a])");
+ roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
+ roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
+
+ // The following test that the printer escapes meta characters
+ // in character classes.
+ roundtrip(r"[\[]", r"[\[]");
+ roundtrip(r"[Z-_]", r"[Z-_]");
+ roundtrip(r"[Z-_--Z]", r"[\[-_]");
+
+ // The following test that the printer escapes meta characters
+ // in byte oriented character classes.
+ roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
+ roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
+ roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
+ }
+
+ #[test]
+ fn print_anchor() {
+ roundtrip(r"^", r"\A");
+ roundtrip(r"$", r"\z");
+ roundtrip(r"(?m)^", r"(?m:^)");
+ roundtrip(r"(?m)$", r"(?m:$)");
+ }
+
+ #[test]
+ fn print_word_boundary() {
+ roundtrip(r"\b", r"\b");
+ roundtrip(r"\B", r"\B");
+ roundtrip(r"(?-u)\b", r"(?-u:\b)");
+ roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
+ }
+
+ #[test]
+ fn print_repetition() {
+ roundtrip("a?", "a?");
+ roundtrip("a??", "a??");
+ roundtrip("(?U)a?", "a??");
+
+ roundtrip("a*", "a*");
+ roundtrip("a*?", "a*?");
+ roundtrip("(?U)a*", "a*?");
+
+ roundtrip("a+", "a+");
+ roundtrip("a+?", "a+?");
+ roundtrip("(?U)a+", "a+?");
+
+ roundtrip("a{1}", "a{1}");
+ roundtrip("a{1,}", "a{1,}");
+ roundtrip("a{1,5}", "a{1,5}");
+ roundtrip("a{1}?", "a{1}?");
+ roundtrip("a{1,}?", "a{1,}?");
+ roundtrip("a{1,5}?", "a{1,5}?");
+ roundtrip("(?U)a{1}", "a{1}?");
+ roundtrip("(?U)a{1,}", "a{1,}?");
+ roundtrip("(?U)a{1,5}", "a{1,5}?");
+ }
+
+ #[test]
+ fn print_group() {
+ roundtrip("()", "()");
+ roundtrip("(?P<foo>)", "(?P<foo>)");
+ roundtrip("(?:)", "(?:)");
+
+ roundtrip("(a)", "(a)");
+ roundtrip("(?P<foo>a)", "(?P<foo>a)");
+ roundtrip("(?:a)", "(?:a)");
+
+ roundtrip("((((a))))", "((((a))))");
+ }
+
+ #[test]
+ fn print_alternation() {
+ roundtrip("|", "|");
+ roundtrip("||", "||");
+
+ roundtrip("a|b", "a|b");
+ roundtrip("a|b|c", "a|b|c");
+ roundtrip("foo|bar|quux", "foo|bar|quux");
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/hir/translate.rs b/third_party/rust/regex-syntax/src/hir/translate.rs
new file mode 100644
index 0000000000..890e1608b3
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/translate.rs
@@ -0,0 +1,3207 @@
+/*!
+Defines a translator that converts an `Ast` to an `Hir`.
+*/
+
+use std::cell::{Cell, RefCell};
+use std::result;
+
+use crate::ast::{self, Ast, Span, Visitor};
+use crate::hir::{self, Error, ErrorKind, Hir};
+use crate::unicode::{self, ClassQuery};
+
+type Result<T> = result::Result<T, Error>;
+
+/// A builder for constructing an AST->HIR translator.
+#[derive(Clone, Debug)]
+pub struct TranslatorBuilder {
+ allow_invalid_utf8: bool, // copied into the built `Translator` by `build`
+ flags: Flags, // flag settings the built `Translator` starts out with
+}
+
+impl Default for TranslatorBuilder {
+    /// Equivalent to `TranslatorBuilder::new()`.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TranslatorBuilder {
+    /// Create a new translator builder with a default configuration.
+    ///
+    /// The default configuration forbids invalid UTF-8 and leaves every flag
+    /// unset.
+    pub fn new() -> TranslatorBuilder {
+        let flags = Flags::default();
+        TranslatorBuilder { allow_invalid_utf8: false, flags }
+    }
+
+    /// Build a translator using the current configuration.
+    ///
+    /// The returned translator starts with an empty frame stack and a copy
+    /// of this builder's flags.
+    pub fn build(&self) -> Translator {
+        let allow_invalid_utf8 = self.allow_invalid_utf8;
+        Translator {
+            stack: RefCell::new(Vec::new()),
+            flags: Cell::new(self.flags),
+            allow_invalid_utf8,
+        }
+    }
+
+    /// When enabled, translation will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+    ///
+    /// When disabled (the default), the translator is guaranteed to produce
+    /// an expression that will only ever match valid UTF-8 (otherwise, the
+    /// translator will return an error).
+    ///
+    /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
+    /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
+    /// the translator to return an error. Namely, a negated ASCII word
+    /// boundary can result in matching positions that aren't valid UTF-8
+    /// boundaries.
+    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        self.allow_invalid_utf8 = yes;
+        self
+    }
+
+    /// Enable or disable the case insensitive flag (`i`) by default.
+    pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        // Disabling is recorded as "unset" rather than "explicitly off".
+        self.flags.case_insensitive = match yes {
+            true => Some(true),
+            false => None,
+        };
+        self
+    }
+
+    /// Enable or disable the multi-line matching flag (`m`) by default.
+    pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        self.flags.multi_line = match yes {
+            true => Some(true),
+            false => None,
+        };
+        self
+    }
+
+    /// Enable or disable the "dot matches any character" flag (`s`) by
+    /// default.
+    pub fn dot_matches_new_line(
+        &mut self,
+        yes: bool,
+    ) -> &mut TranslatorBuilder {
+        self.flags.dot_matches_new_line = match yes {
+            true => Some(true),
+            false => None,
+        };
+        self
+    }
+
+    /// Enable or disable the "swap greed" flag (`U`) by default.
+    pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        self.flags.swap_greed = match yes {
+            true => Some(true),
+            false => None,
+        };
+        self
+    }
+
+    /// Enable or disable the Unicode flag (`u`) by default.
+    pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        // Unicode mode is on when unset, so only "off" is stored explicitly.
+        self.flags.unicode = match yes {
+            true => None,
+            false => Some(false),
+        };
+        self
+    }
+}
+
+/// A translator maps abstract syntax to a high level intermediate
+/// representation.
+///
+/// A translator may benefit from reuse. That is, a translator can translate
+/// many abstract syntax trees.
+///
+/// A `Translator` can be configured in more detail via a
+/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
+#[derive(Clone, Debug)]
+pub struct Translator {
+ /// Our call stack, but on the heap.
+ stack: RefCell<Vec<HirFrame>>,
+ /// The current flag settings.
+ flags: Cell<Flags>,
+ /// Whether we're allowed to produce HIR that can match arbitrary bytes.
+ allow_invalid_utf8: bool,
+}
+
+impl Translator {
+    /// Create a new translator using the default configuration.
+    pub fn new() -> Translator {
+        TranslatorBuilder::new().build()
+    }
+
+    /// Translate the given abstract syntax tree (AST) into a high level
+    /// intermediate representation (HIR).
+    ///
+    /// If there was a problem doing the translation, then an HIR-specific
+    /// error is returned.
+    ///
+    /// The original pattern string used to produce the `Ast` *must* also be
+    /// provided. The translator does not use the pattern string during any
+    /// correct translation; it is only used for error reporting.
+    ///
+    /// Each call is independent of the previous one: flags set inside one
+    /// pattern (e.g. a leading `(?i)`) and frames left behind by a failed
+    /// translation do not carry over to the next call.
+    pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
+        // Flag directives that are not enclosed in a group are written into
+        // `self.flags` and never undone by the visitor, so remember the
+        // configured flags and put them back once the traversal is over.
+        let configured_flags = self.flags.get();
+        // Start from an empty stack in case an earlier traversal was cut
+        // short; `finish` asserts that exactly one frame remains.
+        self.stack.borrow_mut().clear();
+        let result = ast::visit(ast, TranslatorI::new(self, pattern));
+        self.flags.set(configured_flags);
+        // A failed traversal returns early and leaves its frames behind.
+        self.stack.borrow_mut().clear();
+        result
+    }
+}
+
+/// An HirFrame is a single stack frame, represented explicitly, which is
+/// created for each item in the Ast that we traverse.
+///
+/// Note that technically, this type doesn't represent our entire stack
+/// frame. In particular, the Ast visitor represents any state associated with
+/// traversing the Ast itself.
+#[derive(Clone, Debug)]
+enum HirFrame {
+ /// An arbitrary HIR expression. These get pushed whenever we hit a base
+ /// case in the Ast. They get popped after an inductive (i.e., recursive)
+ /// step is complete.
+ Expr(Hir),
+ /// A Unicode character class. This frame is mutated as we descend into
+ /// the Ast of a character class (which is itself its own mini recursive
+ /// structure).
+ ClassUnicode(hir::ClassUnicode),
+ /// A byte-oriented character class. This frame is mutated as we descend
+ /// into the Ast of a character class (which is itself its own mini
+ /// recursive structure).
+ ///
+ /// Byte character classes are created when Unicode mode (`u`) is disabled.
+ /// If `allow_invalid_utf8` is disabled (the default), then a byte
+ /// character class is only permitted to match ASCII text.
+ ClassBytes(hir::ClassBytes),
+ /// This is pushed on to the stack upon first seeing any kind of group,
+ /// indicated by parentheses (including non-capturing groups). It is popped
+ /// upon leaving a group.
+ Group {
+ /// The old active flags when this group was opened.
+ ///
+ /// If this group sets flags, then the new active flags are set to the
+ /// result of merging the old flags with the flags introduced by this
+ /// group. If the group doesn't set any flags, then this is simply
+ /// equivalent to whatever flags were set when the group was opened.
+ ///
+ /// When this group is popped, the active flags should be restored to
+ /// the flags set here.
+ ///
+ /// The "active" flags correspond to whatever flags are set in the
+ /// Translator.
+ old_flags: Flags,
+ },
+ /// This is pushed whenever a concatenation is observed. After visiting
+ /// every sub-expression in the concatenation, the translator's stack is
+ /// popped until it sees a Concat frame.
+ Concat,
+ /// This is pushed whenever an alternation is observed. After visiting
+ /// every sub-expression in the alternation, the translator's stack is
+ /// popped until it sees an Alternation frame.
+ Alternation,
+}
+
+impl HirFrame {
+    /// Consume this frame and return the HIR expression it carries.
+    ///
+    /// Panics if the frame is anything other than `HirFrame::Expr`.
+    fn unwrap_expr(self) -> Hir {
+        match self {
+            HirFrame::Expr(expr) => expr,
+            other => {
+                panic!("tried to unwrap expr from HirFrame, got: {:?}", other)
+            }
+        }
+    }
+
+    /// Consume this frame and return the Unicode class it carries.
+    ///
+    /// Panics if the frame is anything other than `HirFrame::ClassUnicode`.
+    fn unwrap_class_unicode(self) -> hir::ClassUnicode {
+        match self {
+            HirFrame::ClassUnicode(cls) => cls,
+            other => panic!(
+                "tried to unwrap Unicode class \
+                 from HirFrame, got: {:?}",
+                other
+            ),
+        }
+    }
+
+    /// Consume this frame and return the byte class it carries.
+    ///
+    /// Panics if the frame is anything other than `HirFrame::ClassBytes`.
+    fn unwrap_class_bytes(self) -> hir::ClassBytes {
+        match self {
+            HirFrame::ClassBytes(cls) => cls,
+            other => panic!(
+                "tried to unwrap byte class \
+                 from HirFrame, got: {:?}",
+                other
+            ),
+        }
+    }
+
+    /// Consume this frame and return the flags recorded when the group was
+    /// entered.
+    ///
+    /// Panics if the frame is anything other than `HirFrame::Group`.
+    fn unwrap_group(self) -> Flags {
+        match self {
+            HirFrame::Group { old_flags } => old_flags,
+            other => {
+                panic!("tried to unwrap group from HirFrame, got: {:?}", other)
+            }
+        }
+    }
+}
+
+impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
+ type Output = Hir;
+ type Err = Error;
+
+    /// Finish the traversal and return the translated expression.
+    ///
+    /// Panics if the frame stack does not hold exactly one frame, or if that
+    /// frame is not an expression.
+    fn finish(self) -> Result<Hir> {
+        // A completed traversal leaves exactly one frame behind: the
+        // expression for the whole pattern.
+        let depth = self.trans().stack.borrow().len();
+        assert_eq!(depth, 1);
+        let root = self.pop().unwrap();
+        Ok(root.unwrap_expr())
+    }
+
+    /// Push the marker or accumulator frame that the matching `visit_post`
+    /// step expects for `ast`, if it needs one.
+    ///
+    /// Bracketed classes get an empty class accumulator, groups record the
+    /// flags to restore on exit, and non-empty concatenations/alternations
+    /// get a marker frame. Every other node pushes nothing here.
+    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
+        let frame = match *ast {
+            Ast::Class(ast::Class::Bracketed(_)) => {
+                if self.flags().unicode() {
+                    HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+                } else {
+                    HirFrame::ClassBytes(hir::ClassBytes::empty())
+                }
+            }
+            Ast::Group(ref group) => {
+                // Applying the group's flags (if any) yields the flags that
+                // were active before; otherwise the active flags are kept.
+                let old_flags = match group.flags() {
+                    Some(ast_flags) => self.set_flags(ast_flags),
+                    None => self.flags(),
+                };
+                HirFrame::Group { old_flags }
+            }
+            Ast::Concat(ref concat) if !concat.asts.is_empty() => {
+                HirFrame::Concat
+            }
+            Ast::Alternation(ref alt) if !alt.asts.is_empty() => {
+                HirFrame::Alternation
+            }
+            // Includes empty concatenations and alternations.
+            _ => return Ok(()),
+        };
+        self.push(frame);
+        Ok(())
+    }
+
+ fn visit_post(&mut self, ast: &Ast) -> Result<()> { // builds the HIR for `ast` from frames left by its children
+ match *ast {
+ Ast::Empty(_) => {
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Flags(ref x) => {
+ self.set_flags(&x.flags); // the returned old flags are discarded; nothing restores them here
+ // Flags in the AST are generally considered directives and
+ // not actual sub-expressions. However, they can be used in
+ // the concrete syntax like `((?i))`, and we need some kind of
+ // indication of an expression there, and Empty is the correct
+ // choice.
+ //
+ // There can also be things like `(?i)+`, but we rule those out
+ // in the parser. In the future, we might allow them for
+ // consistency's sake.
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Literal(ref x) => {
+ self.push(HirFrame::Expr(self.hir_literal(x)?));
+ }
+ Ast::Dot(span) => {
+ self.push(HirFrame::Expr(self.hir_dot(span)?));
+ }
+ Ast::Assertion(ref x) => {
+ self.push(HirFrame::Expr(self.hir_assertion(x)?));
+ }
+ Ast::Class(ast::Class::Perl(ref x)) => {
+ if self.flags().unicode() {
+ let cls = self.hir_perl_unicode_class(x)?;
+ let hcls = hir::Class::Unicode(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ } else {
+ let cls = self.hir_perl_byte_class(x);
+ let hcls = hir::Class::Bytes(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ }
+ }
+ Ast::Class(ast::Class::Unicode(ref x)) => {
+ let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); // errors when Unicode mode is disabled
+ self.push(HirFrame::Expr(Hir::class(cls)));
+ }
+ Ast::Class(ast::Class::Bracketed(ref ast)) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode(); // accumulator pushed by visit_pre
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ if cls.ranges().is_empty() {
+ return Err(self.error(
+ ast.span,
+ ErrorKind::EmptyClassNotAllowed,
+ ));
+ }
+ let expr = Hir::class(hir::Class::Unicode(cls));
+ self.push(HirFrame::Expr(expr));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes(); // accumulator pushed by visit_pre
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ if cls.ranges().is_empty() {
+ return Err(self.error(
+ ast.span,
+ ErrorKind::EmptyClassNotAllowed,
+ ));
+ }
+
+ let expr = Hir::class(hir::Class::Bytes(cls));
+ self.push(HirFrame::Expr(expr));
+ }
+ }
+ Ast::Repetition(ref x) => {
+ let expr = self.pop().unwrap().unwrap_expr(); // the repeated sub-expression
+ self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
+ }
+ Ast::Group(ref x) => {
+ let expr = self.pop().unwrap().unwrap_expr();
+ let old_flags = self.pop().unwrap().unwrap_group();
+ self.trans().flags.set(old_flags); // restore the flags recorded when the group was entered
+ self.push(HirFrame::Expr(self.hir_group(x, expr)));
+ }
+ Ast::Concat(_) => {
+ let mut exprs = vec![];
+ while let Some(HirFrame::Expr(expr)) = self.pop() { // stops at, and discards, the first non-Expr frame
+ if !expr.kind().is_empty() { // empty sub-expressions are left out of the concatenation
+ exprs.push(expr);
+ }
+ }
+ exprs.reverse(); // frames were popped in reverse order
+ self.push(HirFrame::Expr(Hir::concat(exprs)));
+ }
+ Ast::Alternation(_) => {
+ let mut exprs = vec![];
+ while let Some(HirFrame::Expr(expr)) = self.pop() { // stops at, and discards, the first non-Expr frame
+ exprs.push(expr);
+ }
+ exprs.reverse(); // frames were popped in reverse order
+ self.push(HirFrame::Expr(Hir::alternation(exprs)));
+ }
+ }
+ Ok(())
+ }
+
+    /// Push an empty class accumulator when entering a nested bracketed
+    /// class; every other kind of class set item needs no setup here.
+    fn visit_class_set_item_pre(
+        &mut self,
+        ast: &ast::ClassSetItem,
+    ) -> Result<()> {
+        // The Union case needs no handling here: the visitor walks its
+        // items for us.
+        if let ast::ClassSetItem::Bracketed(_) = *ast {
+            let frame = if self.flags().unicode() {
+                HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+            } else {
+                HirFrame::ClassBytes(hir::ClassBytes::empty())
+            };
+            self.push(frame);
+        }
+        Ok(())
+    }
+
+ fn visit_class_set_item_post( // folds one class set item into the class accumulator on top of the stack
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+ match *ast {
+ ast::ClassSetItem::Empty(_) => {}
+ ast::ClassSetItem::Literal(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); // a literal is a one-element range
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let byte = self.class_literal_byte(x)?; // errors if the literal cannot be used as a byte
+ cls.push(hir::ClassBytesRange::new(byte, byte));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Range(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let start = self.class_literal_byte(&x.start)?;
+ let end = self.class_literal_byte(&x.end)?;
+ cls.push(hir::ClassBytesRange::new(start, end));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Ascii(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_ascii_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_ascii_byte_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Unicode(ref x) => {
+ let xcls = self.hir_unicode_class(x)?; // errors when Unicode mode is disabled, before any frame is popped
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ }
+ ast::ClassSetItem::Perl(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_perl_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_perl_byte_class(x);
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Bracketed(ref ast) => {
+ if self.flags().unicode() {
+ let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); // the nested class (pushed by visit_class_set_item_pre)
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); // the frame underneath, which absorbs the nested class
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassUnicode(cls2));
+ } else {
+ let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); // the nested class (pushed by visit_class_set_item_pre)
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); // the frame underneath, which absorbs the nested class
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassBytes(cls2));
+ }
+ }
+ // This is handled automatically by the visitor.
+ ast::ClassSetItem::Union(_) => {}
+ }
+ Ok(())
+ }
+
+    /// Push an empty class accumulator before a binary class set operation
+    /// is visited.
+    ///
+    /// The kind of accumulator (Unicode or bytes) follows the active
+    /// Unicode flag.
+    fn visit_class_set_binary_op_pre(
+        &mut self,
+        _op: &ast::ClassSetBinaryOp,
+    ) -> Result<()> {
+        let frame = if self.flags().unicode() {
+            HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+        } else {
+            HirFrame::ClassBytes(hir::ClassBytes::empty())
+        };
+        self.push(frame);
+        Ok(())
+    }
+
+    /// Push another empty class accumulator between the two operands of a
+    /// binary class set operation.
+    ///
+    /// The kind of accumulator (Unicode or bytes) follows the active
+    /// Unicode flag.
+    fn visit_class_set_binary_op_in(
+        &mut self,
+        _op: &ast::ClassSetBinaryOp,
+    ) -> Result<()> {
+        let frame = if self.flags().unicode() {
+            HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+        } else {
+            HirFrame::ClassBytes(hir::ClassBytes::empty())
+        };
+        self.push(frame);
+        Ok(())
+    }
+
+ fn visit_class_set_binary_op_post( // combines the two operand frames and merges the result into the frame beneath them
+ &mut self,
+ op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ use crate::ast::ClassSetBinaryOpKind::*;
+
+ if self.flags().unicode() {
+ let mut rhs = self.pop().unwrap().unwrap_class_unicode(); // topmost frame: right operand
+ let mut lhs = self.pop().unwrap().unwrap_class_unicode(); // next frame: left operand
+ let mut cls = self.pop().unwrap().unwrap_class_unicode(); // frame beneath both operands; receives the result
+ if self.flags().case_insensitive() {
+ rhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.rhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ lhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.lhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ }
+ match op.kind { // each operation updates `lhs` in place
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut rhs = self.pop().unwrap().unwrap_class_bytes(); // topmost frame: right operand
+ let mut lhs = self.pop().unwrap().unwrap_class_bytes(); // next frame: left operand
+ let mut cls = self.pop().unwrap().unwrap_class_bytes(); // frame beneath both operands; receives the result
+ if self.flags().case_insensitive() {
+ rhs.case_fold_simple();
+ lhs.case_fold_simple();
+ }
+ match op.kind { // each operation updates `lhs` in place
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+}
+
+/// The internal implementation of a translator.
+///
+/// This type is responsible for carrying around the original pattern string,
+/// which is not tied to the internal state of a translator.
+///
+/// A TranslatorI exists for the time it takes to translate a single Ast.
+#[derive(Clone, Debug)]
+struct TranslatorI<'t, 'p> {
+ trans: &'t Translator, // the translator whose stack and flags this traversal reads and updates
+ pattern: &'p str, // the pattern text, copied into every error built by `error`
+}
+
+impl<'t, 'p> TranslatorI<'t, 'p> {
+ /// Build a new internal translator that borrows `trans` and `pattern`.
+ fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
+ TranslatorI { trans, pattern }
+ }
+
+    /// Borrow the translator this traversal operates on.
+    fn trans(&self) -> &Translator {
+        self.trans
+    }
+
+    /// Place `frame` on top of the translator's frame stack.
+    fn push(&self, frame: HirFrame) {
+        let mut stack = self.trans().stack.borrow_mut();
+        stack.push(frame);
+    }
+
+    /// Remove and return the top frame of the translator's frame stack, or
+    /// `None` when the stack is empty.
+    fn pop(&self) -> Option<HirFrame> {
+        let mut stack = self.trans().stack.borrow_mut();
+        stack.pop()
+    }
+
+ /// Create a new error with the given span and error type.
+ fn error(&self, span: Span, kind: ErrorKind) -> Error {
+ Error { kind, pattern: self.pattern.to_string(), span }
+ }
+
+    /// Return a snapshot of the flags that are currently in effect.
+    fn flags(&self) -> Flags {
+        let translator = self.trans();
+        translator.flags.get()
+    }
+
+    /// Apply the flags written in `ast_flags` on top of the active flags and
+    /// return the flags that were active before the change.
+    ///
+    /// Flags not mentioned in `ast_flags` keep their current setting.
+    fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
+        let previous = self.flags();
+        let mut updated = Flags::from_ast(ast_flags);
+        updated.merge(&previous);
+        self.trans().flags.set(updated);
+        previous
+    }
+
+    /// Translate a literal from the AST into an HIR expression.
+    ///
+    /// Raw bytes become byte literals directly. Characters go through case
+    /// folding first when the case insensitive flag is active.
+    fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
+        match self.literal_to_char(lit)? {
+            byte @ hir::Literal::Byte(_) => Ok(Hir::literal(byte)),
+            hir::Literal::Unicode(ch) if self.flags().case_insensitive() => {
+                self.hir_from_char_case_insensitive(lit.span, ch)
+            }
+            hir::Literal::Unicode(ch) => self.hir_from_char(lit.span, ch),
+        }
+    }
+
+    /// Convert an Ast literal to its scalar representation.
+    ///
+    /// In Unicode mode this always succeeds and yields the literal's `char`.
+    ///
+    /// Outside of Unicode mode, a literal that denotes a raw byte yields
+    /// that byte: as a `char` when it is ASCII, as a raw byte otherwise. A
+    /// non-ASCII byte is an error unless invalid UTF-8 is allowed. Literals
+    /// that do not denote a byte still yield their `char`.
+    fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
+        if self.flags().unicode() {
+            return Ok(hir::Literal::Unicode(lit.c));
+        }
+        match lit.byte() {
+            None => Ok(hir::Literal::Unicode(lit.c)),
+            Some(byte) if byte <= 0x7F => {
+                Ok(hir::Literal::Unicode(byte as char))
+            }
+            Some(byte) if self.trans().allow_invalid_utf8 => {
+                Ok(hir::Literal::Byte(byte))
+            }
+            Some(_) => Err(self.error(lit.span, ErrorKind::InvalidUtf8)),
+        }
+    }
+
+    /// Build a literal expression for `c`.
+    ///
+    /// Outside of Unicode mode, a character that needs more than one UTF-8
+    /// byte is rejected with `UnicodeNotAllowed`.
+    fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
+        let multi_byte = c.len_utf8() > 1;
+        if multi_byte && !self.flags().unicode() {
+            return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+        }
+        let literal = hir::Literal::Unicode(c);
+        Ok(Hir::literal(literal))
+    }
+
+    /// Build the expression for `c` under case insensitive matching.
+    ///
+    /// When case folding would add nothing, this defers to `hir_from_char`.
+    /// Otherwise the result is a one-character class closed under simple
+    /// case folding (a Unicode class in Unicode mode, a byte class
+    /// otherwise).
+    fn hir_from_char_case_insensitive(
+        &self,
+        span: Span,
+        c: char,
+    ) -> Result<Hir> {
+        if !self.flags().unicode() {
+            if c.len_utf8() > 1 {
+                return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+            }
+            // Only ASCII letters are affected by ASCII case folding.
+            if !c.is_ascii_alphabetic() {
+                return self.hir_from_char(span, c);
+            }
+            let byte = c as u8;
+            let single = hir::ClassBytesRange::new(byte, byte);
+            let mut cls = hir::ClassBytes::new(vec![single]);
+            cls.case_fold_simple();
+            return Ok(Hir::class(hir::Class::Bytes(cls)));
+        }
+
+        // Skip building a class when `c` has no simple case mapping.
+        let has_mapping = unicode::contains_simple_case_mapping(c, c)
+            .map_err(|_| {
+                self.error(span, ErrorKind::UnicodeCaseUnavailable)
+            })?;
+        if !has_mapping {
+            return self.hir_from_char(span, c);
+        }
+        let single = hir::ClassUnicodeRange::new(c, c);
+        let mut cls = hir::ClassUnicode::new(vec![single]);
+        cls.try_case_fold_simple().map_err(|_| {
+            self.error(span, ErrorKind::UnicodeCaseUnavailable)
+        })?;
+        Ok(Hir::class(hir::Class::Unicode(cls)))
+    }
+
+    /// Translate `.` according to the active Unicode and `s` flags.
+    ///
+    /// Outside of Unicode mode this is an error unless invalid UTF-8 is
+    /// allowed.
+    fn hir_dot(&self, span: Span) -> Result<Hir> {
+        let bytes = !self.flags().unicode();
+        if bytes && !self.trans().allow_invalid_utf8 {
+            return Err(self.error(span, ErrorKind::InvalidUtf8));
+        }
+        let hir = if self.flags().dot_matches_new_line() {
+            Hir::any(bytes)
+        } else {
+            Hir::dot(bytes)
+        };
+        Ok(hir)
+    }
+
+    /// Translate an anchor or word boundary assertion.
+    ///
+    /// `^` and `$` become line anchors only in multi-line mode, and word
+    /// boundaries follow the Unicode flag. A negated ASCII word boundary is
+    /// an error unless invalid UTF-8 is allowed.
+    fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
+        let flags = self.flags();
+        let hir = match asst.kind {
+            ast::AssertionKind::StartLine if flags.multi_line() => {
+                Hir::anchor(hir::Anchor::StartLine)
+            }
+            ast::AssertionKind::StartLine
+            | ast::AssertionKind::StartText => {
+                Hir::anchor(hir::Anchor::StartText)
+            }
+            ast::AssertionKind::EndLine if flags.multi_line() => {
+                Hir::anchor(hir::Anchor::EndLine)
+            }
+            ast::AssertionKind::EndLine | ast::AssertionKind::EndText => {
+                Hir::anchor(hir::Anchor::EndText)
+            }
+            ast::AssertionKind::WordBoundary if flags.unicode() => {
+                Hir::word_boundary(hir::WordBoundary::Unicode)
+            }
+            ast::AssertionKind::WordBoundary => {
+                Hir::word_boundary(hir::WordBoundary::Ascii)
+            }
+            ast::AssertionKind::NotWordBoundary if flags.unicode() => {
+                Hir::word_boundary(hir::WordBoundary::UnicodeNegate)
+            }
+            ast::AssertionKind::NotWordBoundary => {
+                // It is possible for negated ASCII word boundaries to
+                // match at invalid UTF-8 boundaries, even when searching
+                // valid UTF-8.
+                if !self.trans().allow_invalid_utf8 {
+                    return Err(
+                        self.error(asst.span, ErrorKind::InvalidUtf8)
+                    );
+                }
+                Hir::word_boundary(hir::WordBoundary::AsciiNegate)
+            }
+        };
+        Ok(hir)
+    }
+
+    /// Wrap `expr` in an HIR group of the same kind as the AST group.
+    fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
+        let kind = match group.kind {
+            ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
+            ast::GroupKind::CaptureIndex(index) => {
+                hir::GroupKind::CaptureIndex(index)
+            }
+            ast::GroupKind::CaptureName(ref cap) => {
+                let name = cap.name.clone();
+                hir::GroupKind::CaptureName { name, index: cap.index }
+            }
+        };
+        let hir = Box::new(expr);
+        Hir::group(hir::Group { kind, hir })
+    }
+
+    /// Wrap `expr` in an HIR repetition matching the AST repetition
+    /// operator. Greediness is inverted when the swap greed flag is active.
+    fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
+        let kind = match rep.op.kind {
+            ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
+            ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
+            ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
+            ast::RepetitionKind::Range(ref range) => {
+                let range = match *range {
+                    ast::RepetitionRange::Exactly(m) => {
+                        hir::RepetitionRange::Exactly(m)
+                    }
+                    ast::RepetitionRange::AtLeast(m) => {
+                        hir::RepetitionRange::AtLeast(m)
+                    }
+                    ast::RepetitionRange::Bounded(m, n) => {
+                        hir::RepetitionRange::Bounded(m, n)
+                    }
+                };
+                hir::RepetitionKind::Range(range)
+            }
+        };
+        // Swapping greed flips the operator's own greediness.
+        let greedy = rep.greedy != self.flags().swap_greed();
+        let hir = Box::new(expr);
+        Hir::repetition(hir::Repetition { kind, greedy, hir })
+    }
+
+    /// Translate a Unicode class such as `\pL` or `\p{Greek}`.
+    ///
+    /// Errors when Unicode mode is disabled, when the property lookup
+    /// fails, when case folding is unavailable, or when the resulting class
+    /// is empty.
+    fn hir_unicode_class(
+        &self,
+        ast_class: &ast::ClassUnicode,
+    ) -> Result<hir::ClassUnicode> {
+        use crate::ast::ClassUnicodeKind::*;
+
+        if !self.flags().unicode() {
+            return Err(
+                self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
+            );
+        }
+        let query = match ast_class.kind {
+            OneLetter(name) => ClassQuery::OneLetter(name),
+            Named(ref name) => ClassQuery::Binary(name),
+            NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
+                property_name: name,
+                property_value: value,
+            },
+        };
+        let looked_up = unicode::class(query);
+        let mut class =
+            self.convert_unicode_class_error(&ast_class.span, looked_up)?;
+        self.unicode_fold_and_negate(
+            &ast_class.span,
+            ast_class.negated,
+            &mut class,
+        )?;
+        if class.ranges().is_empty() {
+            return Err(
+                self.error(ast_class.span, ErrorKind::EmptyClassNotAllowed)
+            );
+        }
+        Ok(class)
+    }
+
+    /// Build the Unicode class for an ASCII class item such as
+    /// `[:alpha:]`, applying case folding and negation as required.
+    fn hir_ascii_unicode_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassUnicode> {
+        let ranges = ascii_class(&ast.kind)
+            .iter()
+            .map(|&(start, end)| hir::ClassUnicodeRange::new(start, end));
+        let mut cls = hir::ClassUnicode::new(ranges);
+        self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
+    /// Build the byte class for an ASCII class item such as `[:alpha:]`,
+    /// applying case folding and negation as required.
+    fn hir_ascii_byte_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassBytes> {
+        let mut cls = hir_ascii_class_bytes(&ast.kind);
+        self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
+    /// Build the Unicode class for a Perl class (`\d`, `\s` or `\w`),
+    /// negating it when the AST asks for that.
+    ///
+    /// Panics if Unicode mode is disabled.
+    fn hir_perl_unicode_class(
+        &self,
+        ast_class: &ast::ClassPerl,
+    ) -> Result<hir::ClassUnicode> {
+        use crate::ast::ClassPerlKind::*;
+
+        assert!(self.flags().unicode());
+        let looked_up = match ast_class.kind {
+            Digit => unicode::perl_digit(),
+            Space => unicode::perl_space(),
+            Word => unicode::perl_word(),
+        };
+        // We needn't apply case folding here because the Perl Unicode classes
+        // are already closed under Unicode simple case folding.
+        self.convert_unicode_class_error(&ast_class.span, looked_up).map(
+            |mut class| {
+                if ast_class.negated {
+                    class.negate();
+                }
+                class
+            },
+        )
+    }
+
+    /// Build the byte class for a Perl class (`\d`, `\s` or `\w`), negating
+    /// it when the AST asks for that.
+    ///
+    /// Panics if Unicode mode is enabled.
+    fn hir_perl_byte_class(
+        &self,
+        ast_class: &ast::ClassPerl,
+    ) -> hir::ClassBytes {
+        use crate::ast::ClassPerlKind::*;
+
+        assert!(!self.flags().unicode());
+        let ascii_kind = match ast_class.kind {
+            Digit => ast::ClassAsciiKind::Digit,
+            Space => ast::ClassAsciiKind::Space,
+            Word => ast::ClassAsciiKind::Word,
+        };
+        let mut class = hir_ascii_class_bytes(&ascii_kind);
+        // We needn't apply case folding here because the Perl ASCII classes
+        // are already closed (under ASCII case folding).
+        if ast_class.negated {
+            class.negate();
+        }
+        class
+    }
+
+    /// Map a Unicode lookup failure to the corresponding HIR translation
+    /// error, leaving a successful result untouched.
+    ///
+    /// The span given should approximate the position at which an error
+    /// would occur.
+    fn convert_unicode_class_error(
+        &self,
+        span: &Span,
+        result: unicode::Result<hir::ClassUnicode>,
+    ) -> Result<hir::ClassUnicode> {
+        result.map_err(|err| {
+            let kind = match err {
+                unicode::Error::PropertyNotFound => {
+                    ErrorKind::UnicodePropertyNotFound
+                }
+                unicode::Error::PropertyValueNotFound => {
+                    ErrorKind::UnicodePropertyValueNotFound
+                }
+                unicode::Error::PerlClassNotFound => {
+                    ErrorKind::UnicodePerlClassNotFound
+                }
+            };
+            self.error(*span, kind)
+        })
+    }
+
+    /// Apply case folding (when the case insensitive flag is active) and
+    /// then negation (when `negated` is true) to a Unicode class.
+    ///
+    /// Errors with `UnicodeCaseUnavailable` if case folding fails.
+    fn unicode_fold_and_negate(
+        &self,
+        span: &Span,
+        negated: bool,
+        class: &mut hir::ClassUnicode,
+    ) -> Result<()> {
+        // Note that we must apply case folding before negation!
+        // Consider `(?i)[^x]`. If we applied negation first, then
+        // the result would be the character class that matched any
+        // Unicode scalar value.
+        if self.flags().case_insensitive()
+            && class.try_case_fold_simple().is_err()
+        {
+            return Err(
+                self.error(*span, ErrorKind::UnicodeCaseUnavailable)
+            );
+        }
+        if negated {
+            class.negate();
+        }
+        Ok(())
+    }
+
+    /// Apply case folding (when the case insensitive flag is active) and
+    /// then negation (when `negated` is true) to a byte class.
+    ///
+    /// Errors with `InvalidUtf8` if the resulting class is not all ASCII
+    /// and invalid UTF-8 is not allowed.
+    fn bytes_fold_and_negate(
+        &self,
+        span: &Span,
+        negated: bool,
+        class: &mut hir::ClassBytes,
+    ) -> Result<()> {
+        // Note that we must apply case folding before negation!
+        // Consider `(?i)[^x]`. If we applied negation first, then
+        // the result would be the byte class that matched any byte.
+        if self.flags().case_insensitive() {
+            class.case_fold_simple();
+        }
+        if negated {
+            class.negate();
+        }
+        if self.trans().allow_invalid_utf8 || class.is_all_ascii() {
+            Ok(())
+        } else {
+            Err(self.error(*span, ErrorKind::InvalidUtf8))
+        }
+    }
+
+    /// Return the byte to use for `ast` inside a byte oriented character
+    /// class.
+    ///
+    /// Non-ASCII characters are rejected with `UnicodeNotAllowed`.
+    fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
+        match self.literal_to_char(ast)? {
+            hir::Literal::Byte(byte) => Ok(byte),
+            hir::Literal::Unicode(ch) if ch <= '\x7F' => Ok(ch as u8),
+            // We can't feasibly support Unicode in byte oriented classes.
+            // Byte classes don't do Unicode case folding.
+            hir::Literal::Unicode(_) => {
+                Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
+            }
+        }
+    }
+}
+
+/// A translator's representation of a regular expression's flags at any given
+/// moment in time.
+///
+/// Each flag can be in one of three states: absent, present but disabled or
+/// present but enabled.
+#[derive(Clone, Copy, Debug, Default)]
+struct Flags {
+ case_insensitive: Option<bool>, // absent reads as disabled
+ multi_line: Option<bool>, // absent reads as disabled
+ dot_matches_new_line: Option<bool>, // absent reads as disabled
+ swap_greed: Option<bool>, // absent reads as disabled
+ unicode: Option<bool>, // absent reads as ENABLED, unlike the other flags
+ // Note that `ignore_whitespace` is omitted here because it is handled
+ // entirely in the parser.
+}
+
+impl Flags {
+    /// Collect the flags written in an AST flag list.
+    ///
+    /// Flags before a negation are recorded as enabled and flags after it
+    /// as disabled. Flags that are not mentioned stay absent.
+    fn from_ast(ast: &ast::Flags) -> Flags {
+        let mut flags = Flags::default();
+        let mut enable = true;
+        for item in &ast.items {
+            let flag = match item.kind {
+                ast::FlagsItemKind::Negation => {
+                    enable = false;
+                    continue;
+                }
+                ast::FlagsItemKind::Flag(ref flag) => flag,
+            };
+            let setting = Some(enable);
+            match *flag {
+                ast::Flag::CaseInsensitive => flags.case_insensitive = setting,
+                ast::Flag::MultiLine => flags.multi_line = setting,
+                ast::Flag::DotMatchesNewLine => {
+                    flags.dot_matches_new_line = setting
+                }
+                ast::Flag::SwapGreed => flags.swap_greed = setting,
+                ast::Flag::Unicode => flags.unicode = setting,
+                // Handled entirely in the parser.
+                ast::Flag::IgnoreWhitespace => {}
+            }
+        }
+        flags
+    }
+
+    /// Fill every flag that is absent in `self` with its value from
+    /// `previous`. Flags already present in `self` win.
+    fn merge(&mut self, previous: &Flags) {
+        self.case_insensitive =
+            self.case_insensitive.or(previous.case_insensitive);
+        self.multi_line = self.multi_line.or(previous.multi_line);
+        self.dot_matches_new_line =
+            self.dot_matches_new_line.or(previous.dot_matches_new_line);
+        self.swap_greed = self.swap_greed.or(previous.swap_greed);
+        self.unicode = self.unicode.or(previous.unicode);
+    }
+
+    /// Whether case insensitive matching is on (off when absent).
+    fn case_insensitive(&self) -> bool {
+        self.case_insensitive == Some(true)
+    }
+
+    /// Whether multi-line mode is on (off when absent).
+    fn multi_line(&self) -> bool {
+        self.multi_line == Some(true)
+    }
+
+    /// Whether `.` matches a new line (off when absent).
+    fn dot_matches_new_line(&self) -> bool {
+        self.dot_matches_new_line == Some(true)
+    }
+
+    /// Whether greediness is swapped (off when absent).
+    fn swap_greed(&self) -> bool {
+        self.swap_greed == Some(true)
+    }
+
+    /// Whether Unicode mode is on. Unlike the other flags, it is on when
+    /// absent.
+    fn unicode(&self) -> bool {
+        self.unicode != Some(false)
+    }
+}
+
+/// Build the byte class holding the ranges of the given ASCII class.
+fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
+    let mut ranges = Vec::new();
+    for &(start, end) in ascii_class(kind) {
+        ranges.push(hir::ClassBytesRange::new(start as u8, end as u8));
+    }
+    hir::ClassBytes::new(ranges)
+}
+
+fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { // table of (start, end) character pairs for each ASCII class
+ use crate::ast::ClassAsciiKind::*;
+ match *kind {
+ Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
+ Alpha => &[('A', 'Z'), ('a', 'z')],
+ Ascii => &[('\x00', '\x7F')],
+ Blank => &[('\t', '\t'), (' ', ' ')],
+ Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], // 0x00-0x1F plus DEL (0x7F)
+ Digit => &[('0', '9')],
+ Graph => &[('!', '~')],
+ Lower => &[('a', 'z')],
+ Print => &[(' ', '~')], // same as Graph plus the space character
+ Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
+ Space => &[
+ ('\t', '\t'),
+ ('\n', '\n'),
+ ('\x0B', '\x0B'), // vertical tab
+ ('\x0C', '\x0C'), // form feed
+ ('\r', '\r'),
+ (' ', ' '),
+ ],
+ Upper => &[('A', 'Z')],
+ Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
+ Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::ast::parse::ParserBuilder;
+ use crate::ast::{self, Ast, Position, Span};
+ use crate::hir::{self, Hir, HirKind};
+ use crate::unicode::{self, ClassQuery};
+
+ use super::{ascii_class, TranslatorBuilder};
+
+ // Expected-error value used by the tests in place of a real hir::Error.
+ // It is compared against hir::Error by span and kind only, so a test
+ // never has to supply the pattern string that hir::Error also carries.
+ #[derive(Clone, Debug)]
+ struct TestError {
+ span: Span,
+ kind: hir::ErrorKind,
+ }
+
+ impl PartialEq<hir::Error> for TestError {
+ fn eq(&self, other: &hir::Error) -> bool {
+ self.span == other.span && self.kind == other.kind
+ }
+ }
+
+ // Mirror impl so that `assert_eq!(hir_error, test_error)` also compiles;
+ // equality is again decided by span and kind only.
+ impl PartialEq<TestError> for hir::Error {
+     fn eq(&self, other: &TestError) -> bool {
+         let same_span = self.span == other.span;
+         let same_kind = self.kind == other.kind;
+         same_span && same_kind
+     }
+ }
+
+ /// Parses `pattern` into an `Ast` with octal syntax enabled, panicking
+ /// if the pattern does not parse.
+ fn parse(pattern: &str) -> Ast {
+     let mut builder = ParserBuilder::new();
+     builder.octal(true);
+     let mut parser = builder.build();
+     parser.parse(pattern).unwrap()
+ }
+
+ /// Translates `pattern` to an `Hir` with invalid UTF-8 disallowed,
+ /// panicking if parsing or translation fails.
+ fn t(pattern: &str) -> Hir {
+     let mut translator =
+         TranslatorBuilder::new().allow_invalid_utf8(false).build();
+     let ast = parse(pattern);
+     translator.translate(pattern, &ast).unwrap()
+ }
+
+ /// Translates `pattern` with invalid UTF-8 disallowed and returns the
+ /// translation error. Panics if parsing fails or translation succeeds.
+ fn t_err(pattern: &str) -> hir::Error {
+     let mut translator =
+         TranslatorBuilder::new().allow_invalid_utf8(false).build();
+     let ast = parse(pattern);
+     translator.translate(pattern, &ast).unwrap_err()
+ }
+
+ /// Translates `pattern` to an `Hir` with invalid UTF-8 allowed,
+ /// panicking if parsing or translation fails.
+ fn t_bytes(pattern: &str) -> Hir {
+     let mut translator =
+         TranslatorBuilder::new().allow_invalid_utf8(true).build();
+     let ast = parse(pattern);
+     translator.translate(pattern, &ast).unwrap()
+ }
+
+ /// Builds the expected `Hir` for a literal string: the empty expression
+ /// for `""`, otherwise a concatenation of one Unicode literal per char.
+ fn hir_lit(s: &str) -> Hir {
+     if s.is_empty() {
+         return Hir::empty();
+     }
+     let mut pieces = Vec::new();
+     for ch in s.chars() {
+         pieces.push(Hir::literal(hir::Literal::Unicode(ch)));
+     }
+     Hir::concat(pieces)
+ }
+
+ /// Builds the expected `Hir` for a byte string: the empty expression for
+ /// no bytes, a single byte literal for one byte, and otherwise a
+ /// concatenation of one byte literal per byte.
+ fn hir_blit(s: &[u8]) -> Hir {
+     if s.is_empty() {
+         return Hir::empty();
+     }
+     if s.len() == 1 {
+         return Hir::literal(hir::Literal::Byte(s[0]));
+     }
+     let mut pieces = Vec::with_capacity(s.len());
+     for &byte in s {
+         pieces.push(Hir::literal(hir::Literal::Byte(byte)));
+     }
+     Hir::concat(pieces)
+ }
+
+ /// Wraps `expr` in an unnamed capturing group with capture index `i`.
+ fn hir_group(i: u32, expr: Hir) -> Hir {
+     let kind = hir::GroupKind::CaptureIndex(i);
+     let group = hir::Group { kind, hir: Box::new(expr) };
+     Hir::group(group)
+ }
+
+ /// Wraps `expr` in a named capturing group with the given name and
+ /// capture index `i`.
+ fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
+     let kind =
+         hir::GroupKind::CaptureName { name: String::from(name), index: i };
+     let group = hir::Group { kind, hir: Box::new(expr) };
+     Hir::group(group)
+ }
+
+ /// Wraps `expr` in a non-capturing group.
+ fn hir_group_nocap(expr: Hir) -> Hir {
+     let kind = hir::GroupKind::NonCapturing;
+     let group = hir::Group { kind, hir: Box::new(expr) };
+     Hir::group(group)
+ }
+
+ /// Builds a zero-or-one repetition of `expr` with the given greediness.
+ fn hir_quest(greedy: bool, expr: Hir) -> Hir {
+     let kind = hir::RepetitionKind::ZeroOrOne;
+     let repetition = hir::Repetition { kind, greedy, hir: Box::new(expr) };
+     Hir::repetition(repetition)
+ }
+
+ /// Builds a zero-or-more repetition of `expr` with the given greediness.
+ fn hir_star(greedy: bool, expr: Hir) -> Hir {
+     let kind = hir::RepetitionKind::ZeroOrMore;
+     let repetition = hir::Repetition { kind, greedy, hir: Box::new(expr) };
+     Hir::repetition(repetition)
+ }
+
+ /// Builds a one-or-more repetition of `expr` with the given greediness.
+ fn hir_plus(greedy: bool, expr: Hir) -> Hir {
+     let kind = hir::RepetitionKind::OneOrMore;
+     let repetition = hir::Repetition { kind, greedy, hir: Box::new(expr) };
+     Hir::repetition(repetition)
+ }
+
+ /// Builds a counted repetition of `expr` using the given range and
+ /// greediness.
+ fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
+     let kind = hir::RepetitionKind::Range(range);
+     let repetition = hir::Repetition { kind, greedy, hir: Box::new(expr) };
+     Hir::repetition(repetition)
+ }
+
+ /// Test shorthand that forwards `alts` to `Hir::alternation`.
+ fn hir_alt(alts: Vec<Hir>) -> Hir {
+ Hir::alternation(alts)
+ }
+
+ /// Test shorthand that forwards `exprs` to `Hir::concat`.
+ fn hir_cat(exprs: Vec<Hir>) -> Hir {
+ Hir::concat(exprs)
+ }
+
+ /// Resolves `query` through `unicode::class` and wraps the result as a
+ /// Unicode class expression. Panics if the lookup fails.
+ #[allow(dead_code)]
+ fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
+     let class = unicode::class(query).unwrap();
+     Hir::class(hir::Class::Unicode(class))
+ }
+
+ /// Wraps the class returned by `unicode::perl_word` as a Unicode class
+ /// expression. Panics if that class is unavailable.
+ #[allow(dead_code)]
+ fn hir_uclass_perl_word() -> Hir {
+     let class = unicode::perl_word().unwrap();
+     Hir::class(hir::Class::Unicode(class))
+ }
+
+ /// Builds a Unicode class expression from inclusive `(start, end)` char
+ /// ranges.
+ fn hir_uclass(ranges: &[(char, char)]) -> Hir {
+     let mut class_ranges: Vec<hir::ClassUnicodeRange> =
+         Vec::with_capacity(ranges.len());
+     for &(start, end) in ranges {
+         class_ranges.push(hir::ClassUnicodeRange::new(start, end));
+     }
+     let class = hir::ClassUnicode::new(class_ranges);
+     Hir::class(hir::Class::Unicode(class))
+ }
+
+ /// Builds a byte class expression from inclusive `(start, end)` byte
+ /// ranges.
+ fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
+     let mut class_ranges: Vec<hir::ClassBytesRange> =
+         Vec::with_capacity(ranges.len());
+     for &(start, end) in ranges {
+         class_ranges.push(hir::ClassBytesRange::new(start, end));
+     }
+     let class = hir::ClassBytes::new(class_ranges);
+     Hir::class(hir::Class::Bytes(class))
+ }
+
+ /// Builds a byte class expression from `(start, end)` char ranges.
+ /// Panics if any endpoint is above `0x7F`, since each endpoint is
+ /// narrowed to a single byte.
+ fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
+     let mut class_ranges: Vec<hir::ClassBytesRange> =
+         Vec::with_capacity(ranges.len());
+     for &(s, e) in ranges {
+         assert!(s as u32 <= 0x7F);
+         assert!(e as u32 <= 0x7F);
+         class_ranges.push(hir::ClassBytesRange::new(s as u8, e as u8));
+     }
+     let class = hir::ClassBytes::new(class_ranges);
+     Hir::class(hir::Class::Bytes(class))
+ }
+
+ /// Applies simple case folding to a class expression and returns the
+ /// folded class. Panics if `expr` is not a class.
+ fn hir_case_fold(expr: Hir) -> Hir {
+     if let HirKind::Class(mut cls) = expr.into_kind() {
+         cls.case_fold_simple();
+         return Hir::class(cls);
+     }
+     panic!("cannot case fold non-class Hir expr")
+ }
+
+ /// Negates a class expression and returns the negated class. Panics if
+ /// `expr` is not a class.
+ fn hir_negate(expr: Hir) -> Hir {
+     if let HirKind::Class(mut cls) = expr.into_kind() {
+         cls.negate();
+         return Hir::class(cls);
+     }
+     panic!("cannot negate non-class Hir expr")
+ }
+
+ /// Returns the union of two class expressions. Both must be classes of
+ /// the same flavor (both Unicode or both bytes); anything else panics.
+ #[allow(dead_code)]
+ fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
+     let merged = match (expr1.into_kind(), expr2.into_kind()) {
+         (
+             HirKind::Class(hir::Class::Unicode(mut lhs)),
+             HirKind::Class(hir::Class::Unicode(rhs)),
+         ) => {
+             lhs.union(&rhs);
+             hir::Class::Unicode(lhs)
+         }
+         (
+             HirKind::Class(hir::Class::Bytes(mut lhs)),
+             HirKind::Class(hir::Class::Bytes(rhs)),
+         ) => {
+             lhs.union(&rhs);
+             hir::Class::Bytes(lhs)
+         }
+         _ => panic!("cannot union non-class Hir exprs"),
+     };
+     Hir::class(merged)
+ }
+
+ /// Returns `expr1` minus `expr2` for two class expressions. Both must be
+ /// classes of the same flavor (both Unicode or both bytes); anything
+ /// else panics.
+ #[allow(dead_code)]
+ fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
+     let remaining = match (expr1.into_kind(), expr2.into_kind()) {
+         (
+             HirKind::Class(hir::Class::Unicode(mut lhs)),
+             HirKind::Class(hir::Class::Unicode(rhs)),
+         ) => {
+             lhs.difference(&rhs);
+             hir::Class::Unicode(lhs)
+         }
+         (
+             HirKind::Class(hir::Class::Bytes(mut lhs)),
+             HirKind::Class(hir::Class::Bytes(rhs)),
+         ) => {
+             lhs.difference(&rhs);
+             hir::Class::Bytes(lhs)
+         }
+         _ => panic!("cannot difference non-class Hir exprs"),
+     };
+     Hir::class(remaining)
+ }
+
+ /// Test shorthand that forwards `anchor` to `Hir::anchor`.
+ fn hir_anchor(anchor: hir::Anchor) -> Hir {
+ Hir::anchor(anchor)
+ }
+
+ /// Test shorthand that forwards `wb` to `Hir::word_boundary`.
+ fn hir_word(wb: hir::WordBoundary) -> Hir {
+ Hir::word_boundary(wb)
+ }
+
+ // Patterns made only of empty sub-expressions: bare, grouped and
+ // alternated.
+ #[test]
+ fn empty() {
+     let nothing = Hir::empty;
+     let cases: Vec<(&str, Hir)> = vec![
+         ("", nothing()),
+         ("(?i)", nothing()),
+         ("()", hir_group(1, nothing())),
+         ("(?:)", hir_group_nocap(nothing())),
+         ("(?P<wat>)", hir_group_name(1, "wat", nothing())),
+         ("|", hir_alt(vec![nothing(), nothing()])),
+         (
+             "()|()",
+             hir_alt(vec![
+                 hir_group(1, nothing()),
+                 hir_group(2, nothing()),
+             ]),
+         ),
+         ("(|b)", hir_group(1, hir_alt(vec![nothing(), hir_lit("b")]))),
+         ("(a|)", hir_group(1, hir_alt(vec![hir_lit("a"), nothing()]))),
+         (
+             "(a||c)",
+             hir_group(
+                 1,
+                 hir_alt(vec![hir_lit("a"), nothing(), hir_lit("c")]),
+             ),
+         ),
+         (
+             "(||)",
+             hir_group(1, hir_alt(vec![nothing(), nothing(), nothing()])),
+         ),
+     ];
+     for (pattern, expected) in cases {
+         assert_eq!(t(pattern), expected);
+     }
+ }
+
+ // Plain literals, in Unicode and byte mode, plus the errors raised when
+ // a literal is not permitted by the active flags.
+ #[test]
+ fn literal() {
+     // All expected spans in this test sit on line 1; each endpoint is
+     // given as (offset, column).
+     let fails_with = |pattern: &str,
+                       kind: hir::ErrorKind,
+                       from: (usize, usize),
+                       to: (usize, usize)| {
+         let span = Span::new(
+             Position::new(from.0, 1, from.1),
+             Position::new(to.0, 1, to.1),
+         );
+         assert_eq!(t_err(pattern), TestError { span, kind });
+     };
+
+     for &(pattern, text) in
+         [("a", "a"), ("(?-u)a", "a"), ("☃", "☃"), ("abcd", "abcd")].iter()
+     {
+         assert_eq!(t(pattern), hir_lit(text));
+     }
+
+     for &pattern in ["(?-u)a", "(?-u)\x61", r"(?-u)\x61"].iter() {
+         assert_eq!(t_bytes(pattern), hir_lit("a"));
+     }
+     assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
+
+     fails_with("(?-u)☃", hir::ErrorKind::UnicodeNotAllowed, (5, 6), (8, 7));
+     fails_with(r"(?-u)\xFF", hir::ErrorKind::InvalidUtf8, (5, 6), (9, 10));
+ }
+
+ // Literals under the `i` flag expand to a class holding every case
+ // variant: a Unicode class by default, a byte class under `-u`.
+ #[test]
+ fn literal_case_insensitive() {
+     // Byte class holding exactly the two given ASCII case variants.
+     let bfold = |upper: u8, lower: u8| {
+         hir_bclass(&[(upper, upper), (lower, lower)])
+     };
+
+     #[cfg(feature = "unicode-case")]
+     {
+         // Unicode class holding exactly the two given case variants.
+         let ufold = |upper: char, lower: char| {
+             hir_uclass(&[(upper, upper), (lower, lower)])
+         };
+
+         assert_eq!(t("(?i)a"), ufold('A', 'a'));
+         assert_eq!(t("(?i:a)"), hir_group_nocap(ufold('A', 'a')));
+         assert_eq!(
+             t("a(?i)a(?-i)a"),
+             hir_cat(vec![hir_lit("a"), ufold('A', 'a'), hir_lit("a")])
+         );
+         assert_eq!(
+             t("(?i)ab@c"),
+             hir_cat(vec![
+                 ufold('A', 'a'),
+                 ufold('B', 'b'),
+                 hir_lit("@"),
+                 ufold('C', 'c'),
+             ])
+         );
+         assert_eq!(
+             t("(?i)β"),
+             hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
+         );
+     }
+
+     assert_eq!(t("(?i-u)a"), bfold(b'A', b'a'));
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?-u)a(?i)a(?-i)a"),
+         hir_cat(vec![hir_lit("a"), bfold(b'A', b'a'), hir_lit("a")])
+     );
+     assert_eq!(
+         t("(?i-u)ab@c"),
+         hir_cat(vec![
+             bfold(b'A', b'a'),
+             bfold(b'B', b'b'),
+             hir_lit("@"),
+             bfold(b'C', b'c'),
+         ])
+     );
+
+     for &pattern in ["(?i-u)a", "(?i-u)\x61", r"(?i-u)\x61"].iter() {
+         assert_eq!(t_bytes(pattern), bfold(b'A', b'a'));
+     }
+     assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
+
+     let expected = TestError {
+         span: Span::new(Position::new(6, 1, 7), Position::new(8, 1, 8)),
+         kind: hir::ErrorKind::UnicodeNotAllowed,
+     };
+     assert_eq!(t_err("(?i-u)β"), expected);
+ }
+
+ // `.` with and without the `s` flag, in Unicode and byte mode.
+ #[test]
+ fn dot() {
+     let any_but_newline =
+         hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]);
+     assert_eq!(t("."), any_but_newline);
+
+     let any_char = hir_uclass(&[('\0', '\u{10FFFF}')]);
+     assert_eq!(t("(?s)."), any_char);
+
+     let any_byte_but_newline =
+         hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]);
+     assert_eq!(t_bytes("(?-u)."), any_byte_but_newline);
+
+     let any_byte = hir_bclass(&[(b'\0', b'\xFF')]);
+     assert_eq!(t_bytes("(?s-u)."), any_byte);
+
+     // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
+     // Each case lists the pattern and the offset where the `.` starts.
+     for &(pattern, dot_at) in [("(?-u).", 5), ("(?s-u).", 6)].iter() {
+         let expected = TestError {
+             span: Span::new(
+                 Position::new(dot_at, 1, dot_at + 1),
+                 Position::new(dot_at + 1, 1, dot_at + 2),
+             ),
+             kind: hir::ErrorKind::InvalidUtf8,
+         };
+         assert_eq!(t_err(pattern), expected);
+     }
+ }
+
+ // Anchors and word boundaries, including how the `m` and `u` flags
+ // change what they translate to.
+ #[test]
+ fn assertions() {
+     use crate::hir::Anchor::{EndLine, EndText, StartLine, StartText};
+
+     let anchors = [
+         ("^", StartText),
+         ("$", EndText),
+         (r"\A", StartText),
+         (r"\z", EndText),
+         ("(?m)^", StartLine),
+         ("(?m)$", EndLine),
+         (r"(?m)\A", StartText),
+         (r"(?m)\z", EndText),
+     ];
+     for (pattern, anchor) in anchors.iter() {
+         assert_eq!(t(pattern), hir_anchor(anchor.clone()));
+     }
+
+     let boundaries = [
+         (r"\b", hir::WordBoundary::Unicode),
+         (r"\B", hir::WordBoundary::UnicodeNegate),
+         (r"(?-u)\b", hir::WordBoundary::Ascii),
+     ];
+     for (pattern, boundary) in boundaries.iter() {
+         assert_eq!(t(pattern), hir_word(boundary.clone()));
+     }
+     assert_eq!(
+         t_bytes(r"(?-u)\B"),
+         hir_word(hir::WordBoundary::AsciiNegate)
+     );
+
+     let expected = TestError {
+         span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
+         kind: hir::ErrorKind::InvalidUtf8,
+     };
+     assert_eq!(t_err(r"(?-u)\B"), expected);
+ }
+
+ // Capturing, named and non-capturing groups, and how capture indices
+ // are assigned when the kinds are mixed.
+ #[test]
+ fn group() {
+     let cases: Vec<(&str, Hir)> = vec![
+         ("(a)", hir_group(1, hir_lit("a"))),
+         (
+             "(a)(b)",
+             hir_cat(vec![
+                 hir_group(1, hir_lit("a")),
+                 hir_group(2, hir_lit("b")),
+             ]),
+         ),
+         (
+             "(a)|(b)",
+             hir_alt(vec![
+                 hir_group(1, hir_lit("a")),
+                 hir_group(2, hir_lit("b")),
+             ]),
+         ),
+         ("(?P<foo>)", hir_group_name(1, "foo", Hir::empty())),
+         ("(?P<foo>a)", hir_group_name(1, "foo", hir_lit("a"))),
+         (
+             "(?P<foo>a)(?P<bar>b)",
+             hir_cat(vec![
+                 hir_group_name(1, "foo", hir_lit("a")),
+                 hir_group_name(2, "bar", hir_lit("b")),
+             ]),
+         ),
+         ("(?:)", hir_group_nocap(Hir::empty())),
+         ("(?:a)", hir_group_nocap(hir_lit("a"))),
+         (
+             "(?:a)(b)",
+             hir_cat(vec![
+                 hir_group_nocap(hir_lit("a")),
+                 hir_group(1, hir_lit("b")),
+             ]),
+         ),
+         (
+             "(a)(?:b)(c)",
+             hir_cat(vec![
+                 hir_group(1, hir_lit("a")),
+                 hir_group_nocap(hir_lit("b")),
+                 hir_group(2, hir_lit("c")),
+             ]),
+         ),
+         (
+             "(a)(?P<foo>b)(c)",
+             hir_cat(vec![
+                 hir_group(1, hir_lit("a")),
+                 hir_group_name(2, "foo", hir_lit("b")),
+                 hir_group(3, hir_lit("c")),
+             ]),
+         ),
+         ("()", hir_group(1, Hir::empty())),
+         ("((?i))", hir_group(1, Hir::empty())),
+         ("((?x))", hir_group(1, Hir::empty())),
+         ("(((?x)))", hir_group(1, hir_group(2, Hir::empty()))),
+     ];
+     for (pattern, expected) in cases {
+         assert_eq!(t(pattern), expected);
+     }
+ }
+
+ // Inline flags: how far they reach inside and outside groups, and how
+ // later flag settings override earlier ones.
+ #[test]
+ fn flags() {
+     // Case-folded `a` as a byte class (used under `-u`).
+     let bfold_a = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);
+     // Case-folded `a` as a Unicode class.
+     #[cfg(feature = "unicode-case")]
+     let ufold_a = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
+
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?i:a)a"),
+         hir_cat(vec![hir_group_nocap(ufold_a()), hir_lit("a")])
+     );
+     assert_eq!(
+         t("(?i-u:a)β"),
+         hir_cat(vec![hir_group_nocap(bfold_a()), hir_lit("β")])
+     );
+     assert_eq!(
+         t("(?:(?i-u)a)b"),
+         hir_cat(vec![hir_group_nocap(bfold_a()), hir_lit("b")])
+     );
+     assert_eq!(
+         t("((?i-u)a)b"),
+         hir_cat(vec![hir_group(1, bfold_a()), hir_lit("b")])
+     );
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?i)(?-i:a)a"),
+         hir_cat(vec![hir_group_nocap(hir_lit("a")), ufold_a()])
+     );
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?im)a^"),
+         hir_cat(vec![ufold_a(), hir_anchor(hir::Anchor::StartLine)])
+     );
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?im)a^(?i-m)a^"),
+         hir_cat(vec![
+             ufold_a(),
+             hir_anchor(hir::Anchor::StartLine),
+             ufold_a(),
+             hir_anchor(hir::Anchor::StartText),
+         ])
+     );
+
+     // Expected greediness of each of the four repetitions, in order.
+     let stars: Vec<Hir> = [false, true, true, false]
+         .iter()
+         .map(|&greedy| hir_star(greedy, hir_lit("a")))
+         .collect();
+     assert_eq!(t("(?U)a*a*?(?-U)a*a*?"), hir_cat(stars));
+
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?:a(?i)a)a"),
+         hir_cat(vec![
+             hir_group_nocap(hir_cat(vec![hir_lit("a"), ufold_a()])),
+             hir_lit("a"),
+         ])
+     );
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?i)(?:a(?-i)a)a"),
+         hir_cat(vec![
+             hir_group_nocap(hir_cat(vec![ufold_a(), hir_lit("a")])),
+             ufold_a(),
+         ])
+     );
+ }
+
+ // Escaped meta characters translate to the corresponding plain literals.
+ #[test]
+ fn escape() {
+     let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
+     let unescaped = r"\.+*?()|[]{}^$#";
+     assert_eq!(t(pattern), hir_lit(unescaped));
+ }
+
+ // Repetition operators, greedy and lazy, counted ranges, and how
+ // repetition binds relative to concatenation, groups and alternation.
+ #[test]
+ fn repetition() {
+     use crate::hir::RepetitionRange::{AtLeast, Bounded, Exactly};
+
+     let a = || hir_lit("a");
+
+     assert_eq!(t("a?"), hir_quest(true, a()));
+     assert_eq!(t("a*"), hir_star(true, a()));
+     assert_eq!(t("a+"), hir_plus(true, a()));
+     assert_eq!(t("a??"), hir_quest(false, a()));
+     assert_eq!(t("a*?"), hir_star(false, a()));
+     assert_eq!(t("a+?"), hir_plus(false, a()));
+
+     // (pattern, greedy, expected range)
+     let counted = [
+         ("a{1}", true, Exactly(1)),
+         ("a{1,}", true, AtLeast(1)),
+         ("a{1,2}", true, Bounded(1, 2)),
+         ("a{1}?", false, Exactly(1)),
+         ("a{1,}?", false, AtLeast(1)),
+         ("a{1,2}?", false, Bounded(1, 2)),
+     ];
+     for (pattern, greedy, range) in counted.iter() {
+         assert_eq!(t(pattern), hir_range(*greedy, range.clone(), a()));
+     }
+
+     let optional_b = || hir_quest(true, hir_lit("b"));
+     assert_eq!(t("ab?"), hir_cat(vec![a(), optional_b()]));
+     assert_eq!(
+         t("(ab)?"),
+         hir_quest(true, hir_group(1, hir_cat(vec![a(), hir_lit("b")])))
+     );
+     assert_eq!(t("a|b?"), hir_alt(vec![a(), optional_b()]));
+ }
+
+ // Concatenation and alternation, bare and nested inside groups.
+ #[test]
+ fn cat_alt() {
+     // One literal expression per listed string.
+     let lits = |parts: &[&str]| -> Vec<Hir> {
+         parts.iter().map(|part| hir_lit(part)).collect()
+     };
+
+     let cases: Vec<(&str, Hir)> = vec![
+         ("(ab)", hir_group(1, hir_cat(lits(&["a", "b"])))),
+         ("a|b", hir_alt(lits(&["a", "b"]))),
+         ("a|b|c", hir_alt(lits(&["a", "b", "c"]))),
+         ("ab|bc|cd", hir_alt(lits(&["ab", "bc", "cd"]))),
+         ("(a|b)", hir_group(1, hir_alt(lits(&["a", "b"])))),
+         ("(a|b|c)", hir_group(1, hir_alt(lits(&["a", "b", "c"])))),
+         ("(ab|bc|cd)", hir_group(1, hir_alt(lits(&["ab", "bc", "cd"])))),
+         (
+             "(ab|(bc|(cd)))",
+             hir_group(
+                 1,
+                 hir_alt(vec![
+                     hir_lit("ab"),
+                     hir_group(
+                         2,
+                         hir_alt(vec![
+                             hir_lit("bc"),
+                             hir_group(3, hir_lit("cd")),
+                         ]),
+                     ),
+                 ]),
+             ),
+         ),
+     ];
+     for (pattern, expected) in cases {
+         assert_eq!(t(pattern), expected);
+     }
+ }
+
+ // Every `[[:name:]]` ASCII class, plus negation, case folding and the
+ // byte-mode variants of `[[:lower:]]`.
+ #[test]
+ fn class_ascii() {
+     use crate::ast::ClassAsciiKind as Kind;
+
+     let named = [
+         ("[[:alnum:]]", Kind::Alnum),
+         ("[[:alpha:]]", Kind::Alpha),
+         ("[[:ascii:]]", Kind::Ascii),
+         ("[[:blank:]]", Kind::Blank),
+         ("[[:cntrl:]]", Kind::Cntrl),
+         ("[[:digit:]]", Kind::Digit),
+         ("[[:graph:]]", Kind::Graph),
+         ("[[:lower:]]", Kind::Lower),
+         ("[[:print:]]", Kind::Print),
+         ("[[:punct:]]", Kind::Punct),
+         ("[[:space:]]", Kind::Space),
+         ("[[:upper:]]", Kind::Upper),
+         ("[[:word:]]", Kind::Word),
+         ("[[:xdigit:]]", Kind::Xdigit),
+     ];
+     for (pattern, kind) in named.iter() {
+         assert_eq!(t(pattern), hir_uclass(ascii_class(kind)));
+     }
+
+     let lower = ascii_class(&Kind::Lower);
+     assert_eq!(t("[[:^lower:]]"), hir_negate(hir_uclass(lower)));
+     #[cfg(feature = "unicode-case")]
+     assert_eq!(
+         t("(?i)[[:lower:]]"),
+         hir_uclass(&[
+             ('A', 'Z'),
+             ('a', 'z'),
+             ('\u{17F}', '\u{17F}'),
+             ('\u{212A}', '\u{212A}'),
+         ])
+     );
+
+     assert_eq!(t("(?-u)[[:lower:]]"), hir_bclass_from_char(lower));
+     assert_eq!(
+         t("(?i-u)[[:lower:]]"),
+         hir_case_fold(hir_bclass_from_char(lower))
+     );
+
+     // (pattern, start (offset, column), end (offset, column)), line 1.
+     let invalid = [
+         ("(?-u)[[:^lower:]]", (6, 7), (16, 17)),
+         ("(?i-u)[[:^lower:]]", (7, 8), (17, 18)),
+     ];
+     for &(pattern, from, to) in invalid.iter() {
+         let expected = TestError {
+             span: Span::new(
+                 Position::new(from.0, 1, from.1),
+                 Position::new(to.0, 1, to.1),
+             ),
+             kind: hir::ErrorKind::InvalidUtf8,
+         };
+         assert_eq!(t_err(pattern), expected);
+     }
+ }
+
+ // Several ASCII classes inside one bracketed class, one of them negated.
+ #[test]
+ fn class_ascii_multiple() {
+     // See: https://github.com/rust-lang/regex/issues/680
+     let alnum = ascii_class(&ast::ClassAsciiKind::Alnum);
+
+     let expected_unicode =
+         hir_union(hir_uclass(alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]));
+     assert_eq!(t("[[:alnum:][:^ascii:]]"), expected_unicode);
+
+     let expected_bytes =
+         hir_union(hir_bclass_from_char(alnum), hir_bclass(&[(0x80, 0xFF)]));
+     assert_eq!(t_bytes("(?-u)[[:alnum:][:^ascii:]]"), expected_bytes);
+ }
+
+ // Perl classes (`\d`, `\s`, `\w` and their negations) in Unicode mode
+ // and in ASCII-only (`-u`) mode, with and without the `i` flag.
+ #[test]
+ #[cfg(feature = "unicode-perl")]
+ fn class_perl() {
+     use crate::ast::ClassAsciiKind as Kind;
+
+     let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
+     let space = || hir_uclass_query(ClassQuery::Binary("space"));
+     let word = || hir_uclass_perl_word();
+
+     // Unicode
+     assert_eq!(t(r"\d"), digit());
+     assert_eq!(t(r"\s"), space());
+     assert_eq!(t(r"\w"), word());
+     #[cfg(feature = "unicode-case")]
+     {
+         assert_eq!(t(r"(?i)\d"), digit());
+         assert_eq!(t(r"(?i)\s"), space());
+         assert_eq!(t(r"(?i)\w"), word());
+     }
+
+     // Unicode, negated
+     assert_eq!(t(r"\D"), hir_negate(digit()));
+     assert_eq!(t(r"\S"), hir_negate(space()));
+     assert_eq!(t(r"\W"), hir_negate(word()));
+     #[cfg(feature = "unicode-case")]
+     {
+         assert_eq!(t(r"(?i)\D"), hir_negate(digit()));
+         assert_eq!(t(r"(?i)\S"), hir_negate(space()));
+         assert_eq!(t(r"(?i)\W"), hir_negate(word()));
+     }
+
+     // ASCII only
+     let ascii_only = [
+         (r"(?-u)\d", Kind::Digit),
+         (r"(?-u)\s", Kind::Space),
+         (r"(?-u)\w", Kind::Word),
+         (r"(?i-u)\d", Kind::Digit),
+         (r"(?i-u)\s", Kind::Space),
+         (r"(?i-u)\w", Kind::Word),
+     ];
+     for (pattern, kind) in ascii_only.iter() {
+         assert_eq!(t(pattern), hir_bclass_from_char(ascii_class(kind)));
+     }
+
+     // ASCII only, negated
+     let ascii_only_negated = [
+         (r"(?-u)\D", Kind::Digit),
+         (r"(?-u)\S", Kind::Space),
+         (r"(?-u)\W", Kind::Word),
+         (r"(?i-u)\D", Kind::Digit),
+         (r"(?i-u)\S", Kind::Space),
+         (r"(?i-u)\W", Kind::Word),
+     ];
+     for (pattern, kind) in ascii_only_negated.iter() {
+         assert_eq!(
+             t(pattern),
+             hir_negate(hir_bclass_from_char(ascii_class(kind)))
+         );
+     }
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-perl"))]
+ fn class_perl_word_disabled() {
+ assert_eq!(
+ t_err(r"\w"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // With neither `unicode-perl` nor `unicode-bool` enabled, `\s` in
+ // Unicode mode is an error.
+ #[test]
+ #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
+ fn class_perl_space_disabled() {
+     let expected = TestError {
+         span: Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3)),
+         kind: hir::ErrorKind::UnicodePerlClassNotFound,
+     };
+     assert_eq!(t_err(r"\s"), expected);
+ }
+
+ // With neither `unicode-perl` nor `unicode-gencat` enabled, `\d` in
+ // Unicode mode is an error.
+ #[test]
+ #[cfg(all(
+     not(feature = "unicode-perl"),
+     not(feature = "unicode-gencat")
+ ))]
+ fn class_perl_digit_disabled() {
+     let expected = TestError {
+         span: Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3)),
+         kind: hir::ErrorKind::UnicodePerlClassNotFound,
+     };
+     assert_eq!(t_err(r"\d"), expected);
+ }
+
+ // General category classes: the accepted spellings of one category,
+ // negation, the any/assigned/ascii names, and the error cases.
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_gencat() {
+     let query =
+         |name: &'static str| hir_uclass_query(ClassQuery::Binary(name));
+
+     // Spellings that must all resolve to the same class as "Z".
+     let separator_spellings = [
+         r"\pZ",
+         r"\pz",
+         r"\p{Separator}",
+         r"\p{se PaRa ToR}",
+         r"\p{gc:Separator}",
+         r"\p{gc=Separator}",
+     ];
+     for &pattern in separator_spellings.iter() {
+         assert_eq!(t(pattern), query("Z"));
+     }
+     assert_eq!(t(r"\p{Other}"), query("Other"));
+     assert_eq!(t(r"\pC"), query("Other"));
+
+     for &pattern in [r"\PZ", r"\P{separator}", r"\P{gc!=separator}"].iter()
+     {
+         assert_eq!(t(pattern), hir_negate(query("Z")));
+     }
+
+     let named = [
+         (r"\p{any}", "Any"),
+         (r"\p{assigned}", "Assigned"),
+         (r"\p{ascii}", "ASCII"),
+         (r"\p{gc:any}", "Any"),
+         (r"\p{gc:assigned}", "Assigned"),
+         (r"\p{gc:ascii}", "ASCII"),
+     ];
+     for &(pattern, name) in named.iter() {
+         assert_eq!(t(pattern), query(name));
+     }
+
+     // All expected spans sit on line 1; each endpoint is given as
+     // (offset, column).
+     let fails_with = |pattern: &str,
+                       kind: hir::ErrorKind,
+                       from: (usize, usize),
+                       to: (usize, usize)| {
+         let span = Span::new(
+             Position::new(from.0, 1, from.1),
+             Position::new(to.0, 1, to.1),
+         );
+         assert_eq!(t_err(pattern), TestError { span, kind });
+     };
+     fails_with(
+         r"(?-u)\pZ",
+         hir::ErrorKind::UnicodeNotAllowed,
+         (5, 6),
+         (8, 9),
+     );
+     fails_with(
+         r"(?-u)\p{Separator}",
+         hir::ErrorKind::UnicodeNotAllowed,
+         (5, 6),
+         (18, 19),
+     );
+     fails_with(
+         r"\pE",
+         hir::ErrorKind::UnicodePropertyNotFound,
+         (0, 1),
+         (3, 4),
+     );
+     fails_with(
+         r"\p{Foo}",
+         hir::ErrorKind::UnicodePropertyNotFound,
+         (0, 1),
+         (7, 8),
+     );
+     fails_with(
+         r"\p{gc:Foo}",
+         hir::ErrorKind::UnicodePropertyValueNotFound,
+         (0, 1),
+         (10, 11),
+     );
+ }
+
+ // Without the `unicode-gencat` feature, general category names are
+ // reported as unknown properties.
+ #[test]
+ #[cfg(not(feature = "unicode-gencat"))]
+ fn class_unicode_gencat_disabled() {
+     // (pattern, end offset); every span starts at offset 0 on line 1.
+     for &(pattern, end) in [(r"\p{Separator}", 13), (r"\p{Any}", 7)].iter()
+     {
+         let expected = TestError {
+             span: Span::new(
+                 Position::new(0, 1, 1),
+                 Position::new(end, 1, end + 1),
+             ),
+             kind: hir::ErrorKind::UnicodePropertyNotFound,
+         };
+         assert_eq!(t_err(pattern), expected);
+     }
+ }
+
+ // Script classes, their case folded and negated forms, and unknown
+ // script values.
+ #[test]
+ #[cfg(feature = "unicode-script")]
+ fn class_unicode_script() {
+     let greek = || hir_uclass_query(ClassQuery::Binary("Greek"));
+
+     assert_eq!(t(r"\p{Greek}"), greek());
+     #[cfg(feature = "unicode-case")]
+     {
+         assert_eq!(t(r"(?i)\p{Greek}"), hir_case_fold(greek()));
+         assert_eq!(
+             t(r"(?i)\P{Greek}"),
+             hir_negate(hir_case_fold(greek()))
+         );
+     }
+
+     // (pattern, end offset); every span starts at offset 0 on line 1.
+     for &(pattern, end) in [(r"\p{sc:Foo}", 10), (r"\p{scx:Foo}", 11)].iter()
+     {
+         let expected = TestError {
+             span: Span::new(
+                 Position::new(0, 1, 1),
+                 Position::new(end, 1, end + 1),
+             ),
+             kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+         };
+         assert_eq!(t_err(pattern), expected);
+     }
+ }
+
+ // Without the `unicode-script` feature, script names are reported as
+ // unknown properties.
+ #[test]
+ #[cfg(not(feature = "unicode-script"))]
+ fn class_unicode_script_disabled() {
+     // (pattern, end offset); every span starts at offset 0 on line 1.
+     for &(pattern, end) in [(r"\p{Greek}", 9), (r"\p{scx:Greek}", 13)].iter()
+     {
+         let expected = TestError {
+             span: Span::new(
+                 Position::new(0, 1, 1),
+                 Position::new(end, 1, end + 1),
+             ),
+             kind: hir::ErrorKind::UnicodePropertyNotFound,
+         };
+         assert_eq!(t_err(pattern), expected);
+     }
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-age")]
+ fn class_unicode_age() {
+ assert_eq!(
+ t_err(r"\p{age:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // Negating `any` is rejected as an empty class.
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_any_empty() {
+     let expected = TestError {
+         span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)),
+         kind: hir::ErrorKind::EmptyClassNotAllowed,
+     };
+     assert_eq!(t_err(r"\P{any}"), expected);
+ }
+
+ // Without the `unicode-age` feature, `age` is reported as an unknown
+ // property.
+ #[test]
+ #[cfg(not(feature = "unicode-age"))]
+ fn class_unicode_age_disabled() {
+     let expected = TestError {
+         span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)),
+         kind: hir::ErrorKind::UnicodePropertyNotFound,
+     };
+     assert_eq!(t_err(r"\p{age:3.0}"), expected);
+ }
+
+ // Bracketed classes: single items and ranges, nested Perl and Unicode
+ // classes, negation, case folding, byte mode, escaped punctuation and
+ // the error cases.
+ #[test]
+ fn class_bracketed() {
+     // Patterns that translate to one Unicode range, as
+     // (pattern, range start, range end).
+     let single_range = [
+         ("[a]", 'a', 'a'),
+         ("[a-z]", 'a', 'z'),
+         ("[a-fd-h]", 'a', 'h'),
+         ("[a-fg-m]", 'a', 'm'),
+         (r"[\x00]", '\0', '\0'),
+         (r"[\n]", '\n', '\n'),
+         ("[\n]", '\n', '\n'),
+     ];
+     for &(pattern, start, end) in single_range.iter() {
+         assert_eq!(t(pattern), hir_uclass(&[(start, end)]));
+     }
+     assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
+
+     #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+     {
+         for &pattern in [r"[\d]", r"[^\D]"].iter() {
+             assert_eq!(
+                 t(pattern),
+                 hir_uclass_query(ClassQuery::Binary("digit"))
+             );
+         }
+     }
+     #[cfg(feature = "unicode-gencat")]
+     {
+         let separator_patterns = [
+             r"[\pZ]",
+             r"[\p{separator}]",
+             r"[^\PZ]",
+             r"[^\P{separator}]",
+         ];
+         for &pattern in separator_patterns.iter() {
+             assert_eq!(
+                 t(pattern),
+                 hir_uclass_query(ClassQuery::Binary("separator"))
+             );
+         }
+     }
+     #[cfg(all(
+         feature = "unicode-case",
+         any(feature = "unicode-perl", feature = "unicode-gencat")
+     ))]
+     assert_eq!(
+         t(r"(?i)[^\D]"),
+         hir_uclass_query(ClassQuery::Binary("digit"))
+     );
+     #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+     assert_eq!(
+         t(r"(?i)[^\P{greek}]"),
+         hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
+     );
+
+     assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
+     assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
+     assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
+
+     #[cfg(feature = "unicode-case")]
+     {
+         assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
+         assert_eq!(
+             t("(?i)[k]"),
+             hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')])
+         );
+         assert_eq!(
+             t("(?i)[β]"),
+             hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
+         );
+     }
+     assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k')]));
+
+     assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
+     assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
+     assert_eq!(
+         t_bytes("(?-u)[^a]"),
+         hir_negate(hir_bclass(&[(b'a', b'a')]))
+     );
+     #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+     assert_eq!(
+         t(r"[^\d]"),
+         hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+     );
+     #[cfg(feature = "unicode-gencat")]
+     {
+         for &pattern in [r"[^\pZ]", r"[^\p{separator}]"].iter() {
+             assert_eq!(
+                 t(pattern),
+                 hir_negate(hir_uclass_query(ClassQuery::Binary(
+                     "separator"
+                 )))
+             );
+         }
+     }
+     #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+     {
+         for &pattern in [r"(?i)[^\p{greek}]", r"(?i)[\P{greek}]"].iter() {
+             assert_eq!(
+                 t(pattern),
+                 hir_negate(hir_case_fold(hir_uclass_query(
+                     ClassQuery::Binary("greek")
+                 )))
+             );
+         }
+     }
+
+     // Test some weird cases.
+     let weird = [
+         (r"[\[]", '[', '['),
+         (r"[&]", '&', '&'),
+         (r"[\&]", '&', '&'),
+         (r"[\&\&]", '&', '&'),
+         (r"[\x00-&]", '\0', '&'),
+         (r"[&-\xFF]", '&', '\u{FF}'),
+         (r"[~]", '~', '~'),
+         (r"[\~]", '~', '~'),
+         (r"[\~\~]", '~', '~'),
+         (r"[\x00-~]", '\0', '~'),
+         (r"[~-\xFF]", '~', '\u{FF}'),
+         (r"[-]", '-', '-'),
+         (r"[\-]", '-', '-'),
+         (r"[\-\-]", '-', '-'),
+         (r"[\x00-\-]", '\0', '-'),
+         (r"[\--\xFF]", '-', '\u{FF}'),
+     ];
+     for &(pattern, start, end) in weird.iter() {
+         assert_eq!(t(pattern), hir_uclass(&[(start, end)]));
+     }
+
+     // All expected spans sit on line 1; each endpoint is given as
+     // (offset, column).
+     let fails_with = |pattern: &str,
+                       kind: hir::ErrorKind,
+                       from: (usize, usize),
+                       to: (usize, usize)| {
+         let span = Span::new(
+             Position::new(from.0, 1, from.1),
+             Position::new(to.0, 1, to.1),
+         );
+         assert_eq!(t_err(pattern), TestError { span, kind });
+     };
+     fails_with("(?-u)[^a]", hir::ErrorKind::InvalidUtf8, (5, 6), (9, 10));
+     #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+     {
+         fails_with(
+             r"[^\s\S]",
+             hir::ErrorKind::EmptyClassNotAllowed,
+             (0, 1),
+             (7, 8),
+         );
+         fails_with(
+             r"(?-u)[^\s\S]",
+             hir::ErrorKind::EmptyClassNotAllowed,
+             (5, 6),
+             (12, 13),
+         );
+     }
+ }
+
+ // Checks translation of bracketed classes whose items are unioned together:
+ // literal ranges, Unicode property classes, nested classes, and the same
+ // unions when the class is case-insensitive and/or negated.
+ #[test]
+ fn class_bracketed_union() {
+ // Two literal ranges; the expected class lists them in codepoint order.
+ assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
+ // Literals on either side of a property class are merged into one union.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[a\pZb]"),
+ hir_union(
+ hir_uclass(&[('a', 'b')]),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ // Two property classes in one bracketed class.
+ #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"[\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ // Three property classes, one of them a `name:value` query.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ );
+ // Nested bracketed classes are expected to equal the union of all of
+ // their members.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("cyrillic")),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ )
+ );
+
+ // Case-insensitive: the expected value case-folds the whole union.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ // Negated: the expected value negates the whole union.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ // Negated and case-insensitive: the expected value applies case folding
+ // first and negation second.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )))
+ );
+ }
+
+ // Checks translation of a bracketed class that contains a negated nested
+ // class, including the cases where the combined class ends up empty.
+ #[test]
+ fn class_bracketed_nested() {
+ // The outer literals are already covered by `[^c]`, so the result is
+ // expected to equal `[^c]` itself.
+ assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
+ // `a-c` together with `[^c]` covers everything: the negation of the
+ // empty class.
+ assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));
+
+ // Negating the outer class leaves only `c`.
+ assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
+ assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
+
+ // Same as above, but case-insensitive.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a[^c]]"),
+ hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a-b[^c]]"),
+ hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ );
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[^a-b[^c]]"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')])
+ );
+
+ // Negating a class that covers everything yields an empty class, which
+ // is reported as an error whose span covers the whole bracketed class.
+ assert_eq!(
+ t_err(r"[^a-c[^c]]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ // With the `(?i)` prefix the reported span starts after the flags.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t_err(r"(?i)[^a-c[^c]]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(4, 1, 5),
+ Position::new(14, 1, 15)
+ ),
+ }
+ );
+ }
+
+ // Checks translation of the `&&` (intersection) operator inside bracketed
+ // classes for Unicode classes, byte classes, and case-insensitive variants
+ // of both, plus a few syntactic corner cases.
+ #[test]
+ fn class_bracketed_intersect() {
+ // Unicode classes.
+ assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
+ assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
+ assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
+ assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
+ assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
+ assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+
+ // Byte classes (Unicode mode disabled).
+ assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
+ assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
+ assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
+
+ // Case-insensitive Unicode classes: expected value is the case-folded
+ // intersection.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[abc&&b-c]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[abc&&[b-c]]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[[abc]&&[b-c]]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[a-z&&b-y&&c-x]"),
+ hir_case_fold(hir_uclass(&[('c', 'x')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[c-da-b&&a-d]"),
+ hir_case_fold(hir_uclass(&[('a', 'd')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[a-d&&c-da-b]"),
+ hir_case_fold(hir_uclass(&[('a', 'd')]))
+ );
+
+ // Case-insensitive byte classes; these are not gated on `unicode-case`.
+ assert_eq!(
+ t("(?i-u)[abc&&b-c]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[abc&&[b-c]]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[[abc]&&[b-c]]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[a-z&&b-y&&c-x]"),
+ hir_case_fold(hir_bclass(&[(b'c', b'x')]))
+ );
+ assert_eq!(
+ t("(?i-u)[c-da-b&&a-d]"),
+ hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+ );
+ assert_eq!(
+ t("(?i-u)[a-d&&c-da-b]"),
+ hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+ );
+
+ // In `[a^]`, `^` does not need to be escaped, so it makes sense that
+ // `^` is also allowed to be unescaped after `&&`.
+ assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
+ // `]` needs to be escaped after `&&` since it's not at start of class.
+ assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
+ assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
+ // An escaped or odd trailing `&` is a literal `&`, not part of `&&`.
+ assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
+ // Test precedence.
+ // The expected value excludes `z`, i.e. `[^c-g]z` is unioned before
+ // being intersected with `a-w`.
+ assert_eq!(
+ t(r"[a-w&&[^c-g]z]"),
+ hir_uclass(&[('a', 'b'), ('h', 'w')])
+ );
+ }
+
+ // Checks how negation interacts with `&&` inside bracketed classes, for both
+ // Unicode classes and byte classes.
+ #[test]
+ fn class_bracketed_intersect_negate() {
+ // A leading `^` negates the whole intersection, not just its first
+ // operand: the expected value is the negation of `\w&&\d` (= digits).
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^\w&&\d]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^[\w&&\d]]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ // Double negation cancels out.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^[^\w&&\d]]"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ // Intersection of two negated classes: not-word AND not-digit is
+ // expected to equal not-word.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
+
+ // The same cases with Unicode mode disabled, translated via `t_bytes`.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t_bytes(r"(?-u)[^\w&&\d]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[a-z&&a-c]]"),
+ hir_negate(hir_bclass(&[(b'a', b'c')]))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[\w&&\d]]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[^\w&&\d]]"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ }
+
+ // Checks translation of the `--` (difference) operator inside bracketed
+ // classes.
+ #[test]
+ fn class_bracketed_difference() {
+ // Unicode: letters minus the ASCII range.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\pL--[:ascii:]]"),
+ hir_difference(
+ hir_uclass_query(ClassQuery::Binary("letter")),
+ hir_uclass(&[('\0', '\x7F')])
+ )
+ );
+
+ // Bytes: ASCII alphabetic minus ASCII lowercase leaves `A-Z`.
+ assert_eq!(
+ t(r"(?-u)[[:alpha:]--[:lower:]]"),
+ hir_bclass(&[(b'A', b'Z')])
+ );
+ }
+
+ // Checks translation of the `~~` (symmetric difference) operator inside
+ // bracketed classes.
+ #[test]
+ fn class_bracketed_symmetric_difference() {
+ // Codepoints that are in exactly one of `sc:Greek` and `scx:Greek`.
+ #[cfg(feature = "unicode-script")]
+ assert_eq!(
+ t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
+ hir_uclass(&[
+ ('\u{0342}', '\u{0342}'),
+ ('\u{0345}', '\u{0345}'),
+ ('\u{1DC0}', '\u{1DC1}'),
+ ])
+ );
+ // `a-g` and `c-j` overlap on `c-g`; only the non-overlapping ends remain.
+ assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
+
+ // Same ranges as a byte class.
+ assert_eq!(
+ t(r"(?-u)[a-g~~c-j]"),
+ hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
+ );
+ }
+
+ // Checks that with the `x` flag set, whitespace and `#` comments are ignored
+ // in the middle of escapes, property classes and counted repetitions.
+ // NOTE: several patterns are multi-line raw strings, so the line breaks
+ // inside them are part of the pattern under test.
+ #[test]
+ fn ignore_whitespace() {
+ // The space between `\12` and `3` is dropped; expected literal is "\n3".
+ assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
+ // Braced hex escape with whitespace, then with comments, around it.
+ assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
+ assert_eq!(
+ t(r"(?x)\x # comment
+{ # comment
+ 53 # comment
+} #comment"),
+ hir_lit("S")
+ );
+
+ // Unbraced hex escape: whitespace/comments before and between the digits.
+ assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
+ assert_eq!(
+ t(r"(?x)\x # comment
+ 53 # comment"),
+ hir_lit("S")
+ );
+ assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
+
+ // Unicode property class spread over several commented lines.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"(?x)\p # comment
+{ # comment
+ Separator # comment
+} # comment"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+
+ // Counted repetition `a{5,10}` spread over several commented lines.
+ assert_eq!(
+ t(r"(?x)a # comment
+{ # comment
+ 5 # comment
+ , # comment
+ 10 # comment
+} # comment"),
+ hir_range(
+ true,
+ hir::RepetitionRange::Bounded(5, 10),
+ hir_lit("a")
+ )
+ );
+
+ // An escaped space is kept as a literal space; the trailing comment is
+ // dropped.
+ assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
+ }
+
+    /// Exercises `Hir::is_always_utf8` on patterns translated with `t_bytes`.
+    #[test]
+    fn analysis_is_always_utf8() {
+        // Patterns for which `is_always_utf8` must report `true`.
+        let expect_true: &[&str] = &[
+            r"a",
+            r"ab",
+            r"(?-u)a",
+            r"(?-u)ab",
+            r"\xFF",
+            r"\xFF\xFF",
+            r"[^a]",
+            r"[^a][^a]",
+            r"\b",
+            r"\B",
+            r"(?-u)\b",
+        ];
+        for &pattern in expect_true {
+            assert!(t_bytes(pattern).is_always_utf8(), "pattern: {:?}", pattern);
+        }
+
+        // Patterns for which `is_always_utf8` must report `false`.
+        let expect_false: &[&str] = &[
+            r"(?-u)\xFF",
+            r"(?-u)\xFF\xFF",
+            r"(?-u)[^a]",
+            r"(?-u)[^a][^a]",
+            r"(?-u)\B",
+        ];
+        for &pattern in expect_false {
+            assert!(!t_bytes(pattern).is_always_utf8(), "pattern: {:?}", pattern);
+        }
+    }
+
+    /// Exercises `Hir::is_all_assertions` on anchors, word boundaries and
+    /// combinations of them.
+    #[test]
+    fn analysis_is_all_assertions() {
+        // Patterns for which `is_all_assertions` must report `true`.
+        let expect_true: &[&str] = &[
+            r"\b",
+            r"\B",
+            r"^",
+            r"$",
+            r"\A",
+            r"\z",
+            r"$^\z\A\b\B",
+            r"$|^|\z|\A|\b|\B",
+            r"^$|$^",
+            r"((\b)+())*^",
+        ];
+        for &pattern in expect_true {
+            assert!(t(pattern).is_all_assertions(), "pattern: {:?}", pattern);
+        }
+
+        // A literal next to an anchor must make `is_all_assertions` false.
+        let expect_false: &[&str] = &[r"^a"];
+        for &pattern in expect_false {
+            assert!(!t(pattern).is_all_assertions(), "pattern: {:?}", pattern);
+        }
+    }
+
+    /// Exercises the four anchoring predicates on `Hir`
+    /// (`is_anchored_start`, `is_anchored_end`, `is_line_anchored_start`,
+    /// `is_line_anchored_end`).
+    ///
+    /// Each positive group checks one pattern shape against all four
+    /// predicates, in the order start, end, line-start, line-end.
+    #[test]
+    fn analysis_is_anchored() {
+        // Positive examples.
+        assert!(t(r"^").is_anchored_start());
+        assert!(t(r"$").is_anchored_end());
+        assert!(t(r"^").is_line_anchored_start());
+        assert!(t(r"$").is_line_anchored_end());
+
+        assert!(t(r"^^").is_anchored_start());
+        assert!(t(r"$$").is_anchored_end());
+        assert!(t(r"^^").is_line_anchored_start());
+        assert!(t(r"$$").is_line_anchored_end());
+
+        assert!(t(r"^$").is_anchored_start());
+        assert!(t(r"^$").is_anchored_end());
+        assert!(t(r"^$").is_line_anchored_start());
+        assert!(t(r"^$").is_line_anchored_end());
+
+        assert!(t(r"^foo").is_anchored_start());
+        assert!(t(r"foo$").is_anchored_end());
+        assert!(t(r"^foo").is_line_anchored_start());
+        assert!(t(r"foo$").is_line_anchored_end());
+
+        // Alternations where every branch is anchored.
+        assert!(t(r"^foo|^bar").is_anchored_start());
+        assert!(t(r"foo$|bar$").is_anchored_end());
+        assert!(t(r"^foo|^bar").is_line_anchored_start());
+        assert!(t(r"foo$|bar$").is_line_anchored_end());
+
+        assert!(t(r"^(foo|bar)").is_anchored_start());
+        assert!(t(r"(foo|bar)$").is_anchored_end());
+        assert!(t(r"^(foo|bar)").is_line_anchored_start());
+        assert!(t(r"(foo|bar)$").is_line_anchored_end());
+
+        // Repetitions that must match the anchor at least once.
+        assert!(t(r"^+").is_anchored_start());
+        assert!(t(r"$+").is_anchored_end());
+        assert!(t(r"^+").is_line_anchored_start());
+        assert!(t(r"$+").is_line_anchored_end());
+        assert!(t(r"^++").is_anchored_start());
+        assert!(t(r"$++").is_anchored_end());
+        assert!(t(r"^++").is_line_anchored_start());
+        assert!(t(r"$++").is_line_anchored_end());
+        assert!(t(r"(^)+").is_anchored_start());
+        assert!(t(r"($)+").is_anchored_end());
+        assert!(t(r"(^)+").is_line_anchored_start());
+        assert!(t(r"($)+").is_line_anchored_end());
+
+        // `$^` on its own. This group previously asserted
+        // `is_anchored_start` and `is_line_anchored_end` twice each and never
+        // checked the other two predicates; all four are now covered, matching
+        // the `$^|^$` group directly below.
+        assert!(t(r"$^").is_anchored_start());
+        assert!(t(r"$^").is_anchored_end());
+        assert!(t(r"$^").is_line_anchored_start());
+        assert!(t(r"$^").is_line_anchored_end());
+        assert!(t(r"$^|^$").is_anchored_start());
+        assert!(t(r"$^|^$").is_anchored_end());
+        assert!(t(r"$^|^$").is_line_anchored_start());
+        assert!(t(r"$^|^$").is_line_anchored_end());
+
+        // Anchors next to other zero-width assertions.
+        assert!(t(r"\b^").is_anchored_start());
+        assert!(t(r"$\b").is_anchored_end());
+        assert!(t(r"\b^").is_line_anchored_start());
+        assert!(t(r"$\b").is_line_anchored_end());
+        assert!(t(r"^(?m:^)").is_anchored_start());
+        assert!(t(r"(?m:$)$").is_anchored_end());
+        assert!(t(r"^(?m:^)").is_line_anchored_start());
+        assert!(t(r"(?m:$)$").is_line_anchored_end());
+        assert!(t(r"(?m:^)^").is_anchored_start());
+        assert!(t(r"$(?m:$)").is_anchored_end());
+        assert!(t(r"(?m:^)^").is_line_anchored_start());
+        assert!(t(r"$(?m:$)").is_line_anchored_end());
+
+        // Negative examples.
+        // Multi-line anchors alone are not text anchors.
+        assert!(!t(r"(?m)^").is_anchored_start());
+        assert!(!t(r"(?m)$").is_anchored_end());
+        assert!(!t(r"(?m:^$)|$^").is_anchored_start());
+        assert!(!t(r"(?m:^$)|$^").is_anchored_end());
+        assert!(!t(r"$^|(?m:^$)").is_anchored_start());
+        assert!(!t(r"$^|(?m:^$)").is_anchored_end());
+
+        // A literal between the anchor and the relevant edge of the pattern.
+        assert!(!t(r"a^").is_anchored_start());
+        assert!(!t(r"$a").is_anchored_start());
+        assert!(!t(r"a^").is_line_anchored_start());
+        assert!(!t(r"$a").is_line_anchored_start());
+
+        assert!(!t(r"a^").is_anchored_end());
+        assert!(!t(r"$a").is_anchored_end());
+        assert!(!t(r"a^").is_line_anchored_end());
+        assert!(!t(r"$a").is_line_anchored_end());
+
+        // Alternations where only one branch is anchored.
+        assert!(!t(r"^foo|bar").is_anchored_start());
+        assert!(!t(r"foo|bar$").is_anchored_end());
+        assert!(!t(r"^foo|bar").is_line_anchored_start());
+        assert!(!t(r"foo|bar$").is_line_anchored_end());
+
+        // Repetitions that may match the anchor zero times.
+        assert!(!t(r"^*").is_anchored_start());
+        assert!(!t(r"$*").is_anchored_end());
+        assert!(!t(r"^*").is_line_anchored_start());
+        assert!(!t(r"$*").is_line_anchored_end());
+        assert!(!t(r"^*+").is_anchored_start());
+        assert!(!t(r"$*+").is_anchored_end());
+        assert!(!t(r"^*+").is_line_anchored_start());
+        assert!(!t(r"$*+").is_line_anchored_end());
+        assert!(!t(r"^+*").is_anchored_start());
+        assert!(!t(r"$+*").is_anchored_end());
+        assert!(!t(r"^+*").is_line_anchored_start());
+        assert!(!t(r"$+*").is_line_anchored_end());
+        assert!(!t(r"(^)*").is_anchored_start());
+        assert!(!t(r"($)*").is_anchored_end());
+        assert!(!t(r"(^)*").is_line_anchored_start());
+        assert!(!t(r"($)*").is_line_anchored_end());
+    }
+
+    /// Exercises `is_line_anchored_start` / `is_line_anchored_end` on patterns
+    /// that use multi-line (`m` flag) anchors.
+    #[test]
+    fn analysis_is_line_anchored() {
+        // Patterns that must be reported as line-anchored at the start.
+        let anchored_at_start: &[&str] = &[
+            r"(?m)^(foo|bar)",
+            r"(?m)^foo|^bar",
+            r"(?m)^",
+            r"(?m:^$)|$^",
+            r"$^|(?m:^$)",
+        ];
+        for &pattern in anchored_at_start {
+            assert!(t(pattern).is_line_anchored_start(), "pattern: {:?}", pattern);
+        }
+
+        // Patterns that must be reported as line-anchored at the end.
+        let anchored_at_end: &[&str] = &[
+            r"(?m)(foo|bar)$",
+            r"(?m)foo$|bar$",
+            r"(?m)$",
+            r"(?m:^$)|$^",
+            r"$^|(?m:^$)",
+        ];
+        for &pattern in anchored_at_end {
+            assert!(t(pattern).is_line_anchored_end(), "pattern: {:?}", pattern);
+        }
+    }
+
+    /// Exercises `is_any_anchored_start` / `is_any_anchored_end`.
+    #[test]
+    fn analysis_is_any_anchored() {
+        // Each row: (pattern, expected start result, expected end result).
+        // `None` means the original test makes no claim for that predicate.
+        let cases: &[(&str, Option<bool>, Option<bool>)] = &[
+            // Positive examples.
+            (r"^", Some(true), None),
+            (r"$", None, Some(true)),
+            (r"\A", Some(true), None),
+            (r"\z", None, Some(true)),
+            // Negative examples.
+            (r"(?m)^", Some(false), None),
+            (r"(?m)$", None, Some(false)),
+            (r"$", Some(false), None),
+            (r"^", None, Some(false)),
+        ];
+        for &(pattern, start, end) in cases {
+            if let Some(expected) = start {
+                assert!(
+                    t(pattern).is_any_anchored_start() == expected,
+                    "start, pattern: {:?}",
+                    pattern
+                );
+            }
+            if let Some(expected) = end {
+                assert!(
+                    t(pattern).is_any_anchored_end() == expected,
+                    "end, pattern: {:?}",
+                    pattern
+                );
+            }
+        }
+    }
+
+    /// Exercises `Hir::is_match_empty` on patterns that can and cannot match
+    /// the empty string.
+    #[test]
+    fn analysis_is_match_empty() {
+        // Patterns for which `is_match_empty` must report `true`.
+        let expect_true: &[&str] = &[
+            r"",
+            r"()",
+            r"()*",
+            r"()+",
+            r"()?",
+            r"a*",
+            r"a?",
+            r"a{0}",
+            r"a{0,}",
+            r"a{0,1}",
+            r"a{0,10}",
+            r"a*|b",
+            r"b|a*",
+            r"a|",
+            r"|a",
+            r"a||b",
+            r"a*a?(abcd)*",
+            r"^",
+            r"$",
+            r"(?m)^",
+            r"(?m)$",
+            r"\A",
+            r"\z",
+            r"\B",
+            r"\b",
+            r"(?-u)\b",
+        ];
+        for &pattern in expect_true {
+            assert!(t(pattern).is_match_empty(), "pattern: {:?}", pattern);
+        }
+        // Needs Unicode general-category tables, so it is feature-gated.
+        #[cfg(feature = "unicode-gencat")]
+        assert!(t(r"\pL*").is_match_empty());
+        // The original test translates this one with `t_bytes` rather than `t`
+        // (presumably because `(?-u)\B` is not guaranteed to match valid
+        // UTF-8 — confirm against the helper definitions).
+        assert!(t_bytes(r"(?-u)\B").is_match_empty());
+
+        // Patterns for which `is_match_empty` must report `false`.
+        let expect_false: &[&str] = &[
+            r"a+",
+            r"a{1}",
+            r"a{1,}",
+            r"a{1,2}",
+            r"a{1,10}",
+            r"b|a",
+            r"a*a+(abcd)*",
+        ];
+        for &pattern in expect_false {
+            assert!(!t(pattern).is_match_empty(), "pattern: {:?}", pattern);
+        }
+    }
+
+    /// Exercises `Hir::is_literal`.
+    #[test]
+    fn analysis_is_literal() {
+        // Patterns for which `is_literal` must report `true`.
+        let expect_true: &[&str] = &[r"a", r"ab", r"abc", r"(?m)abc"];
+        for &pattern in expect_true {
+            assert!(t(pattern).is_literal(), "pattern: {:?}", pattern);
+        }
+
+        // Patterns for which `is_literal` must report `false`: the empty
+        // pattern, anchors, alternations, groups, repetitions and classes.
+        let expect_false: &[&str] = &[
+            r"",
+            r"^",
+            r"a|b",
+            r"(a)",
+            r"a+",
+            r"foo(a)",
+            r"(a)foo",
+            r"[a]",
+        ];
+        for &pattern in expect_false {
+            assert!(!t(pattern).is_literal(), "pattern: {:?}", pattern);
+        }
+    }
+
+    /// Exercises `Hir::is_alternation_literal`.
+    #[test]
+    fn analysis_is_alternation_literal() {
+        // Plain literals and alternations made only of literals must report
+        // `true`.
+        let expect_true: &[&str] = &[
+            r"a",
+            r"ab",
+            r"abc",
+            r"(?m)abc",
+            r"a|b",
+            r"a|b|c",
+            r"foo|bar",
+            r"foo|bar|baz",
+        ];
+        for &pattern in expect_true {
+            assert!(t(pattern).is_alternation_literal(), "pattern: {:?}", pattern);
+        }
+
+        // Anything containing a group, class, repetition or anchor, or the
+        // empty pattern, must report `false`.
+        let expect_false: &[&str] = &[
+            r"",
+            r"^",
+            r"(a)",
+            r"a+",
+            r"foo(a)",
+            r"(a)foo",
+            r"[a]",
+            r"[a]|b",
+            r"a|[b]",
+            r"(a)|b",
+            r"a|(b)",
+        ];
+        for &pattern in expect_false {
+            assert!(!t(pattern).is_alternation_literal(), "pattern: {:?}", pattern);
+        }
+    }
+}
diff --git a/third_party/rust/regex-syntax/src/hir/visitor.rs b/third_party/rust/regex-syntax/src/hir/visitor.rs
new file mode 100644
index 0000000000..4f5a70909c
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/visitor.rs
@@ -0,0 +1,203 @@
+use crate::hir::{self, Hir, HirKind};
+
+/// A trait for visiting the high-level IR (HIR) in depth first order.
+///
+/// The principal aim of this trait is to enable callers to perform case
+/// analysis on a high-level intermediate representation of a regular
+/// expression without necessarily using recursion. In particular, this permits
+/// callers to do case analysis with constant stack usage, which can be
+/// important since the size of an HIR may be proportional to end user input.
+///
+/// Typical usage of this trait involves providing an implementation and then
+/// running it using the [`visit`](fn.visit.html) function.
+///
+/// Only `finish` is required; every other method has a default
+/// implementation that does nothing and reports success.
+pub trait Visitor {
+ /// The result of visiting an HIR.
+ type Output;
+ /// An error that visiting an HIR might return.
+ type Err;
+
+ /// All implementors of `Visitor` must provide a `finish` method, which
+ /// yields the result of visiting the HIR or an error.
+ ///
+ /// This consumes the visitor.
+ fn finish(self) -> Result<Self::Output, Self::Err>;
+
+ /// This method is called before beginning traversal of the HIR.
+ ///
+ /// The default implementation does nothing.
+ fn start(&mut self) {}
+
+ /// This method is called on an `Hir` before descending into child `Hir`
+ /// nodes.
+ ///
+ /// The default implementation returns `Ok(())`.
+ fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called on an `Hir` after descending all of its child
+ /// `Hir` nodes.
+ ///
+ /// The default implementation returns `Ok(())`.
+ fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
+ /// This method is called between child nodes of an alternation.
+ ///
+ /// The default implementation returns `Ok(())`.
+ fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
+ Ok(())
+ }
+}
+
+/// Runs a [`Visitor`](trait.Visitor.html) over every node of an `Hir` without
+/// recursing on the call stack.
+///
+/// Nodes are visited depth first, and the matching `Visitor` callbacks are
+/// invoked along the way.
+///
+/// This is intended for case analysis over an `Hir` whose depth may be
+/// proportional to end user input: stack usage stays constant, while heap
+/// usage is proportional to the size of the `Hir`.
+///
+/// Visiting stops at the first error returned by the visitor, and that error
+/// is returned to the caller. Otherwise the value produced by the visitor's
+/// `finish` method is returned.
+pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
+    let mut walker = HeapVisitor::new();
+    walker.visit(hir, visitor)
+}
+
+/// HeapVisitor visits every item in an `Hir` recursively using constant stack
+/// size and a heap size proportional to the size of the `Hir`.
+///
+/// The `'a` lifetime is the lifetime of the `Hir` being visited.
+struct HeapVisitor<'a> {
+ /// A stack of `Hir` nodes. This is roughly analogous to the call stack
+ /// used in a typical recursive visitor.
+ ///
+ /// Each entry pairs a node that has child nodes with the `Frame` recording
+ /// which of those children is currently being visited.
+ stack: Vec<(&'a Hir, Frame<'a>)>,
+}
+
+/// Represents a single stack frame while performing structural induction over
+/// an `Hir`.
+///
+/// Only the four kinds of `Hir` that have child nodes get a frame.
+enum Frame<'a> {
+ /// A stack frame allocated just before descending into a repetition
+ /// operator's child node.
+ Repetition(&'a hir::Repetition),
+ /// A stack frame allocated just before descending into a group's child
+ /// node.
+ Group(&'a hir::Group),
+ /// The stack frame used while visiting every child node of a concatenation
+ /// of expressions.
+ Concat {
+ /// The child node we are currently visiting.
+ head: &'a Hir,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Hir],
+ },
+ /// The stack frame used while visiting every child node of an alternation
+ /// of expressions.
+ Alternation {
+ /// The child node we are currently visiting.
+ head: &'a Hir,
+ /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Hir],
+ },
+}
+
+impl<'a> HeapVisitor<'a> {
+    /// Creates a visitor with an empty frame stack.
+    fn new() -> HeapVisitor<'a> {
+        HeapVisitor { stack: Vec::new() }
+    }
+
+    /// Walks `hir` depth first, driving the callbacks on `visitor`, and
+    /// returns whatever `visitor.finish()` yields once every node has been
+    /// visited.
+    ///
+    /// The first error returned by any callback aborts the walk and is
+    /// returned as-is.
+    fn visit<V: Visitor>(
+        &mut self,
+        mut hir: &'a Hir,
+        mut visitor: V,
+    ) -> Result<V::Output, V::Err> {
+        self.stack.clear();
+        visitor.start();
+
+        // Each iteration of `'descend` handles one node on the way down.
+        'descend: loop {
+            visitor.visit_pre(hir)?;
+            match self.induct(hir) {
+                // The node has children: remember it and move to its first
+                // child.
+                Some(frame) => {
+                    let parent = hir;
+                    hir = frame.child();
+                    self.stack.push((parent, frame));
+                    continue 'descend;
+                }
+                // A node without children can be post-visited right away.
+                None => visitor.visit_post(hir)?,
+            }
+
+            // Unwind: pop finished frames until one of them still has a
+            // sibling left to visit, or until the stack runs dry.
+            while let Some((parent, finished)) = self.stack.pop() {
+                match self.pop(finished) {
+                    // A concat/alternation with children remaining: resume
+                    // the descent at the next child.
+                    Some(next) => {
+                        if let Frame::Alternation { .. } = next {
+                            visitor.visit_alternation_in()?;
+                        }
+                        hir = next.child();
+                        self.stack.push((parent, next));
+                        continue 'descend;
+                    }
+                    // Every child of `parent` has been visited, so it can be
+                    // post-visited now.
+                    None => visitor.visit_post(parent)?,
+                }
+            }
+
+            // The stack is empty: the root has been post-visited.
+            return visitor.finish();
+        }
+    }
+
+    /// Returns the initial stack frame for `hir` when it has at least one
+    /// child node, and `None` when it has none (including an empty
+    /// concatenation or alternation).
+    fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
+        let frame = match *hir.kind() {
+            HirKind::Repetition(ref rep) => Frame::Repetition(rep),
+            HirKind::Group(ref group) => Frame::Group(group),
+            HirKind::Concat(ref exprs) => {
+                let (head, tail) = exprs.split_first()?;
+                Frame::Concat { head, tail }
+            }
+            HirKind::Alternation(ref exprs) => {
+                let (head, tail) = exprs.split_first()?;
+                Frame::Alternation { head, tail }
+            }
+            _ => return None,
+        };
+        Some(frame)
+    }
+
+    /// Advances a finished frame to its next child. Returns the frame for
+    /// that child, or `None` when the frame has no children left (always the
+    /// case for repetitions and groups, which have exactly one child).
+    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
+        match induct {
+            Frame::Repetition(_) | Frame::Group(_) => None,
+            Frame::Concat { tail, .. } => tail
+                .split_first()
+                .map(|(head, tail)| Frame::Concat { head, tail }),
+            Frame::Alternation { tail, .. } => tail
+                .split_first()
+                .map(|(head, tail)| Frame::Alternation { head, tail }),
+        }
+    }
+}
+
+impl<'a> Frame<'a> {
+    /// Returns the child HIR node this frame currently points at: the single
+    /// sub-expression of a repetition or group, or the `head` of a
+    /// concatenation or alternation.
+    fn child(&self) -> &'a Hir {
+        match *self {
+            Frame::Repetition(rep) => &rep.hir,
+            Frame::Group(group) => &group.hir,
+            Frame::Concat { head, .. } | Frame::Alternation { head, .. } => {
+                head
+            }
+        }
+    }
+}
diff --git a/third_party/rust/regex-syntax/src/lib.rs b/third_party/rust/regex-syntax/src/lib.rs
new file mode 100644
index 0000000000..1dfb38af39
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/lib.rs
@@ -0,0 +1,312 @@
+/*!
+This crate provides a robust regular expression parser.
+
+This crate defines two primary types:
+
+* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression.
+ An abstract syntax corresponds to a *structured representation* of the
+ concrete syntax of a regular expression, where the concrete syntax is the
+ pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it
+ can be converted back to the original concrete syntax (modulo some details,
+ like whitespace). To a first approximation, the abstract syntax is complex
+ and difficult to analyze.
+* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation
+ ("HIR" or "high-level IR" for short) of a regular expression. It corresponds to
+ an intermediate state of a regular expression that sits between the abstract
+ syntax and the low level compiled opcodes that are eventually responsible for
+ executing a regular expression search. Given some high-level IR, it is not
+ possible to produce the original concrete syntax (although it is possible to
+ produce an equivalent concrete syntax, but it will likely scarcely resemble
+ the original pattern). To a first approximation, the high-level IR is simple
+ and easy to analyze.
+
+These two types come with conversion routines:
+
+* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete
+ syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html).
+* A [`hir::translate::Translator`](hir/translate/struct.Translator.html)
+ converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html).
+
+As a convenience, the above two conversion routines are combined into one via
+the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first
+convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`.
+
+
+# Example
+
+This example shows how to parse a pattern string into its HIR:
+
+```
+use regex_syntax::Parser;
+use regex_syntax::hir::{self, Hir};
+
+let hir = Parser::new().parse("a|b").unwrap();
+assert_eq!(hir, Hir::alternation(vec![
+ Hir::literal(hir::Literal::Unicode('a')),
+ Hir::literal(hir::Literal::Unicode('b')),
+]));
+```
+
+
+# Concrete syntax supported
+
+The concrete syntax is documented as part of the public API of the
+[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax).
+
+
+# Input safety
+
+A key feature of this library is that it is safe to use with end user facing
+input. This plays a significant role in the internal implementation. In
+particular:
+
+1. Parsers provide a `nest_limit` option that permits callers to control how
+ deeply nested a regular expression is allowed to be. This makes it possible
+ to do case analysis over an `Ast` or an `Hir` using recursion without
+ worrying about stack overflow.
+2. Since relying on a particular stack size is brittle, this crate goes to
+ great lengths to ensure that all interactions with both the `Ast` and the
+ `Hir` do not use recursion. Namely, they use constant stack space and heap
+ space proportional to the size of the original pattern string (in bytes).
+ This includes the type's corresponding destructors. (One exception to this
+ is literal extraction, but this will eventually get fixed.)
+
+
+# Error reporting
+
+The `Display` implementations on all `Error` types exposed in this library
+provide nice human readable errors that are suitable for showing to end users
+in a monospace font.
+
+
+# Literal extraction
+
+This crate provides limited support for
+[literal extraction from `Hir` values](hir/literal/struct.Literals.html).
+Be warned that literal extraction currently uses recursion, and therefore,
+stack size proportional to the size of the `Hir`.
+
+The purpose of literal extraction is to speed up searches. That is, if you
+know a regular expression must match a prefix or suffix literal, then it is
+often quicker to search for instances of that literal, and then confirm or deny
+the match using the full regular expression engine. These optimizations are
+done automatically in the `regex` crate.
+
+
+# Crate features
+
+An important feature provided by this crate is its Unicode support. This
+includes things like case folding, boolean properties, general categories,
+scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`.
+However, a downside of this support is that it requires bundling several
+Unicode data tables that are substantial in size.
+
+A fair number of use cases do not require full Unicode support. For this
+reason, this crate exposes a number of features to control which Unicode
+data is available.
+
+If a regular expression attempts to use a Unicode feature that is not available
+because the corresponding crate feature was disabled, then translating that
+regular expression to an `Hir` will return an error. (It is still possible to
+construct an `Ast` for such a regular expression, since Unicode data is not
+used until translation to an `Hir`.) Stated differently, enabling or disabling
+any of the features below can only add or subtract from the total set of valid
+regular expressions. Enabling or disabling a feature will never modify the
+match semantics of a regular expression.
+
+The following features are available:
+
+* **unicode** -
+ Enables all Unicode features. This feature is enabled by default, and will
+ always cover all Unicode features, even if more are added in the future.
+* **unicode-age** -
+ Provide the data for the
+ [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
+ This makes it possible to use classes like `\p{Age:6.0}` to refer to all
+ codepoints first introduced in Unicode 6.0.
+* **unicode-bool** -
+ Provide the data for numerous Unicode boolean properties. The full list
+ is not included here, but contains properties like `Alphabetic`, `Emoji`,
+ `Lowercase`, `Math`, `Uppercase` and `White_Space`.
+* **unicode-case** -
+ Provide the data for case insensitive matching using
+ [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
+* **unicode-gencat** -
+ Provide the data for
+ [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+ This includes, but is not limited to, `Decimal_Number`, `Letter`,
+ `Math_Symbol`, `Number` and `Punctuation`.
+* **unicode-perl** -
+ Provide the data for supporting the Unicode-aware Perl character classes,
+ corresponding to `\w`, `\s` and `\d`. This is also necessary for using
+ Unicode-aware word boundary assertions. Note that if this feature is
+ disabled, the `\s` and `\d` character classes are still available if the
+ `unicode-bool` and `unicode-gencat` features are enabled, respectively.
+* **unicode-script** -
+ Provide the data for
+ [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
+ This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
+ `Latin` and `Thai`.
+* **unicode-segment** -
+ Provide the data necessary to provide the properties used to implement the
+ [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
+ This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
+ `\p{sb=ATerm}`.
+*/
+
+#![deny(missing_docs)]
+#![warn(missing_debug_implementations)]
+#![forbid(unsafe_code)]
+
+pub use crate::error::{Error, Result};
+pub use crate::parser::{Parser, ParserBuilder};
+pub use crate::unicode::UnicodeWordError;
+
+pub mod ast;
+mod either;
+mod error;
+pub mod hir;
+mod parser;
+mod unicode;
+mod unicode_tables;
+pub mod utf8;
+
+/// Escapes all regular expression meta characters in `text`.
+///
+/// The returned string may be safely used as a literal in a regular
+/// expression. Use `escape_into` to append to an existing buffer instead.
+pub fn escape(text: &str) -> String {
+ let mut quoted = String::new();
+ escape_into(text, &mut quoted);
+ quoted
+}
+
+/// Escapes all meta characters in `text` and writes the result into `buf`.
+///
+/// This appends to `buf` without clearing it. The characters that are
+/// appended are safe to use as a literal in a regular expression.
+pub fn escape_into(text: &str, buf: &mut String) {
+ buf.reserve(text.len()); // lower bound; each escape adds one more byte
+ for c in text.chars() {
+ if is_meta_character(c) {
+ buf.push('\\'); // prefix each meta character with a backslash
+ }
+ buf.push(c);
+ }
+}
+
+/// Returns true if the given character has significance in a regex.
+///
+/// These are the only characters that are allowed to be escaped, with one
+/// exception: an ASCII space character may be escaped when extended mode (with
+/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns
+/// `false`.
+///
+/// Note that the set of characters for which this function returns `true` or
+/// `false` is fixed and won't change in a semver compatible release.
+pub fn is_meta_character(c: char) -> bool {
+ match c {
+ '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
+ | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
+ _ => false, // everything else, including ' '
+ }
+}
+
+/// Returns true if and only if the given character is a Unicode word
+/// character.
+///
+/// A Unicode word character is defined by
+/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
+/// In particular, a character is considered a word character if it has
+/// either the `Alphabetic` or `Join_Control` property, or is in one of the
+/// `Decimal_Number`, `Mark` or `Connector_Punctuation` general
+/// categories.
+///
+/// # Panics
+///
+/// If the `unicode-perl` feature is not enabled, then this function panics.
+/// For this reason, it is recommended that callers use
+/// [`try_is_word_character`](fn.try_is_word_character.html)
+/// instead.
+pub fn is_word_character(c: char) -> bool {
+ try_is_word_character(c).expect("unicode-perl feature must be enabled")
+}
+
+/// Returns true if and only if the given character is a Unicode word
+/// character.
+///
+/// A Unicode word character is defined by
+/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
+/// In particular, a character is considered a word character if it has
+/// either the `Alphabetic` or `Join_Control` property, or is in one of the
+/// `Decimal_Number`, `Mark` or `Connector_Punctuation` general
+/// categories.
+///
+/// # Errors
+///
+/// If the `unicode-perl` feature is not enabled, then this function always
+/// returns an error.
+pub fn try_is_word_character(
+ c: char,
+) -> std::result::Result<bool, UnicodeWordError> {
+ unicode::is_word_character(c)
+}
+
+/// Returns true if and only if the given byte is an ASCII word character.
+///
+/// An ASCII word character is defined by the following character class:
+/// `[_0-9a-zA-Z]`.
+pub fn is_word_byte(c: u8) -> bool {
+ match c {
+ b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
+ _ => false,
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn escape_meta() {
+ assert_eq!(
+ escape(r"\.+*?()|[]{}^$#&-~"), // every meta character, once each
+ r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string()
+ );
+ }
+
+ #[test]
+ fn word_byte() {
+ assert!(is_word_byte(b'a'));
+ assert!(!is_word_byte(b'-'));
+ }
+
+ #[test]
+ #[cfg(feature = "unicode-perl")]
+ fn word_char() {
+ assert!(is_word_character('a'), "ASCII");
+ assert!(is_word_character('à'), "Latin-1");
+ assert!(is_word_character('β'), "Greek");
+ assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)");
+ assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)");
+ assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)");
+ assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)");
+ assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)");
+ assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)");
+ assert!(!is_word_character('-'));
+ assert!(!is_word_character('☃'));
+ }
+
+ #[test]
+ #[should_panic] // `is_word_character` panics when the tables are missing
+ #[cfg(not(feature = "unicode-perl"))]
+ fn word_char_disabled_panic() {
+ assert!(is_word_character('a'));
+ }
+
+ #[test]
+ #[cfg(not(feature = "unicode-perl"))]
+ fn word_char_disabled_error() {
+ assert!(try_is_word_character('a').is_err()); // fallible variant: no panic
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/parser.rs b/third_party/rust/regex-syntax/src/parser.rs
new file mode 100644
index 0000000000..ded95b280a
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/parser.rs
@@ -0,0 +1,200 @@
+use crate::ast;
+use crate::hir;
+
+use crate::Result;
+
+/// A builder for a regular expression parser.
+///
+/// This builder permits modifying configuration options for the parser.
+///
+/// This type combines the builder options for both the
+/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
+/// and the
+/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
+#[derive(Clone, Debug, Default)]
+pub struct ParserBuilder {
+ ast: ast::parse::ParserBuilder, // options for the AST parser
+ hir: hir::translate::TranslatorBuilder, // options for the HIR translator
+}
+
+impl ParserBuilder {
+ /// Create a new parser builder with a default configuration.
+ pub fn new() -> ParserBuilder {
+ ParserBuilder::default()
+ }
+
+ /// Build a parser from this configuration.
+ pub fn build(&self) -> Parser {
+ Parser { ast: self.ast.build(), hir: self.hir.build() }
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an `Ast` using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire `Ast` is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since this parser
+ /// implementation will limit itself to heap space proportional to the
+ /// length of the pattern string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation, which results in a nest
+ /// depth of `1`. In general, a nest limit is not something that manifests
+ /// in an obvious way in the concrete syntax, therefore, it should not be
+ /// used in a granular way.
+ pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
+ self.ast.nest_limit(limit);
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\0` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.ast.octal(yes);
+ self
+ }
+
+ /// When enabled, the parser will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// When disabled (the default), the parser is guaranteed to produce
+ /// an expression that will only ever match valid UTF-8 (otherwise, the
+ /// parser will return an error).
+ ///
+ /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
+ /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
+ /// the parser to return an error. Namely, a negated ASCII word boundary
+ /// can result in matching positions that aren't valid UTF-8 boundaries.
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.hir.allow_invalid_utf8(yes);
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+ /// When enabled, verbose mode permits insignificant whitespace in many
+ /// places in the regular expression, as well as comments. Comments are
+ /// started using `#` and continue until the end of the line.
+ ///
+ /// By default, this is disabled. It may be selectively enabled in the
+ /// regular expression by using the `x` flag regardless of this setting.
+ pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.ast.ignore_whitespace(yes);
+ self
+ }
+
+ /// Enable or disable the case insensitive flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `i` flag.
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.hir.case_insensitive(yes);
+ self
+ }
+
+ /// Enable or disable the multi-line matching flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `m` flag.
+ pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.hir.multi_line(yes);
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `s` flag.
+ pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.hir.dot_matches_new_line(yes);
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `U` flag.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.hir.swap_greed(yes);
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ ///
+ /// By default this is **enabled**. It may alternatively be selectively
+ /// disabled in the regular expression itself via the `u` flag.
+ ///
+ /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
+ /// default), a regular expression will fail to parse if Unicode mode is
+ /// disabled and a sub-expression could possibly match invalid UTF-8.
+ pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
+ self.hir.unicode(yes);
+ self
+ }
+}
+
+/// A convenience parser for regular expressions.
+///
+/// This parser takes as input a regular expression pattern string (the
+/// "concrete syntax") and returns a high-level intermediate representation
+/// (the HIR) suitable for most types of analysis. In particular, this parser
+/// hides the intermediate state of producing an AST (the "abstract syntax").
+/// The AST is itself far more complex than the HIR, so this parser serves as a
+/// convenience for never having to deal with it at all.
+///
+/// If callers have more fine grained use cases that need an AST, then please
+/// see the [`ast::parse`](ast/parse/index.html) module.
+///
+/// A `Parser` can be configured in more detail via a
+/// [`ParserBuilder`](struct.ParserBuilder.html).
+#[derive(Clone, Debug)]
+pub struct Parser {
+ ast: ast::parse::Parser, // first stage: pattern string to AST
+ hir: hir::translate::Translator, // second stage: AST to HIR
+}
+
+impl Parser {
+ /// Create a new parser with a default configuration.
+ ///
+ /// The parser can be run with the `parse` method, which returns a high
+ /// level intermediate representation of the given regular
+ /// expression.
+ ///
+ /// To set configuration options on the parser, use
+ /// [`ParserBuilder`](struct.ParserBuilder.html).
+ pub fn new() -> Parser {
+ ParserBuilder::new().build()
+ }
+
+ /// Parse the regular expression into a high level intermediate
+ /// representation.
+ pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
+ let ast = self.ast.parse(pattern)?; // concrete syntax to AST
+ let hir = self.hir.translate(pattern, &ast)?; // AST to HIR
+ Ok(hir)
+ }
+}
diff --git a/third_party/rust/regex-syntax/src/unicode.rs b/third_party/rust/regex-syntax/src/unicode.rs
new file mode 100644
index 0000000000..8194d7f55b
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode.rs
@@ -0,0 +1,1001 @@
+use std::error;
+use std::fmt;
+use std::result;
+
+use crate::hir;
+
+/// A result type for errors specific to Unicode handling of classes.
+pub type Result<T> = result::Result<T, Error>;
+
+/// A sequence of inclusive codepoint ranges from a generated file (hence the
+/// static lifetime).
+type Range = &'static [(char, char)];
+
+/// An error that occurs when dealing with Unicode.
+///
+/// We don't impl the `Error` trait here because these always get converted
+/// into other public errors. (This error type isn't exported.)
+#[derive(Debug)]
+pub enum Error {
+ PropertyNotFound, // the property name could not be resolved
+ PropertyValueNotFound, // the property value could not be resolved
+ // Not used when unicode-perl is enabled.
+ #[allow(dead_code)]
+ PerlClassNotFound, // data for a Perl class (\w, \s, \d) is unavailable
+}
+
+/// A result type for errors specific to Unicode case folding.
+pub type FoldResult<T> = result::Result<T, CaseFoldError>;
+
+/// An error that occurs when Unicode-aware simple case folding fails.
+///
+/// This error can occur when the case mapping tables necessary for Unicode
+/// aware case folding are unavailable. This only occurs when the
+/// `unicode-case` feature is disabled. (The feature is enabled by default.)
+#[derive(Debug)]
+pub struct CaseFoldError(()); // private field: not constructible elsewhere
+
+impl error::Error for CaseFoldError {}
+
+impl fmt::Display for CaseFoldError {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(
+ f,
+ "Unicode-aware case folding is not available \
+ (probably because the unicode-case feature is not enabled)"
+ )
+ }
+}
+
+/// An error that occurs when the Unicode-aware `\w` class is unavailable.
+///
+/// This error can occur when the data tables necessary for the Unicode aware
+/// Perl character class `\w` are unavailable. This only occurs when the
+/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
+#[derive(Debug)]
+pub struct UnicodeWordError(()); // private field: not constructible elsewhere
+
+impl error::Error for UnicodeWordError {}
+
+impl fmt::Display for UnicodeWordError {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(
+ f,
+ "Unicode-aware \\w class is not available \
+ (probably because the unicode-perl feature is not enabled)"
+ )
+ }
+}
+
+/// Return an iterator over the equivalence class of simple case mappings
+/// for the given codepoint. The equivalence class does not include the
+/// given codepoint.
+///
+/// If the equivalence class is empty, then this returns the next scalar
+/// value that has a non-empty equivalence class, if it exists. If no such
+/// scalar value exists, then `None` is returned. The point of this behavior
+/// is to permit callers to avoid calling `simple_fold` more than they need
+/// to, since there is some cost to fetching the equivalence class.
+///
+/// This returns an error if the Unicode case folding tables are not available.
+pub fn simple_fold(
+ c: char,
+) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
+ #[cfg(not(feature = "unicode-case"))]
+ fn imp(
+ _: char,
+ ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
+ {
+ use std::option::IntoIter; // any concrete iterator type will do here
+ Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
+ }
+
+ #[cfg(feature = "unicode-case")]
+ fn imp(
+ c: char,
+ ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
+ {
+ use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
+
+ Ok(CASE_FOLDING_SIMPLE
+ .binary_search_by_key(&c, |&(c1, _)| c1)
+ .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied())
+ .map_err(|i| { // miss: `i` is where `c` would be inserted
+ if i >= CASE_FOLDING_SIMPLE.len() {
+ None // past the last entry: nothing larger has a mapping
+ } else {
+ Some(CASE_FOLDING_SIMPLE[i].0) // next key after `c`
+ }
+ }))
+ }
+
+ imp(c)
+}
+
+/// Returns true if and only if the given (inclusive) range contains at least
+/// one Unicode scalar value that has a non-empty non-trivial simple case
+/// mapping.
+///
+/// With the `unicode-case` feature enabled, this panics if `end < start`.
+///
+/// This returns an error if the Unicode case folding tables are not available.
+pub fn contains_simple_case_mapping(
+ start: char,
+ end: char,
+) -> FoldResult<bool> {
+ #[cfg(not(feature = "unicode-case"))]
+ fn imp(_: char, _: char) -> FoldResult<bool> {
+ Err(CaseFoldError(()))
+ }
+
+ #[cfg(feature = "unicode-case")]
+ fn imp(start: char, end: char) -> FoldResult<bool> {
+ use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
+ use std::cmp::Ordering;
+
+ assert!(start <= end);
+ Ok(CASE_FOLDING_SIMPLE
+ .binary_search_by(|&(c, _)| {
+ if start <= c && c <= end { // any key inside the range is a hit
+ Ordering::Equal
+ } else if c > end {
+ Ordering::Greater
+ } else {
+ Ordering::Less
+ }
+ })
+ .is_ok())
+ }
+
+ imp(start, end)
+}
+
+/// A query for finding a character class defined by Unicode. This supports
+/// either use of a property name directly, or lookup by property value. The
+/// former generally refers to Binary properties (see UTS#44, Table 8), but
+/// as a special exception (see UTS#18, Section 1.2) both general categories
+/// (an enumeration) and scripts (a catalog) are supported as if each of their
+/// possible values were a binary property.
+///
+/// In all circumstances, property names and values are normalized and
+/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
+///
+/// The lifetime `'a` refers to the shorter of the lifetimes of property name
+/// and property value.
+#[derive(Debug)]
+pub enum ClassQuery<'a> {
+ /// Query for a class corresponding to a Unicode binary property, named by
+ /// a single letter.
+ OneLetter(char),
+ /// Query for a class corresponding to a Unicode binary property.
+ ///
+ /// Note that, by special exception (see UTS#18, Section 1.2), both
+ /// general category values and script values are permitted here as if
+ /// they were a binary property.
+ Binary(&'a str),
+ /// Query for a class corresponding to all codepoints whose property
+ /// (identified by `property_name`) corresponds to the given value
+ /// (identified by `property_value`).
+ ByValue {
+ /// A property name.
+ property_name: &'a str,
+ /// A property value.
+ property_value: &'a str,
+ },
+}
+
+impl<'a> ClassQuery<'a> {
+ fn canonicalize(&self) -> Result<CanonicalClassQuery> {
+ match *self {
+ ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
+ ClassQuery::Binary(name) => self.canonical_binary(name),
+ ClassQuery::ByValue { property_name, property_value } => {
+ let property_name = symbolic_name_normalize(property_name);
+ let property_value = symbolic_name_normalize(property_value);
+
+ let canon_name = match canonical_prop(&property_name)? {
+ None => return Err(Error::PropertyNotFound),
+ Some(canon_name) => canon_name,
+ };
+ Ok(match canon_name {
+ "General_Category" => { // flattened into its own variant
+ let canon = match canonical_gencat(&property_value)? {
+ None => return Err(Error::PropertyValueNotFound),
+ Some(canon) => canon,
+ };
+ CanonicalClassQuery::GeneralCategory(canon)
+ }
+ "Script" => { // likewise flattened into its own variant
+ let canon = match canonical_script(&property_value)? {
+ None => return Err(Error::PropertyValueNotFound),
+ Some(canon) => canon,
+ };
+ CanonicalClassQuery::Script(canon)
+ }
+ _ => { // any other property: look the value up in its table
+ let vals = match property_values(canon_name)? {
+ None => return Err(Error::PropertyValueNotFound),
+ Some(vals) => vals,
+ };
+ let canon_val =
+ match canonical_value(vals, &property_value) {
+ None => {
+ return Err(Error::PropertyValueNotFound)
+ }
+ Some(canon_val) => canon_val,
+ };
+ CanonicalClassQuery::ByValue {
+ property_name: canon_name,
+ property_value: canon_val,
+ }
+ }
+ })
+ }
+ }
+ }
+
+ fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
+ let norm = symbolic_name_normalize(name);
+
+ // This is a special case where 'cf' refers to the 'Format' general
+ // category, but where the 'cf' abbreviation is also an abbreviation
+ // for the 'Case_Folding' property. But we want to treat it as
+ // a general category. (Currently, we don't even support the
+ // 'Case_Folding' property. But if we do in the future, users will be
+ // required to spell it out.)
+ if norm != "cf" {
+ if let Some(canon) = canonical_prop(&norm)? {
+ return Ok(CanonicalClassQuery::Binary(canon));
+ }
+ }
+ if let Some(canon) = canonical_gencat(&norm)? {
+ return Ok(CanonicalClassQuery::GeneralCategory(canon));
+ }
+ if let Some(canon) = canonical_script(&norm)? {
+ return Ok(CanonicalClassQuery::Script(canon));
+ }
+ Err(Error::PropertyNotFound) // not a property, category or script
+ }
+}
+
+/// Like `ClassQuery`, but its parameters have been canonicalized. This also
+/// differentiates binary properties from flattened general categories and
+/// scripts.
+#[derive(Debug, Eq, PartialEq)]
+enum CanonicalClassQuery {
+ /// The canonical binary property name.
+ Binary(&'static str),
+ /// The canonical general category name.
+ GeneralCategory(&'static str),
+ /// The canonical script name.
+ Script(&'static str),
+ /// An arbitrary association between property and value, both of which
+ /// have been canonicalized.
+ ///
+ /// Note that by construction, the property name of `ByValue` will never
+ /// be `General_Category` or `Script`. Those two cases are subsumed by the
+ /// eponymous variants.
+ ByValue {
+ /// The canonical property name.
+ property_name: &'static str,
+ /// The canonical property value.
+ property_value: &'static str,
+ },
+}
+
+/// Looks up a Unicode class given a query. If the query cannot be resolved,
+/// then an error is returned.
+pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> {
+ use self::CanonicalClassQuery::*;
+
+ match query.canonicalize()? {
+ Binary(name) => bool_property(name),
+ GeneralCategory(name) => gencat(name),
+ Script(name) => script(name),
+ ByValue { property_name: "Age", property_value } => {
+ let mut class = hir::ClassUnicode::empty();
+ for set in ages(property_value)? {
+ class.union(&hir_class(set)); // merge every returned table
+ }
+ Ok(class)
+ }
+ ByValue { property_name: "Script_Extensions", property_value } => {
+ script_extension(property_value)
+ }
+ ByValue {
+ property_name: "Grapheme_Cluster_Break",
+ property_value,
+ } => gcb(property_value),
+ ByValue { property_name: "Sentence_Break", property_value } => {
+ sb(property_value)
+ }
+ ByValue { property_name: "Word_Break", property_value } => {
+ wb(property_value)
+ }
+ _ => {
+ // What else should we support?
+ Err(Error::PropertyNotFound)
+ }
+ }
+}
+
+/// Returns a Unicode aware class for `\w`.
+///
+/// This returns an error if the `unicode-perl` feature is not enabled.
+pub fn perl_word() -> Result<hir::ClassUnicode> {
+ #[cfg(not(feature = "unicode-perl"))]
+ fn imp() -> Result<hir::ClassUnicode> {
+ Err(Error::PerlClassNotFound)
+ }
+
+ #[cfg(feature = "unicode-perl")]
+ fn imp() -> Result<hir::ClassUnicode> {
+ use crate::unicode_tables::perl_word::PERL_WORD;
+ Ok(hir_class(PERL_WORD))
+ }
+
+ imp()
+}
+
+/// Returns a Unicode aware class for `\s`.
+///
+/// Errors if neither `unicode-perl` nor `unicode-bool` is enabled.
+pub fn perl_space() -> Result<hir::ClassUnicode> {
+ #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
+ fn imp() -> Result<hir::ClassUnicode> {
+ Err(Error::PerlClassNotFound)
+ }
+
+ #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
+ fn imp() -> Result<hir::ClassUnicode> {
+ use crate::unicode_tables::perl_space::WHITE_SPACE;
+ Ok(hir_class(WHITE_SPACE))
+ }
+
+ #[cfg(feature = "unicode-bool")]
+ fn imp() -> Result<hir::ClassUnicode> {
+ use crate::unicode_tables::property_bool::WHITE_SPACE;
+ Ok(hir_class(WHITE_SPACE))
+ }
+
+ imp()
+}
+
+/// Returns a Unicode aware class for `\d`.
+///
+/// Errors if neither `unicode-perl` nor `unicode-gencat` is enabled.
+pub fn perl_digit() -> Result<hir::ClassUnicode> {
+ #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
+ fn imp() -> Result<hir::ClassUnicode> {
+ Err(Error::PerlClassNotFound)
+ }
+
+ #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
+ fn imp() -> Result<hir::ClassUnicode> {
+ use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
+ Ok(hir_class(DECIMAL_NUMBER))
+ }
+
+ #[cfg(feature = "unicode-gencat")]
+ fn imp() -> Result<hir::ClassUnicode> {
+ use crate::unicode_tables::general_category::DECIMAL_NUMBER;
+ Ok(hir_class(DECIMAL_NUMBER))
+ }
+
+ imp()
+}
+
+/// Converts a table of `(start, end)` scalar value pairs into a Unicode HIR
+/// class, turning each pair into one `hir::ClassUnicodeRange`.
+pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
+    let mut converted: Vec<hir::ClassUnicodeRange> =
+        Vec::with_capacity(ranges.len());
+    for &(lo, hi) in ranges {
+        converted.push(hir::ClassUnicodeRange::new(lo, hi));
+    }
+    hir::ClassUnicode::new(converted)
+}
+
+/// Reports whether `c` belongs to the Unicode `\w` character class.
+///
+/// # Errors
+///
+/// Returns a `UnicodeWordError` when the `unicode-perl` feature is disabled.
+pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
+    #[cfg(feature = "unicode-perl")]
+    fn check(ch: char) -> result::Result<bool, UnicodeWordError> {
+        use crate::is_word_byte;
+        use crate::unicode_tables::perl_word::PERL_WORD;
+        use std::cmp::Ordering;
+
+        // ASCII fast path: answer from the byte predicate without
+        // searching the table.
+        if ch.is_ascii() && is_word_byte(ch as u8) {
+            return Ok(true);
+        }
+        // Otherwise look for the range whose bounds enclose `ch`.
+        let found = PERL_WORD.binary_search_by(|&(lo, hi)| {
+            if ch < lo {
+                Ordering::Greater
+            } else if ch > hi {
+                Ordering::Less
+            } else {
+                Ordering::Equal
+            }
+        });
+        Ok(found.is_ok())
+    }
+
+    #[cfg(not(feature = "unicode-perl"))]
+    fn check(_: char) -> result::Result<bool, UnicodeWordError> {
+        Err(UnicodeWordError(()))
+    }
+
+    check(c)
+}
+
+/// A mapping of property values for a specific property.
+///
+/// The first element of each tuple is a normalized property value while the
+/// second element of each tuple is the corresponding canonical property
+/// value.
+///
+/// `canonical_value` binary-searches this slice on the first element, so the
+/// entries are expected to be sorted by normalized value.
+type PropertyValues = &'static [(&'static str, &'static str)];
+
+/// Maps a normalized general category value to its canonical name.
+///
+/// The pseudo-categories `any`, `assigned` and `ascii` are resolved directly;
+/// everything else is looked up in the `General_Category` value table.
+/// `Ok(None)` means the table has no such value. Errors from
+/// `property_values` are propagated, and the lookup panics if that table
+/// itself is missing.
+fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
+    let special = match normalized_value {
+        "any" => "Any",
+        "assigned" => "Assigned",
+        "ascii" => "ASCII",
+        _ => {
+            let table = property_values("General_Category")?.unwrap();
+            return Ok(canonical_value(table, normalized_value));
+        }
+    };
+    Ok(Some(special))
+}
+
+/// Maps a normalized script value to its canonical name using the `Script`
+/// value table.
+///
+/// `Ok(None)` means the table has no such value. Errors from
+/// `property_values` are propagated, and the lookup panics if the `Script`
+/// table itself is missing.
+fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
+    let table = property_values("Script")?.unwrap();
+    let canonical = canonical_value(table, normalized_value);
+    Ok(canonical)
+}
+
+/// Looks up the canonical property name for a normalized property name.
+///
+/// `Ok(None)` is returned when no property has that name.
+///
+/// The name must already be normalized according to UAX44 LM3, which
+/// `symbolic_name_normalize` does.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyNotFound` when none of the Unicode data
+/// features is enabled, because the property name table is then absent.
+fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
+    #[cfg(any(
+        feature = "unicode-age",
+        feature = "unicode-bool",
+        feature = "unicode-gencat",
+        feature = "unicode-perl",
+        feature = "unicode-script",
+        feature = "unicode-segment",
+    ))]
+    fn lookup(name: &str) -> Result<Option<&'static str>> {
+        use crate::unicode_tables::property_names::PROPERTY_NAMES;
+
+        let canonical =
+            match PROPERTY_NAMES.binary_search_by(|&(n, _)| n.cmp(name)) {
+                Ok(idx) => Some(PROPERTY_NAMES[idx].1),
+                Err(_) => None,
+            };
+        Ok(canonical)
+    }
+
+    #[cfg(not(any(
+        feature = "unicode-age",
+        feature = "unicode-bool",
+        feature = "unicode-gencat",
+        feature = "unicode-perl",
+        feature = "unicode-script",
+        feature = "unicode-segment",
+    )))]
+    fn lookup(_: &str) -> Result<Option<&'static str>> {
+        Err(Error::PropertyNotFound)
+    }
+
+    lookup(normalized_name)
+}
+
+/// Looks up the canonical property value for a normalized property value.
+///
+/// `vals` should be the value table of the property in question, as
+/// returned by `property_values`. `None` is returned when the table has no
+/// entry for `normalized_value`.
+///
+/// The value must already be normalized according to UAX44 LM3, which
+/// `symbolic_name_normalize` does.
+fn canonical_value(
+    vals: PropertyValues,
+    normalized_value: &str,
+) -> Option<&'static str> {
+    match vals.binary_search_by(|&(norm, _)| norm.cmp(normalized_value)) {
+        Ok(idx) => Some(vals[idx].1),
+        Err(_) => None,
+    }
+}
+
+/// Fetches the table of property values for a canonical property name.
+///
+/// `Ok(None)` is returned when the table of tables has no entry for the
+/// given property.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when none of the Unicode data
+/// features is enabled, because the property value data is then absent.
+fn property_values(
+    canonical_property_name: &'static str,
+) -> Result<Option<PropertyValues>> {
+    #[cfg(any(
+        feature = "unicode-age",
+        feature = "unicode-bool",
+        feature = "unicode-gencat",
+        feature = "unicode-perl",
+        feature = "unicode-script",
+        feature = "unicode-segment",
+    ))]
+    fn lookup(name: &'static str) -> Result<Option<PropertyValues>> {
+        use crate::unicode_tables::property_values::PROPERTY_VALUES;
+
+        let table =
+            match PROPERTY_VALUES.binary_search_by(|&(n, _)| n.cmp(name)) {
+                Ok(idx) => Some(PROPERTY_VALUES[idx].1),
+                Err(_) => None,
+            };
+        Ok(table)
+    }
+
+    #[cfg(not(any(
+        feature = "unicode-age",
+        feature = "unicode-bool",
+        feature = "unicode-gencat",
+        feature = "unicode-perl",
+        feature = "unicode-script",
+        feature = "unicode-segment",
+    )))]
+    fn lookup(_: &'static str) -> Result<Option<PropertyValues>> {
+        Err(Error::PropertyValueNotFound)
+    }
+
+    lookup(canonical_property_name)
+}
+
+// Finds the range table registered under `canonical` in `name_map` by
+// binary search on the name, or `None` when the name is not present.
+//
+// Unused under some feature combinations; the function is small enough that
+// letting it be dead beats maintaining the exact set of features that use it.
+#[allow(dead_code)]
+fn property_set(
+    name_map: &'static [(&'static str, Range)],
+    canonical: &'static str,
+) -> Option<Range> {
+    match name_map.binary_search_by(|entry| entry.0.cmp(canonical)) {
+        Ok(idx) => Some(name_map[idx].1),
+        Err(_) => None,
+    }
+}
+
+/// Yields the Unicode Age sets up to and including `canonical_age`.
+///
+/// Each item is the set of codepoints added in one revision of Unicode, and
+/// items come out in chronological order, ending with the requested
+/// revision.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the age is not a known
+/// revision, and with `Error::PropertyNotFound` when the `unicode-age`
+/// feature is disabled.
+fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
+    #[cfg(feature = "unicode-age")]
+    fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
+        use crate::unicode_tables::age;
+
+        // Listed oldest to newest: the prefix slice taken below relies on
+        // this order, which `age::BY_NAME` (sorted by name) cannot provide.
+        const AGES: &[(&str, Range)] = &[
+            ("V1_1", age::V1_1),
+            ("V2_0", age::V2_0),
+            ("V2_1", age::V2_1),
+            ("V3_0", age::V3_0),
+            ("V3_1", age::V3_1),
+            ("V3_2", age::V3_2),
+            ("V4_0", age::V4_0),
+            ("V4_1", age::V4_1),
+            ("V5_0", age::V5_0),
+            ("V5_1", age::V5_1),
+            ("V5_2", age::V5_2),
+            ("V6_0", age::V6_0),
+            ("V6_1", age::V6_1),
+            ("V6_2", age::V6_2),
+            ("V6_3", age::V6_3),
+            ("V7_0", age::V7_0),
+            ("V8_0", age::V8_0),
+            ("V9_0", age::V9_0),
+            ("V10_0", age::V10_0),
+            ("V11_0", age::V11_0),
+            ("V12_0", age::V12_0),
+            ("V12_1", age::V12_1),
+            ("V13_0", age::V13_0),
+            ("V14_0", age::V14_0),
+            ("V15_0", age::V15_0),
+        ];
+        // Catches a regenerated `age` table that this list was not
+        // updated for.
+        assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
+
+        AGES.iter()
+            .position(|&(name, _)| canonical_age == name)
+            .map(|last| AGES[..=last].iter().map(|&(_, set)| set))
+            .ok_or(Error::PropertyValueNotFound)
+    }
+
+    #[cfg(not(feature = "unicode-age"))]
+    fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
+        // A concrete iterator type is needed even though only `Err` is
+        // ever produced here.
+        let unavailable: Result<std::option::IntoIter<Range>> =
+            Err(Error::PropertyNotFound);
+        unavailable
+    }
+
+    imp(canonical_age)
+}
+
+/// Builds the Unicode HIR class for a general category.
+///
+/// The name is assumed to have been canonicalized by the caller.
+/// `Decimal_Number` is answered by `perl_digit`, so it does not go through
+/// the general category table lookup.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the category is unknown,
+/// and with `Error::PropertyNotFound` when the `unicode-gencat` feature is
+/// disabled.
+fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-gencat")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::general_category::BY_NAME;
+
+        if name == "ASCII" {
+            return Ok(hir_class(&[('\0', '\x7F')]));
+        }
+        if name == "Any" {
+            return Ok(hir_class(&[('\0', '\u{10FFFF}')]));
+        }
+        if name == "Assigned" {
+            // Computed as the complement of `Unassigned`.
+            let mut assigned = gencat("Unassigned")?;
+            assigned.negate();
+            return Ok(assigned);
+        }
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyValueNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-gencat"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    if canonical_name == "Decimal_Number" {
+        perl_digit()
+    } else {
+        imp(canonical_name)
+    }
+}
+
+/// Builds the Unicode HIR class for a script.
+///
+/// The name is assumed to have been canonicalized by the caller.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the script is unknown, and
+/// with `Error::PropertyNotFound` when the `unicode-script` feature is
+/// disabled.
+fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-script")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::script::BY_NAME;
+
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyValueNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-script"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    imp(canonical_name)
+}
+
+/// Builds the Unicode HIR class for a script extension.
+///
+/// The name is assumed to have been canonicalized by the caller.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the script extension is
+/// unknown, and with `Error::PropertyNotFound` when the `unicode-script`
+/// feature is disabled.
+fn script_extension(
+    canonical_name: &'static str,
+) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-script")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::script_extension::BY_NAME;
+
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyValueNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-script"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    imp(canonical_name)
+}
+
+/// Builds the Unicode HIR class for a Unicode boolean property.
+///
+/// The name is assumed to have been canonicalized by the caller.
+/// `Decimal_Number` and `White_Space` are answered by `perl_digit` and
+/// `perl_space`, so they do not go through the boolean property table
+/// lookup.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyNotFound` when the property is unknown or
+/// when the `unicode-bool` feature is disabled.
+fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-bool")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::property_bool::BY_NAME;
+
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-bool"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    if canonical_name == "Decimal_Number" {
+        perl_digit()
+    } else if canonical_name == "White_Space" {
+        perl_space()
+    } else {
+        imp(canonical_name)
+    }
+}
+
+/// Builds the Unicode HIR class for a grapheme cluster break property
+/// value.
+///
+/// The name is assumed to have been canonicalized by the caller.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the value is unknown, and
+/// with `Error::PropertyNotFound` when the `unicode-segment` feature is
+/// disabled.
+fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-segment")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
+
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyValueNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-segment"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    imp(canonical_name)
+}
+
+/// Builds the Unicode HIR class for a word break property value.
+///
+/// The name is assumed to have been canonicalized by the caller.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the value is unknown, and
+/// with `Error::PropertyNotFound` when the `unicode-segment` feature is
+/// disabled.
+fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-segment")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::word_break::BY_NAME;
+
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyValueNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-segment"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    imp(canonical_name)
+}
+
+/// Builds the Unicode HIR class for a sentence break property value.
+///
+/// The name is assumed to have been canonicalized by the caller.
+///
+/// # Errors
+///
+/// Fails with `Error::PropertyValueNotFound` when the value is unknown, and
+/// with `Error::PropertyNotFound` when the `unicode-segment` feature is
+/// disabled.
+fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
+    #[cfg(feature = "unicode-segment")]
+    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
+        use crate::unicode_tables::sentence_break::BY_NAME;
+
+        match property_set(BY_NAME, name) {
+            Some(ranges) => Ok(hir_class(ranges)),
+            None => Err(Error::PropertyValueNotFound),
+        }
+    }
+
+    #[cfg(not(feature = "unicode-segment"))]
+    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
+        Err(Error::PropertyNotFound)
+    }
+
+    imp(canonical_name)
+}
+
+/// String flavour of `symbolic_name_normalize_bytes`: normalizes a copy of
+/// `x` and returns it as an owned `String`.
+fn symbolic_name_normalize(x: &str) -> String {
+    let mut bytes = x.as_bytes().to_vec();
+    let kept = symbolic_name_normalize_bytes(&mut bytes).len();
+    bytes.truncate(kept);
+    // Cannot fail: `symbolic_name_normalize_bytes` guarantees that the
+    // prefix it returns is valid UTF-8.
+    //
+    // N.B. The extra UTF-8 check could be skipped, but that is unlikely to
+    // be worth giving up the safety check; a benchmark must justify it
+    // first.
+    String::from_utf8(bytes).unwrap()
+}
+
+/// Normalizes a symbolic name in place according to UAX44-LM3 and returns
+/// the normalized prefix of `slice`.
+///
+/// A "symbolic name" typically means a property name or a property value
+/// alias. It should not be applied to property string values.
+///
+/// Normalization drops a leading `is` (in any letter case), drops spaces,
+/// underscores and hyphens, lowercases ASCII letters and discards every
+/// non-ASCII byte. The returned slice is therefore valid UTF-8 for every
+/// possible input.
+///
+/// See: https://unicode.org/reports/tr44/#UAX44-LM3
+fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
+    // The standard does not seem to specify a structure for property
+    // names/aliases (unlike character names), so this assumes they are
+    // ASCII only and drops anything that is not.
+    let starts_with_is =
+        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
+    // Reading starts after any "is" prefix, which is ignored.
+    let start = if starts_with_is { 2 } else { 0 };
+
+    let mut next_write = 0;
+    for i in start..slice.len() {
+        let b = slice[i];
+        // VALIDITY ARGUMENT: only ASCII bytes are ever written back, so the
+        // written prefix is always valid UTF-8.
+        let is_separator = b == b' ' || b == b'_' || b == b'-';
+        if b.is_ascii() && !is_separator {
+            slice[next_write] = b.to_ascii_lowercase();
+            next_write += 1;
+        }
+    }
+    // Special case: ISO_Comment has an 'isc' abbreviation. Because 'is'
+    // prefixes are ignored, 'isc' would otherwise collapse to 'c', which is
+    // actually an alias for the 'Other' general category. Restore it.
+    if starts_with_is && next_write == 1 && slice[0] == b'c' {
+        slice[..3].copy_from_slice(b"isc");
+        next_write = 3;
+    }
+    &mut slice[..next_write]
+}
+
+// Unit tests for simple case folding, class query canonicalization and
+// symbolic name normalization.
+#[cfg(test)]
+mod tests {
+    use super::{
+        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
+        symbolic_name_normalize_bytes,
+    };
+
+    // Unwraps `simple_fold` for a character that is expected to have a
+    // case folding entry.
+    #[cfg(feature = "unicode-case")]
+    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
+        simple_fold(c).unwrap().unwrap()
+    }
+
+    // Unwraps `simple_fold` for a character that is expected to have no
+    // case folding entry, returning the hint carried by the `Err` arm.
+    #[cfg(feature = "unicode-case")]
+    fn simple_fold_err(c: char) -> Option<char> {
+        match simple_fold(c).unwrap() {
+            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
+            Err(next) => next,
+        }
+    }
+
+    #[cfg(feature = "unicode-case")]
+    fn contains_case_map(start: char, end: char) -> bool {
+        contains_simple_case_mapping(start, end).unwrap()
+    }
+
+    // 'k', 'K' and KELVIN SIGN (U+212A) fold to one another. The Kelvin
+    // sign is written as an escape so that text normalization cannot
+    // silently turn it into an ASCII 'K'.
+    #[test]
+    #[cfg(feature = "unicode-case")]
+    fn simple_fold_k() {
+        let xs: Vec<char> = simple_fold_ok('k').collect();
+        assert_eq!(xs, vec!['K', '\u{212A}']);
+
+        let xs: Vec<char> = simple_fold_ok('K').collect();
+        assert_eq!(xs, vec!['k', '\u{212A}']);
+
+        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
+        assert_eq!(xs, vec!['K', 'k']);
+    }
+
+    #[test]
+    #[cfg(feature = "unicode-case")]
+    fn simple_fold_a() {
+        let xs: Vec<char> = simple_fold_ok('a').collect();
+        assert_eq!(xs, vec!['A']);
+
+        let xs: Vec<char> = simple_fold_ok('A').collect();
+        assert_eq!(xs, vec!['a']);
+    }
+
+    #[test]
+    #[cfg(feature = "unicode-case")]
+    fn simple_fold_empty() {
+        assert_eq!(Some('A'), simple_fold_err('?'));
+        assert_eq!(Some('A'), simple_fold_err('@'));
+        assert_eq!(Some('a'), simple_fold_err('['));
+        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
+    }
+
+    #[test]
+    #[cfg(feature = "unicode-case")]
+    fn simple_fold_max() {
+        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
+        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
+    }
+
+    #[test]
+    #[cfg(not(feature = "unicode-case"))]
+    fn simple_fold_disabled() {
+        assert!(simple_fold('a').is_err());
+    }
+
+    #[test]
+    #[cfg(feature = "unicode-case")]
+    fn range_contains() {
+        assert!(contains_case_map('A', 'A'));
+        assert!(contains_case_map('Z', 'Z'));
+        assert!(contains_case_map('A', 'Z'));
+        assert!(contains_case_map('@', 'A'));
+        assert!(contains_case_map('Z', '['));
+        assert!(contains_case_map('☃', 'Ⰰ'));
+
+        assert!(!contains_case_map('[', '['));
+        assert!(!contains_case_map('[', '`'));
+
+        assert!(!contains_case_map('☃', '☃'));
+    }
+
+    #[test]
+    #[cfg(not(feature = "unicode-case"))]
+    fn range_contains_disabled() {
+        assert!(contains_simple_case_mapping('a', 'a').is_err());
+    }
+
+    // The one-letter class `C` must canonicalize to the `Other` general
+    // category.
+    #[test]
+    #[cfg(feature = "unicode-gencat")]
+    fn regression_466() {
+        use super::{CanonicalClassQuery, ClassQuery};
+
+        let q = ClassQuery::OneLetter('C');
+        assert_eq!(
+            q.canonicalize().unwrap(),
+            CanonicalClassQuery::GeneralCategory("Other")
+        );
+    }
+
+    #[test]
+    fn sym_normalize() {
+        let sym_norm = symbolic_name_normalize;
+
+        assert_eq!(sym_norm("Line_Break"), "linebreak");
+        assert_eq!(sym_norm("Line-break"), "linebreak");
+        assert_eq!(sym_norm("linebreak"), "linebreak");
+        assert_eq!(sym_norm("BA"), "ba");
+        assert_eq!(sym_norm("ba"), "ba");
+        assert_eq!(sym_norm("Greek"), "greek");
+        assert_eq!(sym_norm("isGreek"), "greek");
+        assert_eq!(sym_norm("IS_Greek"), "greek");
+        // 'isc' keeps its "is" prefix; see the special case in
+        // `symbolic_name_normalize_bytes`.
+        assert_eq!(sym_norm("isc"), "isc");
+        assert_eq!(sym_norm("is c"), "isc");
+        assert_eq!(sym_norm("is_c"), "isc");
+    }
+
+    // Non-ASCII bytes are dropped, so the output is always valid UTF-8.
+    #[test]
+    fn valid_utf8_symbolic() {
+        let mut x = b"abc\xFFxyz".to_vec();
+        let y = symbolic_name_normalize_bytes(&mut x);
+        assert_eq!(y, b"abcxyz");
+    }
+}
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/LICENSE-UNICODE b/third_party/rust/regex-syntax/src/unicode_tables/LICENSE-UNICODE
new file mode 100644
index 0000000000..b82826bdbd
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/LICENSE-UNICODE
@@ -0,0 +1,57 @@
+UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
+
+Unicode Data Files include all data files under the directories
+http://www.unicode.org/Public/, http://www.unicode.org/reports/,
+http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
+http://www.unicode.org/utility/trac/browser/.
+
+Unicode Data Files do not include PDF online code charts under the
+directory http://www.unicode.org/Public/.
+
+Software includes any source code published in the Unicode Standard
+or under the directories
+http://www.unicode.org/Public/, http://www.unicode.org/reports/,
+http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
+http://www.unicode.org/utility/trac/browser/.
+
+NOTICE TO USER: Carefully read the following legal agreement.
+BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
+DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
+YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT.
+IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
+THE DATA FILES OR SOFTWARE.
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 1991-2018 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/age.rs b/third_party/rust/regex-syntax/src/unicode_tables/age.rs
new file mode 100644
index 0000000000..71f4861e07
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/age.rs
@@ -0,0 +1,1791 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate age ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("V10_0", V10_0),
+ ("V11_0", V11_0),
+ ("V12_0", V12_0),
+ ("V12_1", V12_1),
+ ("V13_0", V13_0),
+ ("V14_0", V14_0),
+ ("V15_0", V15_0),
+ ("V1_1", V1_1),
+ ("V2_0", V2_0),
+ ("V2_1", V2_1),
+ ("V3_0", V3_0),
+ ("V3_1", V3_1),
+ ("V3_2", V3_2),
+ ("V4_0", V4_0),
+ ("V4_1", V4_1),
+ ("V5_0", V5_0),
+ ("V5_1", V5_1),
+ ("V5_2", V5_2),
+ ("V6_0", V6_0),
+ ("V6_1", V6_1),
+ ("V6_2", V6_2),
+ ("V6_3", V6_3),
+ ("V7_0", V7_0),
+ ("V8_0", V8_0),
+ ("V9_0", V9_0),
+];
+
+pub const V10_0: &'static [(char, char)] = &[
+ ('ࡠ', 'ࡪ'),
+ ('ৼ', '৽'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{d00}', '\u{d00}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('᳷', '᳷'),
+ ('\u{1df6}', '\u{1df9}'),
+ ('₿', '₿'),
+ ('⏿', '⏿'),
+ ('⯒', '⯒'),
+ ('⹅', '⹉'),
+ ('ㄮ', 'ㄮ'),
+ ('鿖', '鿪'),
+ ('𐌭', '𐌯'),
+ ('𑨀', '\u{11a47}'),
+ ('𑩐', '𑪃'),
+ ('𑪆', '𑪜'),
+ ('𑪞', '𑪢'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+ ('𖿡', '𖿡'),
+ ('𛀂', '𛄞'),
+ ('𛅰', '𛋻'),
+ ('🉠', '🉥'),
+ ('🛓', '🛔'),
+ ('🛷', '🛸'),
+ ('🤀', '🤋'),
+ ('🤟', '🤟'),
+ ('🤨', '🤯'),
+ ('🤱', '🤲'),
+ ('🥌', '🥌'),
+ ('🥟', '🥫'),
+ ('🦒', '🦗'),
+ ('🧐', '🧦'),
+ ('𬺰', '𮯠'),
+];
+
+pub const V11_0: &'static [(char, char)] = &[
+ ('ՠ', 'ՠ'),
+ ('ֈ', 'ֈ'),
+ ('ׯ', 'ׯ'),
+ ('\u{7fd}', '߿'),
+ ('\u{8d3}', '\u{8d3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('੶', '੶'),
+ ('\u{c04}', '\u{c04}'),
+ ('಄', '಄'),
+ ('ᡸ', 'ᡸ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('⮺', '⮼'),
+ ('⯓', '⯫'),
+ ('⯰', '⯾'),
+ ('⹊', '⹎'),
+ ('ㄯ', 'ㄯ'),
+ ('鿫', '鿯'),
+ ('ꞯ', 'ꞯ'),
+ ('Ꞹ', 'ꞹ'),
+ ('ꣾ', '\u{a8ff}'),
+ ('𐨴', '𐨵'),
+ ('𐩈', '𐩈'),
+ ('𐴀', '\u{10d27}'),
+ ('𐴰', '𐴹'),
+ ('𐼀', '𐼧'),
+ ('𐼰', '𐽙'),
+ ('\u{110cd}', '\u{110cd}'),
+ ('𑅄', '𑅆'),
+ ('\u{1133b}', '\u{1133b}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('𑜚', '𑜚'),
+ ('𑠀', '𑠻'),
+ ('𑪝', '𑪝'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+ ('𑻠', '𑻸'),
+ ('𖹀', '𖺚'),
+ ('𘟭', '𘟱'),
+ ('𝋠', '𝋳'),
+ ('𝍲', '𝍸'),
+ ('𞱱', '𞲴'),
+ ('🄯', '🄯'),
+ ('🛹', '🛹'),
+ ('🟕', '🟘'),
+ ('🥍', '🥏'),
+ ('🥬', '🥰'),
+ ('🥳', '🥶'),
+ ('🥺', '🥺'),
+ ('🥼', '🥿'),
+ ('🦘', '🦢'),
+ ('🦰', '🦹'),
+ ('🧁', '🧂'),
+ ('🧧', '🧿'),
+ ('🩠', '🩭'),
+];
+
+pub const V12_0: &'static [(char, char)] = &[
+ ('౷', '౷'),
+ ('ຆ', 'ຆ'),
+ ('ຉ', 'ຉ'),
+ ('ຌ', 'ຌ'),
+ ('ຎ', 'ຓ'),
+ ('ຘ', 'ຘ'),
+ ('ຠ', 'ຠ'),
+ ('ຨ', 'ຩ'),
+ ('ຬ', 'ຬ'),
+ ('\u{eba}', '\u{eba}'),
+ ('ᳺ', 'ᳺ'),
+ ('⯉', '⯉'),
+ ('⯿', '⯿'),
+ ('⹏', '⹏'),
+ ('Ꞻ', 'ꞿ'),
+ ('Ꟃ', 'Ᶎ'),
+ ('ꭦ', 'ꭧ'),
+ ('𐿠', '𐿶'),
+ ('𑑟', '𑑟'),
+ ('𑚸', '𑚸'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧤'),
+ ('𑪄', '𑪅'),
+ ('𑿀', '𑿱'),
+ ('𑿿', '𑿿'),
+ ('\u{13430}', '\u{13438}'),
+ ('𖽅', '𖽊'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('𖽿', '𖾇'),
+ ('𖿢', '𖿣'),
+ ('𘟲', '𘟷'),
+ ('𛅐', '𛅒'),
+ ('𛅤', '𛅧'),
+ ('𞄀', '𞄬'),
+ ('\u{1e130}', '𞄽'),
+ ('𞅀', '𞅉'),
+ ('𞅎', '𞅏'),
+ ('𞋀', '𞋹'),
+ ('𞋿', '𞋿'),
+ ('𞥋', '𞥋'),
+ ('𞴁', '𞴽'),
+ ('🅬', '🅬'),
+ ('🛕', '🛕'),
+ ('🛺', '🛺'),
+ ('🟠', '🟫'),
+ ('🤍', '🤏'),
+ ('🤿', '🤿'),
+ ('🥱', '🥱'),
+ ('🥻', '🥻'),
+ ('🦥', '🦪'),
+ ('🦮', '🦯'),
+ ('🦺', '🦿'),
+ ('🧃', '🧊'),
+ ('🧍', '🧏'),
+ ('🨀', '🩓'),
+ ('🩰', '🩳'),
+ ('🩸', '🩺'),
+ ('🪀', '🪂'),
+ ('🪐', '🪕'),
+];
+
+pub const V12_1: &'static [(char, char)] = &[('㋿', '㋿')];
+
+pub const V13_0: &'static [(char, char)] = &[
+ ('ࢾ', 'ࣇ'),
+ ('\u{b55}', '\u{b55}'),
+ ('ഄ', 'ഄ'),
+ ('\u{d81}', '\u{d81}'),
+ ('\u{1abf}', '\u{1ac0}'),
+ ('⮗', '⮗'),
+ ('⹐', '⹒'),
+ ('ㆻ', 'ㆿ'),
+ ('䶶', '䶿'),
+ ('鿰', '鿼'),
+ ('Ꟈ', 'ꟊ'),
+ ('Ꟶ', 'ꟶ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꭨ', '꭫'),
+ ('𐆜', '𐆜'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '𐺭'),
+ ('𐺰', '𐺱'),
+ ('𐾰', '𐿋'),
+ ('𑅇', '𑅇'),
+ ('𑇎', '\u{111cf}'),
+ ('𑑚', '𑑚'),
+ ('𑑠', '𑑡'),
+ ('𑤀', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '𑥆'),
+ ('𑥐', '𑥙'),
+ ('𑾰', '𑾰'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('𘫳', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('🄍', '🄏'),
+ ('🅭', '🅯'),
+ ('🆭', '🆭'),
+ ('🛖', '🛗'),
+ ('🛻', '🛼'),
+ ('🢰', '🢱'),
+ ('🤌', '🤌'),
+ ('🥲', '🥲'),
+ ('🥷', '🥸'),
+ ('🦣', '🦤'),
+ ('🦫', '🦭'),
+ ('🧋', '🧋'),
+ ('🩴', '🩴'),
+ ('🪃', '🪆'),
+ ('🪖', '🪨'),
+ ('🪰', '🪶'),
+ ('🫀', '🫂'),
+ ('🫐', '🫖'),
+ ('🬀', '🮒'),
+ ('🮔', '🯊'),
+ ('🯰', '🯹'),
+ ('𪛗', '𪛝'),
+ ('𰀀', '𱍊'),
+];
+
+pub const V14_0: &'static [(char, char)] = &[
+ ('؝', '؝'),
+ ('ࡰ', 'ࢎ'),
+ ('\u{890}', '\u{891}'),
+ ('\u{898}', '\u{89f}'),
+ ('ࢵ', 'ࢵ'),
+ ('ࣈ', '\u{8d2}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('ౝ', 'ౝ'),
+ ('ೝ', 'ೝ'),
+ ('ᜍ', 'ᜍ'),
+ ('᜕', '᜕'),
+ ('ᜟ', 'ᜟ'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1ac1}', '\u{1ace}'),
+ ('ᭌ', 'ᭌ'),
+ ('᭽', '᭾'),
+ ('\u{1dfa}', '\u{1dfa}'),
+ ('⃀', '⃀'),
+ ('Ⱟ', 'Ⱟ'),
+ ('ⱟ', 'ⱟ'),
+ ('⹓', '⹝'),
+ ('鿽', '鿿'),
+ ('Ꟁ', 'ꟁ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꟴ'),
+ ('﯂', '﯂'),
+ ('﵀', '﵏'),
+ ('﷏', '﷏'),
+ ('﷾', '﷿'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐽰', '𐾉'),
+ ('\u{11070}', '𑁵'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('𑚹', '𑚹'),
+ ('𑝀', '𑝆'),
+ ('𑪰', '𑪿'),
+ ('𒾐', '𒿲'),
+ ('𖩰', '𖪾'),
+ ('𖫀', '𖫉'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛄟', '𛄢'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('𜽐', '𜿃'),
+ ('𝇩', '𝇪'),
+ ('𝼀', '𝼞'),
+ ('𞊐', '\u{1e2ae}'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('🛝', '🛟'),
+ ('🟰', '🟰'),
+ ('🥹', '🥹'),
+ ('🧌', '🧌'),
+ ('🩻', '🩼'),
+ ('🪩', '🪬'),
+ ('🪷', '🪺'),
+ ('🫃', '🫅'),
+ ('🫗', '🫙'),
+ ('🫠', '🫧'),
+ ('🫰', '🫶'),
+ ('𪛞', '𪛟'),
+ ('𫜵', '𫜸'),
+];
+
+pub const V15_0: &'static [(char, char)] = &[
+ ('ೳ', 'ೳ'),
+ ('\u{ece}', '\u{ece}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('𑈿', '\u{11241}'),
+ ('𑬀', '𑬉'),
+ ('\u{11f00}', '𑼐'),
+ ('𑼒', '\u{11f3a}'),
+ ('𑼾', '𑽙'),
+ ('𓐯', '𓐯'),
+ ('\u{13439}', '\u{13455}'),
+ ('𛄲', '𛄲'),
+ ('𛅕', '𛅕'),
+ ('𝋀', '𝋓'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('𞓐', '𞓹'),
+ ('🛜', '🛜'),
+ ('🝴', '🝶'),
+ ('🝻', '🝿'),
+ ('🟙', '🟙'),
+ ('🩵', '🩷'),
+ ('🪇', '🪈'),
+ ('🪭', '🪯'),
+ ('🪻', '🪽'),
+ ('🪿', '🪿'),
+ ('🫎', '🫏'),
+ ('🫚', '🫛'),
+ ('🫨', '🫨'),
+ ('🫷', '🫸'),
+ ('𫜹', '𫜹'),
+ ('𱍐', '𲎯'),
+];
+
+pub const V1_1: &'static [(char, char)] = &[
+ ('\0', 'ǵ'),
+ ('Ǻ', 'ȗ'),
+ ('ɐ', 'ʨ'),
+ ('ʰ', '˞'),
+ ('ˠ', '˩'),
+ ('\u{300}', '\u{345}'),
+ ('\u{360}', '\u{361}'),
+ ('ʹ', '͵'),
+ ('ͺ', 'ͺ'),
+ (';', ';'),
+ ('΄', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ώ'),
+ ('ϐ', 'ϖ'),
+ ('Ϛ', 'Ϛ'),
+ ('Ϝ', 'Ϝ'),
+ ('Ϟ', 'Ϟ'),
+ ('Ϡ', 'Ϡ'),
+ ('Ϣ', 'ϳ'),
+ ('Ё', 'Ќ'),
+ ('Ў', 'я'),
+ ('ё', 'ќ'),
+ ('ў', '\u{486}'),
+ ('Ґ', 'ӄ'),
+ ('Ӈ', 'ӈ'),
+ ('Ӌ', 'ӌ'),
+ ('Ӑ', 'ӫ'),
+ ('Ӯ', 'ӵ'),
+ ('Ӹ', 'ӹ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', '՟'),
+ ('ա', 'և'),
+ ('։', '։'),
+ ('\u{5b0}', '\u{5b9}'),
+ ('\u{5bb}', '׃'),
+ ('א', 'ת'),
+ ('װ', '״'),
+ ('،', '،'),
+ ('؛', '؛'),
+ ('؟', '؟'),
+ ('ء', 'غ'),
+ ('ـ', '\u{652}'),
+ ('٠', '٭'),
+ ('\u{670}', 'ڷ'),
+ ('ں', 'ھ'),
+ ('ۀ', 'ێ'),
+ ('ې', '\u{6ed}'),
+ ('۰', '۹'),
+ ('\u{901}', 'ः'),
+ ('अ', 'ह'),
+ ('\u{93c}', '\u{94d}'),
+ ('ॐ', '\u{954}'),
+ ('क़', '॰'),
+ ('\u{981}', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9be}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', '\u{9cd}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', '৺'),
+ ('\u{a02}', '\u{a02}'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', 'ੴ'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઋ'),
+ ('ઍ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૠ'),
+ ('૦', '૯'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଶ', 'ହ'),
+ ('\u{b3c}', '\u{b43}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b56}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('୦', '୰'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'வ'),
+ ('ஷ', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௧', '௲'),
+ ('ఁ', 'ః'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'ళ'),
+ ('వ', 'హ'),
+ ('\u{c3e}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౠ', 'ౡ'),
+ ('౦', '౯'),
+ ('ಂ', 'ಃ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಾ', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೞ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('೦', '೯'),
+ ('ം', 'ഃ'),
+ ('അ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ന'),
+ ('പ', 'ഹ'),
+ ('\u{d3e}', '\u{d43}'),
+ ('െ', 'ൈ'),
+ ('ൊ', '\u{d4d}'),
+ ('\u{d57}', '\u{d57}'),
+ ('ൠ', 'ൡ'),
+ ('൦', '൯'),
+ ('ก', '\u{e3a}'),
+ ('฿', '๛'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ງ', 'ຈ'),
+ ('ຊ', 'ຊ'),
+ ('ຍ', 'ຍ'),
+ ('ດ', 'ທ'),
+ ('ນ', 'ຟ'),
+ ('ມ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ວ'),
+ ('ສ', 'ຫ'),
+ ('ອ', '\u{eb9}'),
+ ('\u{ebb}', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ecd}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໝ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('ა', 'ჶ'),
+ ('჻', '჻'),
+ ('ᄀ', 'ᅙ'),
+ ('ᅟ', 'ᆢ'),
+ ('ᆨ', 'ᇹ'),
+ ('Ḁ', 'ẚ'),
+ ('Ạ', 'ỹ'),
+ ('ἀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ῄ'),
+ ('ῆ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('῝', '`'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', '῾'),
+ ('\u{2000}', '\u{202e}'),
+ ('‰', '⁆'),
+ ('\u{206a}', '⁰'),
+ ('⁴', '₎'),
+ ('₠', '₪'),
+ ('\u{20d0}', '\u{20e1}'),
+ ('℀', 'ℸ'),
+ ('⅓', 'ↂ'),
+ ('←', '⇪'),
+ ('∀', '⋱'),
+ ('⌀', '⌀'),
+ ('⌂', '⍺'),
+ ('␀', '␤'),
+ ('⑀', '⑊'),
+ ('①', '⓪'),
+ ('─', '▕'),
+ ('■', '◯'),
+ ('☀', '☓'),
+ ('☚', '♯'),
+ ('✁', '✄'),
+ ('✆', '✉'),
+ ('✌', '✧'),
+ ('✩', '❋'),
+ ('❍', '❍'),
+ ('❏', '❒'),
+ ('❖', '❖'),
+ ('❘', '❞'),
+ ('❡', '❧'),
+ ('❶', '➔'),
+ ('➘', '➯'),
+ ('➱', '➾'),
+ ('\u{3000}', '〷'),
+ ('〿', '〿'),
+ ('ぁ', 'ゔ'),
+ ('\u{3099}', 'ゞ'),
+ ('ァ', 'ヾ'),
+ ('ㄅ', 'ㄬ'),
+ ('ㄱ', 'ㆎ'),
+ ('㆐', '㆟'),
+ ('㈀', '㈜'),
+ ('㈠', '㉃'),
+ ('㉠', '㉻'),
+ ('㉿', '㊰'),
+ ('㋀', '㋋'),
+ ('㋐', '㋾'),
+ ('㌀', '㍶'),
+ ('㍻', '㏝'),
+ ('㏠', '㏾'),
+ ('一', '龥'),
+ ('\u{e000}', '鶴'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('\u{fb1e}', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', '﴿'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('\u{fe20}', '\u{fe23}'),
+ ('︰', '﹄'),
+ ('﹉', '﹒'),
+ ('﹔', '﹦'),
+ ('﹨', '﹫'),
+ ('ﹰ', 'ﹲ'),
+ ('ﹴ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('\u{feff}', '\u{feff}'),
+ ('!', '~'),
+ ('。', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('¢', '₩'),
+ ('│', '○'),
+ ('�', '\u{ffff}'),
+];
+
+pub const V2_0: &'static [(char, char)] = &[
+ ('\u{591}', '\u{5a1}'),
+ ('\u{5a3}', '\u{5af}'),
+ ('\u{5c4}', '\u{5c4}'),
+ ('ༀ', 'ཇ'),
+ ('ཉ', 'ཀྵ'),
+ ('\u{f71}', 'ྋ'),
+ ('\u{f90}', '\u{f95}'),
+ ('\u{f97}', '\u{f97}'),
+ ('\u{f99}', '\u{fad}'),
+ ('\u{fb1}', '\u{fb7}'),
+ ('\u{fb9}', '\u{fb9}'),
+ ('ẛ', 'ẛ'),
+ ('₫', '₫'),
+ ('가', '힣'),
+ ('\u{1fffe}', '\u{1ffff}'),
+ ('\u{2fffe}', '\u{2ffff}'),
+ ('\u{3fffe}', '\u{3ffff}'),
+ ('\u{4fffe}', '\u{4ffff}'),
+ ('\u{5fffe}', '\u{5ffff}'),
+ ('\u{6fffe}', '\u{6ffff}'),
+ ('\u{7fffe}', '\u{7ffff}'),
+ ('\u{8fffe}', '\u{8ffff}'),
+ ('\u{9fffe}', '\u{9ffff}'),
+ ('\u{afffe}', '\u{affff}'),
+ ('\u{bfffe}', '\u{bffff}'),
+ ('\u{cfffe}', '\u{cffff}'),
+ ('\u{dfffe}', '\u{dffff}'),
+ ('\u{efffe}', '\u{10ffff}'),
+];
+
+pub const V2_1: &'static [(char, char)] = &[('€', '€'), ('', '')];
+
+pub const V3_0: &'static [(char, char)] = &[
+ ('Ƕ', 'ǹ'),
+ ('Ș', 'ȟ'),
+ ('Ȣ', 'ȳ'),
+ ('ʩ', 'ʭ'),
+ ('˟', '˟'),
+ ('˪', 'ˮ'),
+ ('\u{346}', '\u{34e}'),
+ ('\u{362}', '\u{362}'),
+ ('ϗ', 'ϗ'),
+ ('ϛ', 'ϛ'),
+ ('ϝ', 'ϝ'),
+ ('ϟ', 'ϟ'),
+ ('ϡ', 'ϡ'),
+ ('Ѐ', 'Ѐ'),
+ ('Ѝ', 'Ѝ'),
+ ('ѐ', 'ѐ'),
+ ('ѝ', 'ѝ'),
+ ('\u{488}', '\u{489}'),
+ ('Ҍ', 'ҏ'),
+ ('Ӭ', 'ӭ'),
+ ('֊', '֊'),
+ ('\u{653}', '\u{655}'),
+ ('ڸ', 'ڹ'),
+ ('ڿ', 'ڿ'),
+ ('ۏ', 'ۏ'),
+ ('ۺ', '۾'),
+ ('܀', '܍'),
+ ('\u{70f}', 'ܬ'),
+ ('\u{730}', '\u{74a}'),
+ ('ހ', '\u{7b0}'),
+ ('ං', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', '෴'),
+ ('ཪ', 'ཪ'),
+ ('\u{f96}', '\u{f96}'),
+ ('\u{fae}', '\u{fb0}'),
+ ('\u{fb8}', '\u{fb8}'),
+ ('\u{fba}', '\u{fbc}'),
+ ('྾', '࿌'),
+ ('࿏', '࿏'),
+ ('က', 'အ'),
+ ('ဣ', 'ဧ'),
+ ('ဩ', 'ဪ'),
+ ('ာ', '\u{1032}'),
+ ('\u{1036}', '\u{1039}'),
+ ('၀', '\u{1059}'),
+ ('ሀ', 'ሆ'),
+ ('ለ', 'ቆ'),
+ ('ቈ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኆ'),
+ ('ኈ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኮ'),
+ ('ኰ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዎ'),
+ ('ዐ', 'ዖ'),
+ ('ዘ', 'ዮ'),
+ ('ደ', 'ጎ'),
+ ('ጐ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ጞ'),
+ ('ጠ', 'ፆ'),
+ ('ፈ', 'ፚ'),
+ ('፡', '፼'),
+ ('Ꭰ', 'Ᏼ'),
+ ('ᐁ', 'ᙶ'),
+ ('\u{1680}', '᚜'),
+ ('ᚠ', 'ᛰ'),
+ ('ក', 'ៜ'),
+ ('០', '៩'),
+ ('᠀', '\u{180e}'),
+ ('᠐', '᠙'),
+ ('ᠠ', 'ᡷ'),
+ ('ᢀ', '\u{18a9}'),
+ ('\u{202f}', '\u{202f}'),
+ ('⁈', '⁍'),
+ ('₭', '₯'),
+ ('\u{20e2}', '\u{20e3}'),
+ ('ℹ', '℺'),
+ ('Ↄ', 'Ↄ'),
+ ('⇫', '⇳'),
+ ('⌁', '⌁'),
+ ('⍻', '⍻'),
+ ('⍽', '⎚'),
+ ('␥', '␦'),
+ ('◰', '◷'),
+ ('☙', '☙'),
+ ('♰', '♱'),
+ ('⠀', '⣿'),
+ ('⺀', '⺙'),
+ ('⺛', '⻳'),
+ ('⼀', '⿕'),
+ ('⿰', '⿻'),
+ ('〸', '〺'),
+ ('〾', '〾'),
+ ('ㆠ', 'ㆷ'),
+ ('㐀', '䶵'),
+ ('ꀀ', 'ꒌ'),
+ ('꒐', '꒡'),
+ ('꒤', '꒳'),
+ ('꒵', '꓀'),
+ ('꓂', '꓄'),
+ ('꓆', '꓆'),
+ ('יִ', 'יִ'),
+ ('\u{fff9}', '\u{fffb}'),
+];
+
+pub const V3_1: &'static [(char, char)] = &[
+ ('ϴ', 'ϵ'),
+ ('\u{fdd0}', '\u{fdef}'),
+ ('𐌀', '𐌞'),
+ ('𐌠', '𐌣'),
+ ('𐌰', '𐍊'),
+ ('𐐀', '𐐥'),
+ ('𐐨', '𐑍'),
+ ('𝀀', '𝃵'),
+ ('𝄀', '𝄦'),
+ ('𝄪', '𝇝'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓀'),
+ ('𝓂', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚣'),
+ ('𝚨', '𝟉'),
+ ('𝟎', '𝟿'),
+ ('𠀀', '𪛖'),
+ ('丽', '𪘀'),
+ ('\u{e0001}', '\u{e0001}'),
+ ('\u{e0020}', '\u{e007f}'),
+];
+
+pub const V3_2: &'static [(char, char)] = &[
+ ('Ƞ', 'Ƞ'),
+ ('\u{34f}', '\u{34f}'),
+ ('\u{363}', '\u{36f}'),
+ ('Ϙ', 'ϙ'),
+ ('϶', '϶'),
+ ('Ҋ', 'ҋ'),
+ ('Ӆ', 'ӆ'),
+ ('Ӊ', 'ӊ'),
+ ('Ӎ', 'ӎ'),
+ ('Ԁ', 'ԏ'),
+ ('ٮ', 'ٯ'),
+ ('ޱ', 'ޱ'),
+ ('ჷ', 'ჸ'),
+ ('ᜀ', 'ᜌ'),
+ ('ᜎ', '\u{1714}'),
+ ('ᜠ', '᜶'),
+ ('ᝀ', '\u{1753}'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('\u{1772}', '\u{1773}'),
+ ('⁇', '⁇'),
+ ('⁎', '⁒'),
+ ('⁗', '⁗'),
+ ('\u{205f}', '\u{2063}'),
+ ('ⁱ', 'ⁱ'),
+ ('₰', '₱'),
+ ('\u{20e4}', '\u{20ea}'),
+ ('ℽ', '⅋'),
+ ('⇴', '⇿'),
+ ('⋲', '⋿'),
+ ('⍼', '⍼'),
+ ('⎛', '⏎'),
+ ('⓫', '⓾'),
+ ('▖', '▟'),
+ ('◸', '◿'),
+ ('☖', '☗'),
+ ('♲', '♽'),
+ ('⚀', '⚉'),
+ ('❨', '❵'),
+ ('⟐', '⟫'),
+ ('⟰', '⟿'),
+ ('⤀', '⫿'),
+ ('〻', '〽'),
+ ('ゕ', 'ゖ'),
+ ('ゟ', '゠'),
+ ('ヿ', 'ヿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㉑', '㉟'),
+ ('㊱', '㊿'),
+ ('꒢', '꒣'),
+ ('꒴', '꒴'),
+ ('꓁', '꓁'),
+ ('꓅', '꓅'),
+ ('侮', '頻'),
+ ('﷼', '﷼'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('﹅', '﹆'),
+ ('ﹳ', 'ﹳ'),
+ ('⦅', '⦆'),
+];
+
+pub const V4_0: &'static [(char, char)] = &[
+ ('ȡ', 'ȡ'),
+ ('ȴ', 'ȶ'),
+ ('ʮ', 'ʯ'),
+ ('˯', '˿'),
+ ('\u{350}', '\u{357}'),
+ ('\u{35d}', '\u{35f}'),
+ ('Ϸ', 'ϻ'),
+ ('\u{600}', '\u{603}'),
+ ('؍', '\u{615}'),
+ ('\u{656}', '\u{658}'),
+ ('ۮ', 'ۯ'),
+ ('ۿ', 'ۿ'),
+ ('ܭ', 'ܯ'),
+ ('ݍ', 'ݏ'),
+ ('ऄ', 'ऄ'),
+ ('ঽ', 'ঽ'),
+ ('\u{a01}', '\u{a01}'),
+ ('ਃ', 'ਃ'),
+ ('ઌ', 'ઌ'),
+ ('ૡ', '\u{ae3}'),
+ ('૱', '૱'),
+ ('ଵ', 'ଵ'),
+ ('ୱ', 'ୱ'),
+ ('௳', '௺'),
+ ('\u{cbc}', 'ಽ'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('៰', '៹'),
+ ('ᤀ', 'ᤜ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥀', '᥀'),
+ ('᥄', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('᧠', '᧿'),
+ ('ᴀ', 'ᵫ'),
+ ('⁓', '⁔'),
+ ('℻', '℻'),
+ ('⏏', '⏐'),
+ ('⓿', '⓿'),
+ ('☔', '☕'),
+ ('⚊', '⚑'),
+ ('⚠', '⚡'),
+ ('⬀', '⬍'),
+ ('㈝', '㈞'),
+ ('㉐', '㉐'),
+ ('㉼', '㉽'),
+ ('㋌', '㋏'),
+ ('㍷', '㍺'),
+ ('㏞', '㏟'),
+ ('㏿', '㏿'),
+ ('䷀', '䷿'),
+ ('﷽', '﷽'),
+ ('﹇', '﹈'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐄀', '𐄂'),
+ ('𐄇', '𐄳'),
+ ('𐄷', '𐄿'),
+ ('𐎀', '𐎝'),
+ ('𐎟', '𐎟'),
+ ('𐐦', '𐐧'),
+ ('𐑎', '𐒝'),
+ ('𐒠', '𐒩'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐠿'),
+ ('𝌀', '𝍖'),
+ ('𝓁', '𝓁'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const V4_1: &'static [(char, char)] = &[
+ ('ȷ', 'Ɂ'),
+ ('\u{358}', '\u{35c}'),
+ ('ϼ', 'Ͽ'),
+ ('Ӷ', 'ӷ'),
+ ('\u{5a2}', '\u{5a2}'),
+ ('\u{5c5}', '\u{5c7}'),
+ ('؋', '؋'),
+ ('؞', '؞'),
+ ('\u{659}', '\u{65e}'),
+ ('ݐ', 'ݭ'),
+ ('ॽ', 'ॽ'),
+ ('ৎ', 'ৎ'),
+ ('ஶ', 'ஶ'),
+ ('௦', '௦'),
+ ('࿐', '࿑'),
+ ('ჹ', 'ჺ'),
+ ('ჼ', 'ჼ'),
+ ('ሇ', 'ሇ'),
+ ('ቇ', 'ቇ'),
+ ('ኇ', 'ኇ'),
+ ('ኯ', 'ኯ'),
+ ('ዏ', 'ዏ'),
+ ('ዯ', 'ዯ'),
+ ('ጏ', 'ጏ'),
+ ('ጟ', 'ጟ'),
+ ('ፇ', 'ፇ'),
+ ('\u{135f}', '፠'),
+ ('ᎀ', '᎙'),
+ ('ᦀ', 'ᦩ'),
+ ('ᦰ', 'ᧉ'),
+ ('᧐', '᧙'),
+ ('᧞', '᧟'),
+ ('ᨀ', '\u{1a1b}'),
+ ('᨞', '᨟'),
+ ('ᵬ', '\u{1dc3}'),
+ ('⁕', '⁖'),
+ ('⁘', '⁞'),
+ ('ₐ', 'ₔ'),
+ ('₲', '₵'),
+ ('\u{20eb}', '\u{20eb}'),
+ ('ℼ', 'ℼ'),
+ ('⅌', '⅌'),
+ ('⏑', '⏛'),
+ ('☘', '☘'),
+ ('♾', '♿'),
+ ('⚒', '⚜'),
+ ('⚢', '⚱'),
+ ('⟀', '⟆'),
+ ('⬎', '⬓'),
+ ('Ⰰ', 'Ⱞ'),
+ ('ⰰ', 'ⱞ'),
+ ('Ⲁ', '⳪'),
+ ('⳹', 'ⴥ'),
+ ('ⴰ', 'ⵥ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('⸀', '⸗'),
+ ('⸜', '⸝'),
+ ('㇀', '㇏'),
+ ('㉾', '㉾'),
+ ('龦', '龻'),
+ ('꜀', '꜖'),
+ ('ꠀ', '꠫'),
+ ('並', '龎'),
+ ('︐', '︙'),
+ ('𐅀', '𐆊'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏕'),
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨳'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '𐩇'),
+ ('𐩐', '𐩘'),
+ ('𝈀', '𝉅'),
+ ('𝚤', '𝚥'),
+];
+
+pub const V5_0: &'static [(char, char)] = &[
+ ('ɂ', 'ɏ'),
+ ('ͻ', 'ͽ'),
+ ('ӏ', 'ӏ'),
+ ('Ӻ', 'ӿ'),
+ ('Ԑ', 'ԓ'),
+ ('\u{5ba}', '\u{5ba}'),
+ ('߀', 'ߺ'),
+ ('ॻ', 'ॼ'),
+ ('ॾ', 'ॿ'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('ೱ', 'ೲ'),
+ ('\u{1b00}', 'ᭋ'),
+ ('᭐', '᭼'),
+ ('\u{1dc4}', '\u{1dca}'),
+ ('\u{1dfe}', '\u{1dff}'),
+ ('\u{20ec}', '\u{20ef}'),
+ ('⅍', 'ⅎ'),
+ ('ↄ', 'ↄ'),
+ ('⏜', '⏧'),
+ ('⚲', '⚲'),
+ ('⟇', '⟊'),
+ ('⬔', '⬚'),
+ ('⬠', '⬣'),
+ ('Ⱡ', 'ⱬ'),
+ ('ⱴ', 'ⱷ'),
+ ('ꜗ', 'ꜚ'),
+ ('꜠', '꜡'),
+ ('ꡀ', '꡷'),
+ ('𐤀', '𐤙'),
+ ('𐤟', '𐤟'),
+ ('𒀀', '𒍮'),
+ ('𒐀', '𒑢'),
+ ('𒑰', '𒑳'),
+ ('𝍠', '𝍱'),
+ ('𝟊', '𝟋'),
+];
+
+pub const V5_1: &'static [(char, char)] = &[
+ ('Ͱ', 'ͳ'),
+ ('Ͷ', 'ͷ'),
+ ('Ϗ', 'Ϗ'),
+ ('\u{487}', '\u{487}'),
+ ('Ԕ', 'ԣ'),
+ ('؆', '؊'),
+ ('\u{616}', '\u{61a}'),
+ ('ػ', 'ؿ'),
+ ('ݮ', 'ݿ'),
+ ('ॱ', 'ॲ'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{b44}', '\u{b44}'),
+ ('\u{b62}', '\u{b63}'),
+ ('ௐ', 'ௐ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౙ'),
+ ('\u{c62}', '\u{c63}'),
+ ('౸', '౿'),
+ ('ഽ', 'ഽ'),
+ ('\u{d44}', '\u{d44}'),
+ ('\u{d62}', '\u{d63}'),
+ ('൰', '൵'),
+ ('൹', 'ൿ'),
+ ('ཫ', 'ཬ'),
+ ('࿎', '࿎'),
+ ('࿒', '࿔'),
+ ('ဢ', 'ဢ'),
+ ('ဨ', 'ဨ'),
+ ('ါ', 'ါ'),
+ ('\u{1033}', '\u{1035}'),
+ ('\u{103a}', 'ဿ'),
+ ('ၚ', '႙'),
+ ('႞', '႟'),
+ ('ᢪ', 'ᢪ'),
+ ('\u{1b80}', '᮪'),
+ ('ᮮ', '᮹'),
+ ('ᰀ', '\u{1c37}'),
+ ('᰻', '᱉'),
+ ('ᱍ', '᱿'),
+ ('\u{1dcb}', '\u{1de6}'),
+ ('ẜ', 'ẟ'),
+ ('Ỻ', 'ỿ'),
+ ('\u{2064}', '\u{2064}'),
+ ('\u{20f0}', '\u{20f0}'),
+ ('⅏', '⅏'),
+ ('ↅ', 'ↈ'),
+ ('⚝', '⚝'),
+ ('⚳', '⚼'),
+ ('⛀', '⛃'),
+ ('⟌', '⟌'),
+ ('⟬', '⟯'),
+ ('⬛', '⬟'),
+ ('⬤', '⭌'),
+ ('⭐', '⭔'),
+ ('Ɑ', 'Ɐ'),
+ ('ⱱ', 'ⱳ'),
+ ('ⱸ', 'ⱽ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('⸘', '⸛'),
+ ('⸞', '⸰'),
+ ('ㄭ', 'ㄭ'),
+ ('㇐', '㇣'),
+ ('龼', '鿃'),
+ ('ꔀ', 'ꘫ'),
+ ('Ꙁ', 'ꙟ'),
+ ('Ꙣ', '꙳'),
+ ('\u{a67c}', 'ꚗ'),
+ ('ꜛ', 'ꜟ'),
+ ('Ꜣ', 'ꞌ'),
+ ('ꟻ', 'ꟿ'),
+ ('ꢀ', '\u{a8c4}'),
+ ('꣎', '꣙'),
+ ('꤀', '꥓'),
+ ('꥟', '꥟'),
+ ('ꨀ', '\u{aa36}'),
+ ('ꩀ', 'ꩍ'),
+ ('꩐', '꩙'),
+ ('꩜', '꩟'),
+ ('\u{fe24}', '\u{fe26}'),
+ ('𐆐', '𐆛'),
+ ('𐇐', '\u{101fd}'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐤠', '𐤹'),
+ ('𐤿', '𐤿'),
+ ('𝄩', '𝄩'),
+ ('🀀', '🀫'),
+ ('🀰', '🂓'),
+];
+
+pub const V5_2: &'static [(char, char)] = &[
+ ('Ԥ', 'ԥ'),
+ ('ࠀ', '\u{82d}'),
+ ('࠰', '࠾'),
+ ('\u{900}', '\u{900}'),
+ ('ॎ', 'ॎ'),
+ ('\u{955}', '\u{955}'),
+ ('ॹ', 'ॺ'),
+ ('৻', '৻'),
+ ('࿕', '࿘'),
+ ('ႚ', '\u{109d}'),
+ ('ᅚ', 'ᅞ'),
+ ('ᆣ', 'ᆧ'),
+ ('ᇺ', 'ᇿ'),
+ ('᐀', '᐀'),
+ ('ᙷ', 'ᙿ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᦪ', 'ᦫ'),
+ ('᧚', '᧚'),
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('᪠', '᪭'),
+ ('\u{1cd0}', 'ᳲ'),
+ ('\u{1dfd}', '\u{1dfd}'),
+ ('₶', '₸'),
+ ('⅐', '⅒'),
+ ('↉', '↉'),
+ ('⏨', '⏨'),
+ ('⚞', '⚟'),
+ ('⚽', '⚿'),
+ ('⛄', '⛍'),
+ ('⛏', '⛡'),
+ ('⛣', '⛣'),
+ ('⛨', '⛿'),
+ ('❗', '❗'),
+ ('⭕', '⭙'),
+ ('Ɒ', 'Ɒ'),
+ ('Ȿ', 'Ɀ'),
+ ('Ⳬ', '\u{2cf1}'),
+ ('⸱', '⸱'),
+ ('㉄', '㉏'),
+ ('鿄', '鿋'),
+ ('ꓐ', '꓿'),
+ ('ꚠ', '꛷'),
+ ('꠰', '꠹'),
+ ('\u{a8e0}', 'ꣻ'),
+ ('ꥠ', 'ꥼ'),
+ ('\u{a980}', '꧍'),
+ ('ꧏ', '꧙'),
+ ('꧞', '꧟'),
+ ('ꩠ', 'ꩻ'),
+ ('ꪀ', 'ꫂ'),
+ ('ꫛ', '꫟'),
+ ('ꯀ', '\u{abed}'),
+ ('꯰', '꯹'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('恵', '舘'),
+ ('𐡀', '𐡕'),
+ ('𐡗', '𐡟'),
+ ('𐤚', '𐤛'),
+ ('𐩠', '𐩿'),
+ ('𐬀', '𐬵'),
+ ('𐬹', '𐭕'),
+ ('𐭘', '𐭲'),
+ ('𐭸', '𐭿'),
+ ('𐰀', '𐱈'),
+ ('𐹠', '𐹾'),
+ ('\u{11080}', '𑃁'),
+ ('𓀀', '𓐮'),
+ ('🄀', '🄊'),
+ ('🄐', '🄮'),
+ ('🄱', '🄱'),
+ ('🄽', '🄽'),
+ ('🄿', '🄿'),
+ ('🅂', '🅂'),
+ ('🅆', '🅆'),
+ ('🅊', '🅎'),
+ ('🅗', '🅗'),
+ ('🅟', '🅟'),
+ ('🅹', '🅹'),
+ ('🅻', '🅼'),
+ ('🅿', '🅿'),
+ ('🆊', '🆍'),
+ ('🆐', '🆐'),
+ ('🈀', '🈀'),
+ ('🈐', '🈱'),
+ ('🉀', '🉈'),
+ ('𪜀', '𫜴'),
+];
+
+pub const V6_0: &'static [(char, char)] = &[
+ ('Ԧ', 'ԧ'),
+ ('ؠ', 'ؠ'),
+ ('\u{65f}', '\u{65f}'),
+ ('ࡀ', '\u{85b}'),
+ ('࡞', '࡞'),
+ ('\u{93a}', 'ऻ'),
+ ('ॏ', 'ॏ'),
+ ('\u{956}', '\u{957}'),
+ ('ॳ', 'ॷ'),
+ ('୲', '୷'),
+ ('ഩ', 'ഩ'),
+ ('ഺ', 'ഺ'),
+ ('ൎ', 'ൎ'),
+ ('ྌ', '\u{f8f}'),
+ ('࿙', '࿚'),
+ ('\u{135d}', '\u{135e}'),
+ ('ᯀ', '᯳'),
+ ('᯼', '᯿'),
+ ('\u{1dfc}', '\u{1dfc}'),
+ ('ₕ', 'ₜ'),
+ ('₹', '₹'),
+ ('⏩', '⏳'),
+ ('⛎', '⛎'),
+ ('⛢', '⛢'),
+ ('⛤', '⛧'),
+ ('✅', '✅'),
+ ('✊', '✋'),
+ ('✨', '✨'),
+ ('❌', '❌'),
+ ('❎', '❎'),
+ ('❓', '❕'),
+ ('❟', '❠'),
+ ('➕', '➗'),
+ ('➰', '➰'),
+ ('➿', '➿'),
+ ('⟎', '⟏'),
+ ('⵰', '⵰'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('ㆸ', 'ㆺ'),
+ ('Ꙡ', 'ꙡ'),
+ ('Ɥ', 'ꞎ'),
+ ('Ꞑ', 'ꞑ'),
+ ('Ꞡ', 'ꞩ'),
+ ('ꟺ', 'ꟺ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('﮲', '﯁'),
+ ('𑀀', '𑁍'),
+ ('𑁒', '𑁯'),
+ ('𖠀', '𖨸'),
+ ('𛀀', '𛀁'),
+ ('🂠', '🂮'),
+ ('🂱', '🂾'),
+ ('🃁', '🃏'),
+ ('🃑', '🃟'),
+ ('🄰', '🄰'),
+ ('🄲', '🄼'),
+ ('🄾', '🄾'),
+ ('🅀', '🅁'),
+ ('🅃', '🅅'),
+ ('🅇', '🅉'),
+ ('🅏', '🅖'),
+ ('🅘', '🅞'),
+ ('🅠', '🅩'),
+ ('🅰', '🅸'),
+ ('🅺', '🅺'),
+ ('🅽', '🅾'),
+ ('🆀', '🆉'),
+ ('🆎', '🆏'),
+ ('🆑', '🆚'),
+ ('🇦', '🇿'),
+ ('🈁', '🈂'),
+ ('🈲', '🈺'),
+ ('🉐', '🉑'),
+ ('🌀', '🌠'),
+ ('🌰', '🌵'),
+ ('🌷', '🍼'),
+ ('🎀', '🎓'),
+ ('🎠', '🏄'),
+ ('🏆', '🏊'),
+ ('🏠', '🏰'),
+ ('🐀', '🐾'),
+ ('👀', '👀'),
+ ('👂', '📷'),
+ ('📹', '📼'),
+ ('🔀', '🔽'),
+ ('🕐', '🕧'),
+ ('🗻', '🗿'),
+ ('😁', '😐'),
+ ('😒', '😔'),
+ ('😖', '😖'),
+ ('😘', '😘'),
+ ('😚', '😚'),
+ ('😜', '😞'),
+ ('😠', '😥'),
+ ('😨', '😫'),
+ ('😭', '😭'),
+ ('😰', '😳'),
+ ('😵', '🙀'),
+ ('🙅', '🙏'),
+ ('🚀', '🛅'),
+ ('🜀', '🝳'),
+ ('𫝀', '𫠝'),
+];
+
+pub const V6_1: &'static [(char, char)] = &[
+ ('֏', '֏'),
+ ('\u{604}', '\u{604}'),
+ ('ࢠ', 'ࢠ'),
+ ('ࢢ', 'ࢬ'),
+ ('\u{8e4}', '\u{8fe}'),
+ ('૰', '૰'),
+ ('ໞ', 'ໟ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ჽ', 'ჿ'),
+ ('\u{1bab}', '\u{1bad}'),
+ ('ᮺ', 'ᮿ'),
+ ('᳀', '᳇'),
+ ('ᳳ', 'ᳶ'),
+ ('⟋', '⟋'),
+ ('⟍', '⟍'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⵦ', 'ⵧ'),
+ ('⸲', '⸻'),
+ ('鿌', '鿌'),
+ ('\u{a674}', '\u{a67b}'),
+ ('\u{a69f}', '\u{a69f}'),
+ ('Ꞓ', 'ꞓ'),
+ ('Ɦ', 'Ɦ'),
+ ('ꟸ', 'ꟹ'),
+ ('ꫠ', '\u{aaf6}'),
+ ('郞', '隷'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𑃐', '𑃨'),
+ ('𑃰', '𑃹'),
+ ('\u{11100}', '\u{11134}'),
+ ('𑄶', '𑅃'),
+ ('\u{11180}', '𑇈'),
+ ('𑇐', '𑇙'),
+ ('𑚀', '\u{116b7}'),
+ ('𑛀', '𑛉'),
+ ('𖼀', '𖽄'),
+ ('𖽐', '𖽾'),
+ ('\u{16f8f}', '𖾟'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𞻰', '𞻱'),
+ ('🅪', '🅫'),
+ ('🕀', '🕃'),
+ ('😀', '😀'),
+ ('😑', '😑'),
+ ('😕', '😕'),
+ ('😗', '😗'),
+ ('😙', '😙'),
+ ('😛', '😛'),
+ ('😟', '😟'),
+ ('😦', '😧'),
+ ('😬', '😬'),
+ ('😮', '😯'),
+ ('😴', '😴'),
+];
+
+pub const V6_2: &'static [(char, char)] = &[('₺', '₺')];
+
+pub const V6_3: &'static [(char, char)] =
+ &[('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}')];
+
+pub const V7_0: &'static [(char, char)] = &[
+ ('Ϳ', 'Ϳ'),
+ ('Ԩ', 'ԯ'),
+ ('֍', '֎'),
+ ('\u{605}', '\u{605}'),
+ ('ࢡ', 'ࢡ'),
+ ('ࢭ', 'ࢲ'),
+ ('\u{8ff}', '\u{8ff}'),
+ ('ॸ', 'ॸ'),
+ ('ঀ', 'ঀ'),
+ ('\u{c00}', '\u{c00}'),
+ ('ఴ', 'ఴ'),
+ ('\u{c81}', '\u{c81}'),
+ ('\u{d01}', '\u{d01}'),
+ ('෦', '෯'),
+ ('ᛱ', 'ᛸ'),
+ ('ᤝ', 'ᤞ'),
+ ('\u{1ab0}', '\u{1abe}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{1de7}', '\u{1df5}'),
+ ('₻', '₽'),
+ ('⏴', '⏺'),
+ ('✀', '✀'),
+ ('⭍', '⭏'),
+ ('⭚', '⭳'),
+ ('⭶', '⮕'),
+ ('⮘', '⮹'),
+ ('⮽', '⯈'),
+ ('⯊', '⯑'),
+ ('⸼', '⹂'),
+ ('Ꚙ', 'ꚝ'),
+ ('ꞔ', 'ꞟ'),
+ ('Ɜ', 'Ɬ'),
+ ('Ʞ', 'Ʇ'),
+ ('ꟷ', 'ꟷ'),
+ ('ꧠ', 'ꧾ'),
+ ('\u{aa7c}', 'ꩿ'),
+ ('ꬰ', 'ꭟ'),
+ ('ꭤ', 'ꭥ'),
+ ('\u{fe27}', '\u{fe2d}'),
+ ('𐆋', '𐆌'),
+ ('𐆠', '𐆠'),
+ ('\u{102e0}', '𐋻'),
+ ('𐌟', '𐌟'),
+ ('𐍐', '\u{1037a}'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕯', '𐕯'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐡠', '𐢞'),
+ ('𐢧', '𐢯'),
+ ('𐪀', '𐪟'),
+ ('𐫀', '\u{10ae6}'),
+ ('𐫫', '𐫶'),
+ ('𐮀', '𐮑'),
+ ('𐮙', '𐮜'),
+ ('𐮩', '𐮯'),
+ ('\u{1107f}', '\u{1107f}'),
+ ('𑅐', '𑅶'),
+ ('𑇍', '𑇍'),
+ ('𑇚', '𑇚'),
+ ('𑇡', '𑇴'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈽'),
+ ('𑊰', '\u{112ea}'),
+ ('𑋰', '𑋹'),
+ ('\u{11301}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133c}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑒀', '𑓇'),
+ ('𑓐', '𑓙'),
+ ('𑖀', '\u{115b5}'),
+ ('𑖸', '𑗉'),
+ ('𑘀', '𑙄'),
+ ('𑙐', '𑙙'),
+ ('𑢠', '𑣲'),
+ ('𑣿', '𑣿'),
+ ('𑫀', '𑫸'),
+ ('𒍯', '𒎘'),
+ ('𒑣', '𒑮'),
+ ('𒑴', '𒑴'),
+ ('𖩀', '𖩞'),
+ ('𖩠', '𖩩'),
+ ('𖩮', '𖩯'),
+ ('𖫐', '𖫭'),
+ ('\u{16af0}', '𖫵'),
+ ('𖬀', '𖭅'),
+ ('𖭐', '𖭙'),
+ ('𖭛', '𖭡'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𛲜', '\u{1bca3}'),
+ ('𞠀', '𞣄'),
+ ('𞣇', '\u{1e8d6}'),
+ ('🂿', '🂿'),
+ ('🃠', '🃵'),
+ ('🄋', '🄌'),
+ ('🌡', '🌬'),
+ ('🌶', '🌶'),
+ ('🍽', '🍽'),
+ ('🎔', '🎟'),
+ ('🏅', '🏅'),
+ ('🏋', '🏎'),
+ ('🏔', '🏟'),
+ ('🏱', '🏷'),
+ ('🐿', '🐿'),
+ ('👁', '👁'),
+ ('📸', '📸'),
+ ('📽', '📾'),
+ ('🔾', '🔿'),
+ ('🕄', '🕊'),
+ ('🕨', '🕹'),
+ ('🕻', '🖣'),
+ ('🖥', '🗺'),
+ ('🙁', '🙂'),
+ ('🙐', '🙿'),
+ ('🛆', '🛏'),
+ ('🛠', '🛬'),
+ ('🛰', '🛳'),
+ ('🞀', '🟔'),
+ ('🠀', '🠋'),
+ ('🠐', '🡇'),
+ ('🡐', '🡙'),
+ ('🡠', '🢇'),
+ ('🢐', '🢭'),
+];
+
+pub const V8_0: &'static [(char, char)] = &[
+ ('ࢳ', 'ࢴ'),
+ ('\u{8e3}', '\u{8e3}'),
+ ('ૹ', 'ૹ'),
+ ('ౚ', 'ౚ'),
+ ('ൟ', 'ൟ'),
+ ('Ᏽ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('₾', '₾'),
+ ('↊', '↋'),
+ ('⯬', '⯯'),
+ ('鿍', '鿕'),
+ ('\u{a69e}', '\u{a69e}'),
+ ('ꞏ', 'ꞏ'),
+ ('Ʝ', 'ꞷ'),
+ ('꣼', 'ꣽ'),
+ ('ꭠ', 'ꭣ'),
+ ('ꭰ', 'ꮿ'),
+ ('\u{fe2e}', '\u{fe2f}'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐣻', '𐣿'),
+ ('𐦼', '𐦽'),
+ ('𐧀', '𐧏'),
+ ('𐧒', '𐧿'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐳺', '𐳿'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇛', '𑇟'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊩'),
+ ('\u{11300}', '\u{11300}'),
+ ('𑍐', '𑍐'),
+ ('𑗊', '\u{115dd}'),
+ ('𑜀', '𑜙'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑜰', '𑜿'),
+ ('𒎙', '𒎙'),
+ ('𒒀', '𒕃'),
+ ('𔐀', '𔙆'),
+ ('𝇞', '𝇨'),
+ ('𝠀', '𝪋'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('🌭', '🌯'),
+ ('🍾', '🍿'),
+ ('🏏', '🏓'),
+ ('🏸', '🏿'),
+ ('📿', '📿'),
+ ('🕋', '🕏'),
+ ('🙃', '🙄'),
+ ('🛐', '🛐'),
+ ('🤐', '🤘'),
+ ('🦀', '🦄'),
+ ('🧀', '🧀'),
+ ('𫠠', '𬺡'),
+];
+
+pub const V9_0: &'static [(char, char)] = &[
+ ('ࢶ', 'ࢽ'),
+ ('\u{8d4}', '\u{8e2}'),
+ ('ಀ', 'ಀ'),
+ ('൏', '൏'),
+ ('ൔ', 'ൖ'),
+ ('൘', '൞'),
+ ('൶', '൸'),
+ ('ᲀ', 'ᲈ'),
+ ('\u{1dfb}', '\u{1dfb}'),
+ ('⏻', '⏾'),
+ ('⹃', '⹄'),
+ ('Ɪ', 'Ɪ'),
+ ('\u{a8c5}', '\u{a8c5}'),
+ ('𐆍', '𐆎'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('𑐀', '𑑙'),
+ ('𑑛', '𑑛'),
+ ('𑑝', '𑑝'),
+ ('𑙠', '𑙬'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '\u{11c36}'),
+ ('\u{11c38}', '𑱅'),
+ ('𑱐', '𑱬'),
+ ('𑱰', '𑲏'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('𖿠', '𖿠'),
+ ('𗀀', '𘟬'),
+ ('𘠀', '𘫲'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞤀', '\u{1e94a}'),
+ ('𞥐', '𞥙'),
+ ('𞥞', '𞥟'),
+ ('🆛', '🆬'),
+ ('🈻', '🈻'),
+ ('🕺', '🕺'),
+ ('🖤', '🖤'),
+ ('🛑', '🛒'),
+ ('🛴', '🛶'),
+ ('🤙', '🤞'),
+ ('🤠', '🤧'),
+ ('🤰', '🤰'),
+ ('🤳', '🤾'),
+ ('🥀', '🥋'),
+ ('🥐', '🥞'),
+ ('🦅', '🦑'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/case_folding_simple.rs b/third_party/rust/regex-syntax/src/unicode_tables/case_folding_simple.rs
new file mode 100644
index 0000000000..23f9364ce9
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/case_folding_simple.rs
@@ -0,0 +1,2888 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate case-folding-simple ucd-15.0.0 --chars --all-pairs
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[
+ ('A', &['a']),
+ ('B', &['b']),
+ ('C', &['c']),
+ ('D', &['d']),
+ ('E', &['e']),
+ ('F', &['f']),
+ ('G', &['g']),
+ ('H', &['h']),
+ ('I', &['i']),
+ ('J', &['j']),
+ ('K', &['k', 'K']),
+ ('L', &['l']),
+ ('M', &['m']),
+ ('N', &['n']),
+ ('O', &['o']),
+ ('P', &['p']),
+ ('Q', &['q']),
+ ('R', &['r']),
+ ('S', &['s', 'ſ']),
+ ('T', &['t']),
+ ('U', &['u']),
+ ('V', &['v']),
+ ('W', &['w']),
+ ('X', &['x']),
+ ('Y', &['y']),
+ ('Z', &['z']),
+ ('a', &['A']),
+ ('b', &['B']),
+ ('c', &['C']),
+ ('d', &['D']),
+ ('e', &['E']),
+ ('f', &['F']),
+ ('g', &['G']),
+ ('h', &['H']),
+ ('i', &['I']),
+ ('j', &['J']),
+ ('k', &['K', 'K']),
+ ('l', &['L']),
+ ('m', &['M']),
+ ('n', &['N']),
+ ('o', &['O']),
+ ('p', &['P']),
+ ('q', &['Q']),
+ ('r', &['R']),
+ ('s', &['S', 'ſ']),
+ ('t', &['T']),
+ ('u', &['U']),
+ ('v', &['V']),
+ ('w', &['W']),
+ ('x', &['X']),
+ ('y', &['Y']),
+ ('z', &['Z']),
+ ('µ', &['Μ', 'μ']),
+ ('À', &['à']),
+ ('Á', &['á']),
+ ('Â', &['â']),
+ ('Ã', &['ã']),
+ ('Ä', &['ä']),
+ ('Å', &['å', 'Å']),
+ ('Æ', &['æ']),
+ ('Ç', &['ç']),
+ ('È', &['è']),
+ ('É', &['é']),
+ ('Ê', &['ê']),
+ ('Ë', &['ë']),
+ ('Ì', &['ì']),
+ ('Í', &['í']),
+ ('Î', &['î']),
+ ('Ï', &['ï']),
+ ('Ð', &['ð']),
+ ('Ñ', &['ñ']),
+ ('Ò', &['ò']),
+ ('Ó', &['ó']),
+ ('Ô', &['ô']),
+ ('Õ', &['õ']),
+ ('Ö', &['ö']),
+ ('Ø', &['ø']),
+ ('Ù', &['ù']),
+ ('Ú', &['ú']),
+ ('Û', &['û']),
+ ('Ü', &['ü']),
+ ('Ý', &['ý']),
+ ('Þ', &['þ']),
+ ('ß', &['ẞ']),
+ ('à', &['À']),
+ ('á', &['Á']),
+ ('â', &['Â']),
+ ('ã', &['Ã']),
+ ('ä', &['Ä']),
+ ('å', &['Å', 'Å']),
+ ('æ', &['Æ']),
+ ('ç', &['Ç']),
+ ('è', &['È']),
+ ('é', &['É']),
+ ('ê', &['Ê']),
+ ('ë', &['Ë']),
+ ('ì', &['Ì']),
+ ('í', &['Í']),
+ ('î', &['Î']),
+ ('ï', &['Ï']),
+ ('ð', &['Ð']),
+ ('ñ', &['Ñ']),
+ ('ò', &['Ò']),
+ ('ó', &['Ó']),
+ ('ô', &['Ô']),
+ ('õ', &['Õ']),
+ ('ö', &['Ö']),
+ ('ø', &['Ø']),
+ ('ù', &['Ù']),
+ ('ú', &['Ú']),
+ ('û', &['Û']),
+ ('ü', &['Ü']),
+ ('ý', &['Ý']),
+ ('þ', &['Þ']),
+ ('ÿ', &['Ÿ']),
+ ('Ā', &['ā']),
+ ('ā', &['Ā']),
+ ('Ă', &['ă']),
+ ('ă', &['Ă']),
+ ('Ą', &['ą']),
+ ('ą', &['Ą']),
+ ('Ć', &['ć']),
+ ('ć', &['Ć']),
+ ('Ĉ', &['ĉ']),
+ ('ĉ', &['Ĉ']),
+ ('Ċ', &['ċ']),
+ ('ċ', &['Ċ']),
+ ('Č', &['č']),
+ ('č', &['Č']),
+ ('Ď', &['ď']),
+ ('ď', &['Ď']),
+ ('Đ', &['đ']),
+ ('đ', &['Đ']),
+ ('Ē', &['ē']),
+ ('ē', &['Ē']),
+ ('Ĕ', &['ĕ']),
+ ('ĕ', &['Ĕ']),
+ ('Ė', &['ė']),
+ ('ė', &['Ė']),
+ ('Ę', &['ę']),
+ ('ę', &['Ę']),
+ ('Ě', &['ě']),
+ ('ě', &['Ě']),
+ ('Ĝ', &['ĝ']),
+ ('ĝ', &['Ĝ']),
+ ('Ğ', &['ğ']),
+ ('ğ', &['Ğ']),
+ ('Ġ', &['ġ']),
+ ('ġ', &['Ġ']),
+ ('Ģ', &['ģ']),
+ ('ģ', &['Ģ']),
+ ('Ĥ', &['ĥ']),
+ ('ĥ', &['Ĥ']),
+ ('Ħ', &['ħ']),
+ ('ħ', &['Ħ']),
+ ('Ĩ', &['ĩ']),
+ ('ĩ', &['Ĩ']),
+ ('Ī', &['ī']),
+ ('ī', &['Ī']),
+ ('Ĭ', &['ĭ']),
+ ('ĭ', &['Ĭ']),
+ ('Į', &['į']),
+ ('į', &['Į']),
+ ('IJ', &['ij']),
+ ('ij', &['IJ']),
+ ('Ĵ', &['ĵ']),
+ ('ĵ', &['Ĵ']),
+ ('Ķ', &['ķ']),
+ ('ķ', &['Ķ']),
+ ('Ĺ', &['ĺ']),
+ ('ĺ', &['Ĺ']),
+ ('Ļ', &['ļ']),
+ ('ļ', &['Ļ']),
+ ('Ľ', &['ľ']),
+ ('ľ', &['Ľ']),
+ ('Ŀ', &['ŀ']),
+ ('ŀ', &['Ŀ']),
+ ('Ł', &['ł']),
+ ('ł', &['Ł']),
+ ('Ń', &['ń']),
+ ('ń', &['Ń']),
+ ('Ņ', &['ņ']),
+ ('ņ', &['Ņ']),
+ ('Ň', &['ň']),
+ ('ň', &['Ň']),
+ ('Ŋ', &['ŋ']),
+ ('ŋ', &['Ŋ']),
+ ('Ō', &['ō']),
+ ('ō', &['Ō']),
+ ('Ŏ', &['ŏ']),
+ ('ŏ', &['Ŏ']),
+ ('Ő', &['ő']),
+ ('ő', &['Ő']),
+ ('Œ', &['œ']),
+ ('œ', &['Œ']),
+ ('Ŕ', &['ŕ']),
+ ('ŕ', &['Ŕ']),
+ ('Ŗ', &['ŗ']),
+ ('ŗ', &['Ŗ']),
+ ('Ř', &['ř']),
+ ('ř', &['Ř']),
+ ('Ś', &['ś']),
+ ('ś', &['Ś']),
+ ('Ŝ', &['ŝ']),
+ ('ŝ', &['Ŝ']),
+ ('Ş', &['ş']),
+ ('ş', &['Ş']),
+ ('Š', &['š']),
+ ('š', &['Š']),
+ ('Ţ', &['ţ']),
+ ('ţ', &['Ţ']),
+ ('Ť', &['ť']),
+ ('ť', &['Ť']),
+ ('Ŧ', &['ŧ']),
+ ('ŧ', &['Ŧ']),
+ ('Ũ', &['ũ']),
+ ('ũ', &['Ũ']),
+ ('Ū', &['ū']),
+ ('ū', &['Ū']),
+ ('Ŭ', &['ŭ']),
+ ('ŭ', &['Ŭ']),
+ ('Ů', &['ů']),
+ ('ů', &['Ů']),
+ ('Ű', &['ű']),
+ ('ű', &['Ű']),
+ ('Ų', &['ų']),
+ ('ų', &['Ų']),
+ ('Ŵ', &['ŵ']),
+ ('ŵ', &['Ŵ']),
+ ('Ŷ', &['ŷ']),
+ ('ŷ', &['Ŷ']),
+ ('Ÿ', &['ÿ']),
+ ('Ź', &['ź']),
+ ('ź', &['Ź']),
+ ('Ż', &['ż']),
+ ('ż', &['Ż']),
+ ('Ž', &['ž']),
+ ('ž', &['Ž']),
+ ('ſ', &['S', 's']),
+ ('ƀ', &['Ƀ']),
+ ('Ɓ', &['ɓ']),
+ ('Ƃ', &['ƃ']),
+ ('ƃ', &['Ƃ']),
+ ('Ƅ', &['ƅ']),
+ ('ƅ', &['Ƅ']),
+ ('Ɔ', &['ɔ']),
+ ('Ƈ', &['ƈ']),
+ ('ƈ', &['Ƈ']),
+ ('Ɖ', &['ɖ']),
+ ('Ɗ', &['ɗ']),
+ ('Ƌ', &['ƌ']),
+ ('ƌ', &['Ƌ']),
+ ('Ǝ', &['ǝ']),
+ ('Ə', &['ə']),
+ ('Ɛ', &['ɛ']),
+ ('Ƒ', &['ƒ']),
+ ('ƒ', &['Ƒ']),
+ ('Ɠ', &['ɠ']),
+ ('Ɣ', &['ɣ']),
+ ('ƕ', &['Ƕ']),
+ ('Ɩ', &['ɩ']),
+ ('Ɨ', &['ɨ']),
+ ('Ƙ', &['ƙ']),
+ ('ƙ', &['Ƙ']),
+ ('ƚ', &['Ƚ']),
+ ('Ɯ', &['ɯ']),
+ ('Ɲ', &['ɲ']),
+ ('ƞ', &['Ƞ']),
+ ('Ɵ', &['ɵ']),
+ ('Ơ', &['ơ']),
+ ('ơ', &['Ơ']),
+ ('Ƣ', &['ƣ']),
+ ('ƣ', &['Ƣ']),
+ ('Ƥ', &['ƥ']),
+ ('ƥ', &['Ƥ']),
+ ('Ʀ', &['ʀ']),
+ ('Ƨ', &['ƨ']),
+ ('ƨ', &['Ƨ']),
+ ('Ʃ', &['ʃ']),
+ ('Ƭ', &['ƭ']),
+ ('ƭ', &['Ƭ']),
+ ('Ʈ', &['ʈ']),
+ ('Ư', &['ư']),
+ ('ư', &['Ư']),
+ ('Ʊ', &['ʊ']),
+ ('Ʋ', &['ʋ']),
+ ('Ƴ', &['ƴ']),
+ ('ƴ', &['Ƴ']),
+ ('Ƶ', &['ƶ']),
+ ('ƶ', &['Ƶ']),
+ ('Ʒ', &['ʒ']),
+ ('Ƹ', &['ƹ']),
+ ('ƹ', &['Ƹ']),
+ ('Ƽ', &['ƽ']),
+ ('ƽ', &['Ƽ']),
+ ('ƿ', &['Ƿ']),
+ ('DŽ', &['Dž', 'dž']),
+ ('Dž', &['DŽ', 'dž']),
+ ('dž', &['DŽ', 'Dž']),
+ ('LJ', &['Lj', 'lj']),
+ ('Lj', &['LJ', 'lj']),
+ ('lj', &['LJ', 'Lj']),
+ ('NJ', &['Nj', 'nj']),
+ ('Nj', &['NJ', 'nj']),
+ ('nj', &['NJ', 'Nj']),
+ ('Ǎ', &['ǎ']),
+ ('ǎ', &['Ǎ']),
+ ('Ǐ', &['ǐ']),
+ ('ǐ', &['Ǐ']),
+ ('Ǒ', &['ǒ']),
+ ('ǒ', &['Ǒ']),
+ ('Ǔ', &['ǔ']),
+ ('ǔ', &['Ǔ']),
+ ('Ǖ', &['ǖ']),
+ ('ǖ', &['Ǖ']),
+ ('Ǘ', &['ǘ']),
+ ('ǘ', &['Ǘ']),
+ ('Ǚ', &['ǚ']),
+ ('ǚ', &['Ǚ']),
+ ('Ǜ', &['ǜ']),
+ ('ǜ', &['Ǜ']),
+ ('ǝ', &['Ǝ']),
+ ('Ǟ', &['ǟ']),
+ ('ǟ', &['Ǟ']),
+ ('Ǡ', &['ǡ']),
+ ('ǡ', &['Ǡ']),
+ ('Ǣ', &['ǣ']),
+ ('ǣ', &['Ǣ']),
+ ('Ǥ', &['ǥ']),
+ ('ǥ', &['Ǥ']),
+ ('Ǧ', &['ǧ']),
+ ('ǧ', &['Ǧ']),
+ ('Ǩ', &['ǩ']),
+ ('ǩ', &['Ǩ']),
+ ('Ǫ', &['ǫ']),
+ ('ǫ', &['Ǫ']),
+ ('Ǭ', &['ǭ']),
+ ('ǭ', &['Ǭ']),
+ ('Ǯ', &['ǯ']),
+ ('ǯ', &['Ǯ']),
+ ('DZ', &['Dz', 'dz']),
+ ('Dz', &['DZ', 'dz']),
+ ('dz', &['DZ', 'Dz']),
+ ('Ǵ', &['ǵ']),
+ ('ǵ', &['Ǵ']),
+ ('Ƕ', &['ƕ']),
+ ('Ƿ', &['ƿ']),
+ ('Ǹ', &['ǹ']),
+ ('ǹ', &['Ǹ']),
+ ('Ǻ', &['ǻ']),
+ ('ǻ', &['Ǻ']),
+ ('Ǽ', &['ǽ']),
+ ('ǽ', &['Ǽ']),
+ ('Ǿ', &['ǿ']),
+ ('ǿ', &['Ǿ']),
+ ('Ȁ', &['ȁ']),
+ ('ȁ', &['Ȁ']),
+ ('Ȃ', &['ȃ']),
+ ('ȃ', &['Ȃ']),
+ ('Ȅ', &['ȅ']),
+ ('ȅ', &['Ȅ']),
+ ('Ȇ', &['ȇ']),
+ ('ȇ', &['Ȇ']),
+ ('Ȉ', &['ȉ']),
+ ('ȉ', &['Ȉ']),
+ ('Ȋ', &['ȋ']),
+ ('ȋ', &['Ȋ']),
+ ('Ȍ', &['ȍ']),
+ ('ȍ', &['Ȍ']),
+ ('Ȏ', &['ȏ']),
+ ('ȏ', &['Ȏ']),
+ ('Ȑ', &['ȑ']),
+ ('ȑ', &['Ȑ']),
+ ('Ȓ', &['ȓ']),
+ ('ȓ', &['Ȓ']),
+ ('Ȕ', &['ȕ']),
+ ('ȕ', &['Ȕ']),
+ ('Ȗ', &['ȗ']),
+ ('ȗ', &['Ȗ']),
+ ('Ș', &['ș']),
+ ('ș', &['Ș']),
+ ('Ț', &['ț']),
+ ('ț', &['Ț']),
+ ('Ȝ', &['ȝ']),
+ ('ȝ', &['Ȝ']),
+ ('Ȟ', &['ȟ']),
+ ('ȟ', &['Ȟ']),
+ ('Ƞ', &['ƞ']),
+ ('Ȣ', &['ȣ']),
+ ('ȣ', &['Ȣ']),
+ ('Ȥ', &['ȥ']),
+ ('ȥ', &['Ȥ']),
+ ('Ȧ', &['ȧ']),
+ ('ȧ', &['Ȧ']),
+ ('Ȩ', &['ȩ']),
+ ('ȩ', &['Ȩ']),
+ ('Ȫ', &['ȫ']),
+ ('ȫ', &['Ȫ']),
+ ('Ȭ', &['ȭ']),
+ ('ȭ', &['Ȭ']),
+ ('Ȯ', &['ȯ']),
+ ('ȯ', &['Ȯ']),
+ ('Ȱ', &['ȱ']),
+ ('ȱ', &['Ȱ']),
+ ('Ȳ', &['ȳ']),
+ ('ȳ', &['Ȳ']),
+ ('Ⱥ', &['ⱥ']),
+ ('Ȼ', &['ȼ']),
+ ('ȼ', &['Ȼ']),
+ ('Ƚ', &['ƚ']),
+ ('Ⱦ', &['ⱦ']),
+ ('ȿ', &['Ȿ']),
+ ('ɀ', &['Ɀ']),
+ ('Ɂ', &['ɂ']),
+ ('ɂ', &['Ɂ']),
+ ('Ƀ', &['ƀ']),
+ ('Ʉ', &['ʉ']),
+ ('Ʌ', &['ʌ']),
+ ('Ɇ', &['ɇ']),
+ ('ɇ', &['Ɇ']),
+ ('Ɉ', &['ɉ']),
+ ('ɉ', &['Ɉ']),
+ ('Ɋ', &['ɋ']),
+ ('ɋ', &['Ɋ']),
+ ('Ɍ', &['ɍ']),
+ ('ɍ', &['Ɍ']),
+ ('Ɏ', &['ɏ']),
+ ('ɏ', &['Ɏ']),
+ ('ɐ', &['Ɐ']),
+ ('ɑ', &['Ɑ']),
+ ('ɒ', &['Ɒ']),
+ ('ɓ', &['Ɓ']),
+ ('ɔ', &['Ɔ']),
+ ('ɖ', &['Ɖ']),
+ ('ɗ', &['Ɗ']),
+ ('ə', &['Ə']),
+ ('ɛ', &['Ɛ']),
+ ('ɜ', &['Ɜ']),
+ ('ɠ', &['Ɠ']),
+ ('ɡ', &['Ɡ']),
+ ('ɣ', &['Ɣ']),
+ ('ɥ', &['Ɥ']),
+ ('ɦ', &['Ɦ']),
+ ('ɨ', &['Ɨ']),
+ ('ɩ', &['Ɩ']),
+ ('ɪ', &['Ɪ']),
+ ('ɫ', &['Ɫ']),
+ ('ɬ', &['Ɬ']),
+ ('ɯ', &['Ɯ']),
+ ('ɱ', &['Ɱ']),
+ ('ɲ', &['Ɲ']),
+ ('ɵ', &['Ɵ']),
+ ('ɽ', &['Ɽ']),
+ ('ʀ', &['Ʀ']),
+ ('ʂ', &['Ʂ']),
+ ('ʃ', &['Ʃ']),
+ ('ʇ', &['Ʇ']),
+ ('ʈ', &['Ʈ']),
+ ('ʉ', &['Ʉ']),
+ ('ʊ', &['Ʊ']),
+ ('ʋ', &['Ʋ']),
+ ('ʌ', &['Ʌ']),
+ ('ʒ', &['Ʒ']),
+ ('ʝ', &['Ʝ']),
+ ('ʞ', &['Ʞ']),
+ ('\u{345}', &['Ι', 'ι', 'ι']),
+ ('Ͱ', &['ͱ']),
+ ('ͱ', &['Ͱ']),
+ ('Ͳ', &['ͳ']),
+ ('ͳ', &['Ͳ']),
+ ('Ͷ', &['ͷ']),
+ ('ͷ', &['Ͷ']),
+ ('ͻ', &['Ͻ']),
+ ('ͼ', &['Ͼ']),
+ ('ͽ', &['Ͽ']),
+ ('Ϳ', &['ϳ']),
+ ('Ά', &['ά']),
+ ('Έ', &['έ']),
+ ('Ή', &['ή']),
+ ('Ί', &['ί']),
+ ('Ό', &['ό']),
+ ('Ύ', &['ύ']),
+ ('Ώ', &['ώ']),
+ ('Α', &['α']),
+ ('Β', &['β', 'ϐ']),
+ ('Γ', &['γ']),
+ ('Δ', &['δ']),
+ ('Ε', &['ε', 'ϵ']),
+ ('Ζ', &['ζ']),
+ ('Η', &['η']),
+ ('Θ', &['θ', 'ϑ', 'ϴ']),
+ ('Ι', &['\u{345}', 'ι', 'ι']),
+ ('Κ', &['κ', 'ϰ']),
+ ('Λ', &['λ']),
+ ('Μ', &['µ', 'μ']),
+ ('Ν', &['ν']),
+ ('Ξ', &['ξ']),
+ ('Ο', &['ο']),
+ ('Π', &['π', 'ϖ']),
+ ('Ρ', &['ρ', 'ϱ']),
+ ('Σ', &['ς', 'σ']),
+ ('Τ', &['τ']),
+ ('Υ', &['υ']),
+ ('Φ', &['φ', 'ϕ']),
+ ('Χ', &['χ']),
+ ('Ψ', &['ψ']),
+ ('Ω', &['ω', 'Ω']),
+ ('Ϊ', &['ϊ']),
+ ('Ϋ', &['ϋ']),
+ ('ά', &['Ά']),
+ ('έ', &['Έ']),
+ ('ή', &['Ή']),
+ ('ί', &['Ί']),
+ ('α', &['Α']),
+ ('β', &['Β', 'ϐ']),
+ ('γ', &['Γ']),
+ ('δ', &['Δ']),
+ ('ε', &['Ε', 'ϵ']),
+ ('ζ', &['Ζ']),
+ ('η', &['Η']),
+ ('θ', &['Θ', 'ϑ', 'ϴ']),
+ ('ι', &['\u{345}', 'Ι', 'ι']),
+ ('κ', &['Κ', 'ϰ']),
+ ('λ', &['Λ']),
+ ('μ', &['µ', 'Μ']),
+ ('ν', &['Ν']),
+ ('ξ', &['Ξ']),
+ ('ο', &['Ο']),
+ ('π', &['Π', 'ϖ']),
+ ('ρ', &['Ρ', 'ϱ']),
+ ('ς', &['Σ', 'σ']),
+ ('σ', &['Σ', 'ς']),
+ ('τ', &['Τ']),
+ ('υ', &['Υ']),
+ ('φ', &['Φ', 'ϕ']),
+ ('χ', &['Χ']),
+ ('ψ', &['Ψ']),
+ ('ω', &['Ω', 'Ω']),
+ ('ϊ', &['Ϊ']),
+ ('ϋ', &['Ϋ']),
+ ('ό', &['Ό']),
+ ('ύ', &['Ύ']),
+ ('ώ', &['Ώ']),
+ ('Ϗ', &['ϗ']),
+ ('ϐ', &['Β', 'β']),
+ ('ϑ', &['Θ', 'θ', 'ϴ']),
+ ('ϕ', &['Φ', 'φ']),
+ ('ϖ', &['Π', 'π']),
+ ('ϗ', &['Ϗ']),
+ ('Ϙ', &['ϙ']),
+ ('ϙ', &['Ϙ']),
+ ('Ϛ', &['ϛ']),
+ ('ϛ', &['Ϛ']),
+ ('Ϝ', &['ϝ']),
+ ('ϝ', &['Ϝ']),
+ ('Ϟ', &['ϟ']),
+ ('ϟ', &['Ϟ']),
+ ('Ϡ', &['ϡ']),
+ ('ϡ', &['Ϡ']),
+ ('Ϣ', &['ϣ']),
+ ('ϣ', &['Ϣ']),
+ ('Ϥ', &['ϥ']),
+ ('ϥ', &['Ϥ']),
+ ('Ϧ', &['ϧ']),
+ ('ϧ', &['Ϧ']),
+ ('Ϩ', &['ϩ']),
+ ('ϩ', &['Ϩ']),
+ ('Ϫ', &['ϫ']),
+ ('ϫ', &['Ϫ']),
+ ('Ϭ', &['ϭ']),
+ ('ϭ', &['Ϭ']),
+ ('Ϯ', &['ϯ']),
+ ('ϯ', &['Ϯ']),
+ ('ϰ', &['Κ', 'κ']),
+ ('ϱ', &['Ρ', 'ρ']),
+ ('ϲ', &['Ϲ']),
+ ('ϳ', &['Ϳ']),
+ ('ϴ', &['Θ', 'θ', 'ϑ']),
+ ('ϵ', &['Ε', 'ε']),
+ ('Ϸ', &['ϸ']),
+ ('ϸ', &['Ϸ']),
+ ('Ϲ', &['ϲ']),
+ ('Ϻ', &['ϻ']),
+ ('ϻ', &['Ϻ']),
+ ('Ͻ', &['ͻ']),
+ ('Ͼ', &['ͼ']),
+ ('Ͽ', &['ͽ']),
+ ('Ѐ', &['ѐ']),
+ ('Ё', &['ё']),
+ ('Ђ', &['ђ']),
+ ('Ѓ', &['ѓ']),
+ ('Є', &['є']),
+ ('Ѕ', &['ѕ']),
+ ('І', &['і']),
+ ('Ї', &['ї']),
+ ('Ј', &['ј']),
+ ('Љ', &['љ']),
+ ('Њ', &['њ']),
+ ('Ћ', &['ћ']),
+ ('Ќ', &['ќ']),
+ ('Ѝ', &['ѝ']),
+ ('Ў', &['ў']),
+ ('Џ', &['џ']),
+ ('А', &['а']),
+ ('Б', &['б']),
+ ('В', &['в', 'ᲀ']),
+ ('Г', &['г']),
+ ('Д', &['д', 'ᲁ']),
+ ('Е', &['е']),
+ ('Ж', &['ж']),
+ ('З', &['з']),
+ ('И', &['и']),
+ ('Й', &['й']),
+ ('К', &['к']),
+ ('Л', &['л']),
+ ('М', &['м']),
+ ('Н', &['н']),
+ ('О', &['о', 'ᲂ']),
+ ('П', &['п']),
+ ('Р', &['р']),
+ ('С', &['с', 'ᲃ']),
+ ('Т', &['т', 'ᲄ', 'ᲅ']),
+ ('У', &['у']),
+ ('Ф', &['ф']),
+ ('Х', &['х']),
+ ('Ц', &['ц']),
+ ('Ч', &['ч']),
+ ('Ш', &['ш']),
+ ('Щ', &['щ']),
+ ('Ъ', &['ъ', 'ᲆ']),
+ ('Ы', &['ы']),
+ ('Ь', &['ь']),
+ ('Э', &['э']),
+ ('Ю', &['ю']),
+ ('Я', &['я']),
+ ('а', &['А']),
+ ('б', &['Б']),
+ ('в', &['В', 'ᲀ']),
+ ('г', &['Г']),
+ ('д', &['Д', 'ᲁ']),
+ ('е', &['Е']),
+ ('ж', &['Ж']),
+ ('з', &['З']),
+ ('и', &['И']),
+ ('й', &['Й']),
+ ('к', &['К']),
+ ('л', &['Л']),
+ ('м', &['М']),
+ ('н', &['Н']),
+ ('о', &['О', 'ᲂ']),
+ ('п', &['П']),
+ ('р', &['Р']),
+ ('с', &['С', 'ᲃ']),
+ ('т', &['Т', 'ᲄ', 'ᲅ']),
+ ('у', &['У']),
+ ('ф', &['Ф']),
+ ('х', &['Х']),
+ ('ц', &['Ц']),
+ ('ч', &['Ч']),
+ ('ш', &['Ш']),
+ ('щ', &['Щ']),
+ ('ъ', &['Ъ', 'ᲆ']),
+ ('ы', &['Ы']),
+ ('ь', &['Ь']),
+ ('э', &['Э']),
+ ('ю', &['Ю']),
+ ('я', &['Я']),
+ ('ѐ', &['Ѐ']),
+ ('ё', &['Ё']),
+ ('ђ', &['Ђ']),
+ ('ѓ', &['Ѓ']),
+ ('є', &['Є']),
+ ('ѕ', &['Ѕ']),
+ ('і', &['І']),
+ ('ї', &['Ї']),
+ ('ј', &['Ј']),
+ ('љ', &['Љ']),
+ ('њ', &['Њ']),
+ ('ћ', &['Ћ']),
+ ('ќ', &['Ќ']),
+ ('ѝ', &['Ѝ']),
+ ('ў', &['Ў']),
+ ('џ', &['Џ']),
+ ('Ѡ', &['ѡ']),
+ ('ѡ', &['Ѡ']),
+ ('Ѣ', &['ѣ', 'ᲇ']),
+ ('ѣ', &['Ѣ', 'ᲇ']),
+ ('Ѥ', &['ѥ']),
+ ('ѥ', &['Ѥ']),
+ ('Ѧ', &['ѧ']),
+ ('ѧ', &['Ѧ']),
+ ('Ѩ', &['ѩ']),
+ ('ѩ', &['Ѩ']),
+ ('Ѫ', &['ѫ']),
+ ('ѫ', &['Ѫ']),
+ ('Ѭ', &['ѭ']),
+ ('ѭ', &['Ѭ']),
+ ('Ѯ', &['ѯ']),
+ ('ѯ', &['Ѯ']),
+ ('Ѱ', &['ѱ']),
+ ('ѱ', &['Ѱ']),
+ ('Ѳ', &['ѳ']),
+ ('ѳ', &['Ѳ']),
+ ('Ѵ', &['ѵ']),
+ ('ѵ', &['Ѵ']),
+ ('Ѷ', &['ѷ']),
+ ('ѷ', &['Ѷ']),
+ ('Ѹ', &['ѹ']),
+ ('ѹ', &['Ѹ']),
+ ('Ѻ', &['ѻ']),
+ ('ѻ', &['Ѻ']),
+ ('Ѽ', &['ѽ']),
+ ('ѽ', &['Ѽ']),
+ ('Ѿ', &['ѿ']),
+ ('ѿ', &['Ѿ']),
+ ('Ҁ', &['ҁ']),
+ ('ҁ', &['Ҁ']),
+ ('Ҋ', &['ҋ']),
+ ('ҋ', &['Ҋ']),
+ ('Ҍ', &['ҍ']),
+ ('ҍ', &['Ҍ']),
+ ('Ҏ', &['ҏ']),
+ ('ҏ', &['Ҏ']),
+ ('Ґ', &['ґ']),
+ ('ґ', &['Ґ']),
+ ('Ғ', &['ғ']),
+ ('ғ', &['Ғ']),
+ ('Ҕ', &['ҕ']),
+ ('ҕ', &['Ҕ']),
+ ('Җ', &['җ']),
+ ('җ', &['Җ']),
+ ('Ҙ', &['ҙ']),
+ ('ҙ', &['Ҙ']),
+ ('Қ', &['қ']),
+ ('қ', &['Қ']),
+ ('Ҝ', &['ҝ']),
+ ('ҝ', &['Ҝ']),
+ ('Ҟ', &['ҟ']),
+ ('ҟ', &['Ҟ']),
+ ('Ҡ', &['ҡ']),
+ ('ҡ', &['Ҡ']),
+ ('Ң', &['ң']),
+ ('ң', &['Ң']),
+ ('Ҥ', &['ҥ']),
+ ('ҥ', &['Ҥ']),
+ ('Ҧ', &['ҧ']),
+ ('ҧ', &['Ҧ']),
+ ('Ҩ', &['ҩ']),
+ ('ҩ', &['Ҩ']),
+ ('Ҫ', &['ҫ']),
+ ('ҫ', &['Ҫ']),
+ ('Ҭ', &['ҭ']),
+ ('ҭ', &['Ҭ']),
+ ('Ү', &['ү']),
+ ('ү', &['Ү']),
+ ('Ұ', &['ұ']),
+ ('ұ', &['Ұ']),
+ ('Ҳ', &['ҳ']),
+ ('ҳ', &['Ҳ']),
+ ('Ҵ', &['ҵ']),
+ ('ҵ', &['Ҵ']),
+ ('Ҷ', &['ҷ']),
+ ('ҷ', &['Ҷ']),
+ ('Ҹ', &['ҹ']),
+ ('ҹ', &['Ҹ']),
+ ('Һ', &['һ']),
+ ('һ', &['Һ']),
+ ('Ҽ', &['ҽ']),
+ ('ҽ', &['Ҽ']),
+ ('Ҿ', &['ҿ']),
+ ('ҿ', &['Ҿ']),
+ ('Ӏ', &['ӏ']),
+ ('Ӂ', &['ӂ']),
+ ('ӂ', &['Ӂ']),
+ ('Ӄ', &['ӄ']),
+ ('ӄ', &['Ӄ']),
+ ('Ӆ', &['ӆ']),
+ ('ӆ', &['Ӆ']),
+ ('Ӈ', &['ӈ']),
+ ('ӈ', &['Ӈ']),
+ ('Ӊ', &['ӊ']),
+ ('ӊ', &['Ӊ']),
+ ('Ӌ', &['ӌ']),
+ ('ӌ', &['Ӌ']),
+ ('Ӎ', &['ӎ']),
+ ('ӎ', &['Ӎ']),
+ ('ӏ', &['Ӏ']),
+ ('Ӑ', &['ӑ']),
+ ('ӑ', &['Ӑ']),
+ ('Ӓ', &['ӓ']),
+ ('ӓ', &['Ӓ']),
+ ('Ӕ', &['ӕ']),
+ ('ӕ', &['Ӕ']),
+ ('Ӗ', &['ӗ']),
+ ('ӗ', &['Ӗ']),
+ ('Ә', &['ә']),
+ ('ә', &['Ә']),
+ ('Ӛ', &['ӛ']),
+ ('ӛ', &['Ӛ']),
+ ('Ӝ', &['ӝ']),
+ ('ӝ', &['Ӝ']),
+ ('Ӟ', &['ӟ']),
+ ('ӟ', &['Ӟ']),
+ ('Ӡ', &['ӡ']),
+ ('ӡ', &['Ӡ']),
+ ('Ӣ', &['ӣ']),
+ ('ӣ', &['Ӣ']),
+ ('Ӥ', &['ӥ']),
+ ('ӥ', &['Ӥ']),
+ ('Ӧ', &['ӧ']),
+ ('ӧ', &['Ӧ']),
+ ('Ө', &['ө']),
+ ('ө', &['Ө']),
+ ('Ӫ', &['ӫ']),
+ ('ӫ', &['Ӫ']),
+ ('Ӭ', &['ӭ']),
+ ('ӭ', &['Ӭ']),
+ ('Ӯ', &['ӯ']),
+ ('ӯ', &['Ӯ']),
+ ('Ӱ', &['ӱ']),
+ ('ӱ', &['Ӱ']),
+ ('Ӳ', &['ӳ']),
+ ('ӳ', &['Ӳ']),
+ ('Ӵ', &['ӵ']),
+ ('ӵ', &['Ӵ']),
+ ('Ӷ', &['ӷ']),
+ ('ӷ', &['Ӷ']),
+ ('Ӹ', &['ӹ']),
+ ('ӹ', &['Ӹ']),
+ ('Ӻ', &['ӻ']),
+ ('ӻ', &['Ӻ']),
+ ('Ӽ', &['ӽ']),
+ ('ӽ', &['Ӽ']),
+ ('Ӿ', &['ӿ']),
+ ('ӿ', &['Ӿ']),
+ ('Ԁ', &['ԁ']),
+ ('ԁ', &['Ԁ']),
+ ('Ԃ', &['ԃ']),
+ ('ԃ', &['Ԃ']),
+ ('Ԅ', &['ԅ']),
+ ('ԅ', &['Ԅ']),
+ ('Ԇ', &['ԇ']),
+ ('ԇ', &['Ԇ']),
+ ('Ԉ', &['ԉ']),
+ ('ԉ', &['Ԉ']),
+ ('Ԋ', &['ԋ']),
+ ('ԋ', &['Ԋ']),
+ ('Ԍ', &['ԍ']),
+ ('ԍ', &['Ԍ']),
+ ('Ԏ', &['ԏ']),
+ ('ԏ', &['Ԏ']),
+ ('Ԑ', &['ԑ']),
+ ('ԑ', &['Ԑ']),
+ ('Ԓ', &['ԓ']),
+ ('ԓ', &['Ԓ']),
+ ('Ԕ', &['ԕ']),
+ ('ԕ', &['Ԕ']),
+ ('Ԗ', &['ԗ']),
+ ('ԗ', &['Ԗ']),
+ ('Ԙ', &['ԙ']),
+ ('ԙ', &['Ԙ']),
+ ('Ԛ', &['ԛ']),
+ ('ԛ', &['Ԛ']),
+ ('Ԝ', &['ԝ']),
+ ('ԝ', &['Ԝ']),
+ ('Ԟ', &['ԟ']),
+ ('ԟ', &['Ԟ']),
+ ('Ԡ', &['ԡ']),
+ ('ԡ', &['Ԡ']),
+ ('Ԣ', &['ԣ']),
+ ('ԣ', &['Ԣ']),
+ ('Ԥ', &['ԥ']),
+ ('ԥ', &['Ԥ']),
+ ('Ԧ', &['ԧ']),
+ ('ԧ', &['Ԧ']),
+ ('Ԩ', &['ԩ']),
+ ('ԩ', &['Ԩ']),
+ ('Ԫ', &['ԫ']),
+ ('ԫ', &['Ԫ']),
+ ('Ԭ', &['ԭ']),
+ ('ԭ', &['Ԭ']),
+ ('Ԯ', &['ԯ']),
+ ('ԯ', &['Ԯ']),
+ ('Ա', &['ա']),
+ ('Բ', &['բ']),
+ ('Գ', &['գ']),
+ ('Դ', &['դ']),
+ ('Ե', &['ե']),
+ ('Զ', &['զ']),
+ ('Է', &['է']),
+ ('Ը', &['ը']),
+ ('Թ', &['թ']),
+ ('Ժ', &['ժ']),
+ ('Ի', &['ի']),
+ ('Լ', &['լ']),
+ ('Խ', &['խ']),
+ ('Ծ', &['ծ']),
+ ('Կ', &['կ']),
+ ('Հ', &['հ']),
+ ('Ձ', &['ձ']),
+ ('Ղ', &['ղ']),
+ ('Ճ', &['ճ']),
+ ('Մ', &['մ']),
+ ('Յ', &['յ']),
+ ('Ն', &['ն']),
+ ('Շ', &['շ']),
+ ('Ո', &['ո']),
+ ('Չ', &['չ']),
+ ('Պ', &['պ']),
+ ('Ջ', &['ջ']),
+ ('Ռ', &['ռ']),
+ ('Ս', &['ս']),
+ ('Վ', &['վ']),
+ ('Տ', &['տ']),
+ ('Ր', &['ր']),
+ ('Ց', &['ց']),
+ ('Ւ', &['ւ']),
+ ('Փ', &['փ']),
+ ('Ք', &['ք']),
+ ('Օ', &['օ']),
+ ('Ֆ', &['ֆ']),
+ ('ա', &['Ա']),
+ ('բ', &['Բ']),
+ ('գ', &['Գ']),
+ ('դ', &['Դ']),
+ ('ե', &['Ե']),
+ ('զ', &['Զ']),
+ ('է', &['Է']),
+ ('ը', &['Ը']),
+ ('թ', &['Թ']),
+ ('ժ', &['Ժ']),
+ ('ի', &['Ի']),
+ ('լ', &['Լ']),
+ ('խ', &['Խ']),
+ ('ծ', &['Ծ']),
+ ('կ', &['Կ']),
+ ('հ', &['Հ']),
+ ('ձ', &['Ձ']),
+ ('ղ', &['Ղ']),
+ ('ճ', &['Ճ']),
+ ('մ', &['Մ']),
+ ('յ', &['Յ']),
+ ('ն', &['Ն']),
+ ('շ', &['Շ']),
+ ('ո', &['Ո']),
+ ('չ', &['Չ']),
+ ('պ', &['Պ']),
+ ('ջ', &['Ջ']),
+ ('ռ', &['Ռ']),
+ ('ս', &['Ս']),
+ ('վ', &['Վ']),
+ ('տ', &['Տ']),
+ ('ր', &['Ր']),
+ ('ց', &['Ց']),
+ ('ւ', &['Ւ']),
+ ('փ', &['Փ']),
+ ('ք', &['Ք']),
+ ('օ', &['Օ']),
+ ('ֆ', &['Ֆ']),
+ ('Ⴀ', &['ⴀ']),
+ ('Ⴁ', &['ⴁ']),
+ ('Ⴂ', &['ⴂ']),
+ ('Ⴃ', &['ⴃ']),
+ ('Ⴄ', &['ⴄ']),
+ ('Ⴅ', &['ⴅ']),
+ ('Ⴆ', &['ⴆ']),
+ ('Ⴇ', &['ⴇ']),
+ ('Ⴈ', &['ⴈ']),
+ ('Ⴉ', &['ⴉ']),
+ ('Ⴊ', &['ⴊ']),
+ ('Ⴋ', &['ⴋ']),
+ ('Ⴌ', &['ⴌ']),
+ ('Ⴍ', &['ⴍ']),
+ ('Ⴎ', &['ⴎ']),
+ ('Ⴏ', &['ⴏ']),
+ ('Ⴐ', &['ⴐ']),
+ ('Ⴑ', &['ⴑ']),
+ ('Ⴒ', &['ⴒ']),
+ ('Ⴓ', &['ⴓ']),
+ ('Ⴔ', &['ⴔ']),
+ ('Ⴕ', &['ⴕ']),
+ ('Ⴖ', &['ⴖ']),
+ ('Ⴗ', &['ⴗ']),
+ ('Ⴘ', &['ⴘ']),
+ ('Ⴙ', &['ⴙ']),
+ ('Ⴚ', &['ⴚ']),
+ ('Ⴛ', &['ⴛ']),
+ ('Ⴜ', &['ⴜ']),
+ ('Ⴝ', &['ⴝ']),
+ ('Ⴞ', &['ⴞ']),
+ ('Ⴟ', &['ⴟ']),
+ ('Ⴠ', &['ⴠ']),
+ ('Ⴡ', &['ⴡ']),
+ ('Ⴢ', &['ⴢ']),
+ ('Ⴣ', &['ⴣ']),
+ ('Ⴤ', &['ⴤ']),
+ ('Ⴥ', &['ⴥ']),
+ ('Ⴧ', &['ⴧ']),
+ ('Ⴭ', &['ⴭ']),
+ ('ა', &['Ა']),
+ ('ბ', &['Ბ']),
+ ('გ', &['Გ']),
+ ('დ', &['Დ']),
+ ('ე', &['Ე']),
+ ('ვ', &['Ვ']),
+ ('ზ', &['Ზ']),
+ ('თ', &['Თ']),
+ ('ი', &['Ი']),
+ ('კ', &['Კ']),
+ ('ლ', &['Ლ']),
+ ('მ', &['Მ']),
+ ('ნ', &['Ნ']),
+ ('ო', &['Ო']),
+ ('პ', &['Პ']),
+ ('ჟ', &['Ჟ']),
+ ('რ', &['Რ']),
+ ('ს', &['Ს']),
+ ('ტ', &['Ტ']),
+ ('უ', &['Უ']),
+ ('ფ', &['Ფ']),
+ ('ქ', &['Ქ']),
+ ('ღ', &['Ღ']),
+ ('ყ', &['Ყ']),
+ ('შ', &['Შ']),
+ ('ჩ', &['Ჩ']),
+ ('ც', &['Ც']),
+ ('ძ', &['Ძ']),
+ ('წ', &['Წ']),
+ ('ჭ', &['Ჭ']),
+ ('ხ', &['Ხ']),
+ ('ჯ', &['Ჯ']),
+ ('ჰ', &['Ჰ']),
+ ('ჱ', &['Ჱ']),
+ ('ჲ', &['Ჲ']),
+ ('ჳ', &['Ჳ']),
+ ('ჴ', &['Ჴ']),
+ ('ჵ', &['Ჵ']),
+ ('ჶ', &['Ჶ']),
+ ('ჷ', &['Ჷ']),
+ ('ჸ', &['Ჸ']),
+ ('ჹ', &['Ჹ']),
+ ('ჺ', &['Ჺ']),
+ ('ჽ', &['Ჽ']),
+ ('ჾ', &['Ჾ']),
+ ('ჿ', &['Ჿ']),
+ ('Ꭰ', &['ꭰ']),
+ ('Ꭱ', &['ꭱ']),
+ ('Ꭲ', &['ꭲ']),
+ ('Ꭳ', &['ꭳ']),
+ ('Ꭴ', &['ꭴ']),
+ ('Ꭵ', &['ꭵ']),
+ ('Ꭶ', &['ꭶ']),
+ ('Ꭷ', &['ꭷ']),
+ ('Ꭸ', &['ꭸ']),
+ ('Ꭹ', &['ꭹ']),
+ ('Ꭺ', &['ꭺ']),
+ ('Ꭻ', &['ꭻ']),
+ ('Ꭼ', &['ꭼ']),
+ ('Ꭽ', &['ꭽ']),
+ ('Ꭾ', &['ꭾ']),
+ ('Ꭿ', &['ꭿ']),
+ ('Ꮀ', &['ꮀ']),
+ ('Ꮁ', &['ꮁ']),
+ ('Ꮂ', &['ꮂ']),
+ ('Ꮃ', &['ꮃ']),
+ ('Ꮄ', &['ꮄ']),
+ ('Ꮅ', &['ꮅ']),
+ ('Ꮆ', &['ꮆ']),
+ ('Ꮇ', &['ꮇ']),
+ ('Ꮈ', &['ꮈ']),
+ ('Ꮉ', &['ꮉ']),
+ ('Ꮊ', &['ꮊ']),
+ ('Ꮋ', &['ꮋ']),
+ ('Ꮌ', &['ꮌ']),
+ ('Ꮍ', &['ꮍ']),
+ ('Ꮎ', &['ꮎ']),
+ ('Ꮏ', &['ꮏ']),
+ ('Ꮐ', &['ꮐ']),
+ ('Ꮑ', &['ꮑ']),
+ ('Ꮒ', &['ꮒ']),
+ ('Ꮓ', &['ꮓ']),
+ ('Ꮔ', &['ꮔ']),
+ ('Ꮕ', &['ꮕ']),
+ ('Ꮖ', &['ꮖ']),
+ ('Ꮗ', &['ꮗ']),
+ ('Ꮘ', &['ꮘ']),
+ ('Ꮙ', &['ꮙ']),
+ ('Ꮚ', &['ꮚ']),
+ ('Ꮛ', &['ꮛ']),
+ ('Ꮜ', &['ꮜ']),
+ ('Ꮝ', &['ꮝ']),
+ ('Ꮞ', &['ꮞ']),
+ ('Ꮟ', &['ꮟ']),
+ ('Ꮠ', &['ꮠ']),
+ ('Ꮡ', &['ꮡ']),
+ ('Ꮢ', &['ꮢ']),
+ ('Ꮣ', &['ꮣ']),
+ ('Ꮤ', &['ꮤ']),
+ ('Ꮥ', &['ꮥ']),
+ ('Ꮦ', &['ꮦ']),
+ ('Ꮧ', &['ꮧ']),
+ ('Ꮨ', &['ꮨ']),
+ ('Ꮩ', &['ꮩ']),
+ ('Ꮪ', &['ꮪ']),
+ ('Ꮫ', &['ꮫ']),
+ ('Ꮬ', &['ꮬ']),
+ ('Ꮭ', &['ꮭ']),
+ ('Ꮮ', &['ꮮ']),
+ ('Ꮯ', &['ꮯ']),
+ ('Ꮰ', &['ꮰ']),
+ ('Ꮱ', &['ꮱ']),
+ ('Ꮲ', &['ꮲ']),
+ ('Ꮳ', &['ꮳ']),
+ ('Ꮴ', &['ꮴ']),
+ ('Ꮵ', &['ꮵ']),
+ ('Ꮶ', &['ꮶ']),
+ ('Ꮷ', &['ꮷ']),
+ ('Ꮸ', &['ꮸ']),
+ ('Ꮹ', &['ꮹ']),
+ ('Ꮺ', &['ꮺ']),
+ ('Ꮻ', &['ꮻ']),
+ ('Ꮼ', &['ꮼ']),
+ ('Ꮽ', &['ꮽ']),
+ ('Ꮾ', &['ꮾ']),
+ ('Ꮿ', &['ꮿ']),
+ ('Ᏸ', &['ᏸ']),
+ ('Ᏹ', &['ᏹ']),
+ ('Ᏺ', &['ᏺ']),
+ ('Ᏻ', &['ᏻ']),
+ ('Ᏼ', &['ᏼ']),
+ ('Ᏽ', &['ᏽ']),
+ ('ᏸ', &['Ᏸ']),
+ ('ᏹ', &['Ᏹ']),
+ ('ᏺ', &['Ᏺ']),
+ ('ᏻ', &['Ᏻ']),
+ ('ᏼ', &['Ᏼ']),
+ ('ᏽ', &['Ᏽ']),
+ ('ᲀ', &['В', 'в']),
+ ('ᲁ', &['Д', 'д']),
+ ('ᲂ', &['О', 'о']),
+ ('ᲃ', &['С', 'с']),
+ ('ᲄ', &['Т', 'т', 'ᲅ']),
+ ('ᲅ', &['Т', 'т', 'ᲄ']),
+ ('ᲆ', &['Ъ', 'ъ']),
+ ('ᲇ', &['Ѣ', 'ѣ']),
+ ('ᲈ', &['Ꙋ', 'ꙋ']),
+ ('Ა', &['ა']),
+ ('Ბ', &['ბ']),
+ ('Გ', &['გ']),
+ ('Დ', &['დ']),
+ ('Ე', &['ე']),
+ ('Ვ', &['ვ']),
+ ('Ზ', &['ზ']),
+ ('Თ', &['თ']),
+ ('Ი', &['ი']),
+ ('Კ', &['კ']),
+ ('Ლ', &['ლ']),
+ ('Მ', &['მ']),
+ ('Ნ', &['ნ']),
+ ('Ო', &['ო']),
+ ('Პ', &['პ']),
+ ('Ჟ', &['ჟ']),
+ ('Რ', &['რ']),
+ ('Ს', &['ს']),
+ ('Ტ', &['ტ']),
+ ('Უ', &['უ']),
+ ('Ფ', &['ფ']),
+ ('Ქ', &['ქ']),
+ ('Ღ', &['ღ']),
+ ('Ყ', &['ყ']),
+ ('Შ', &['შ']),
+ ('Ჩ', &['ჩ']),
+ ('Ც', &['ც']),
+ ('Ძ', &['ძ']),
+ ('Წ', &['წ']),
+ ('Ჭ', &['ჭ']),
+ ('Ხ', &['ხ']),
+ ('Ჯ', &['ჯ']),
+ ('Ჰ', &['ჰ']),
+ ('Ჱ', &['ჱ']),
+ ('Ჲ', &['ჲ']),
+ ('Ჳ', &['ჳ']),
+ ('Ჴ', &['ჴ']),
+ ('Ჵ', &['ჵ']),
+ ('Ჶ', &['ჶ']),
+ ('Ჷ', &['ჷ']),
+ ('Ჸ', &['ჸ']),
+ ('Ჹ', &['ჹ']),
+ ('Ჺ', &['ჺ']),
+ ('Ჽ', &['ჽ']),
+ ('Ჾ', &['ჾ']),
+ ('Ჿ', &['ჿ']),
+ ('ᵹ', &['Ᵹ']),
+ ('ᵽ', &['Ᵽ']),
+ ('ᶎ', &['Ᶎ']),
+ ('Ḁ', &['ḁ']),
+ ('ḁ', &['Ḁ']),
+ ('Ḃ', &['ḃ']),
+ ('ḃ', &['Ḃ']),
+ ('Ḅ', &['ḅ']),
+ ('ḅ', &['Ḅ']),
+ ('Ḇ', &['ḇ']),
+ ('ḇ', &['Ḇ']),
+ ('Ḉ', &['ḉ']),
+ ('ḉ', &['Ḉ']),
+ ('Ḋ', &['ḋ']),
+ ('ḋ', &['Ḋ']),
+ ('Ḍ', &['ḍ']),
+ ('ḍ', &['Ḍ']),
+ ('Ḏ', &['ḏ']),
+ ('ḏ', &['Ḏ']),
+ ('Ḑ', &['ḑ']),
+ ('ḑ', &['Ḑ']),
+ ('Ḓ', &['ḓ']),
+ ('ḓ', &['Ḓ']),
+ ('Ḕ', &['ḕ']),
+ ('ḕ', &['Ḕ']),
+ ('Ḗ', &['ḗ']),
+ ('ḗ', &['Ḗ']),
+ ('Ḙ', &['ḙ']),
+ ('ḙ', &['Ḙ']),
+ ('Ḛ', &['ḛ']),
+ ('ḛ', &['Ḛ']),
+ ('Ḝ', &['ḝ']),
+ ('ḝ', &['Ḝ']),
+ ('Ḟ', &['ḟ']),
+ ('ḟ', &['Ḟ']),
+ ('Ḡ', &['ḡ']),
+ ('ḡ', &['Ḡ']),
+ ('Ḣ', &['ḣ']),
+ ('ḣ', &['Ḣ']),
+ ('Ḥ', &['ḥ']),
+ ('ḥ', &['Ḥ']),
+ ('Ḧ', &['ḧ']),
+ ('ḧ', &['Ḧ']),
+ ('Ḩ', &['ḩ']),
+ ('ḩ', &['Ḩ']),
+ ('Ḫ', &['ḫ']),
+ ('ḫ', &['Ḫ']),
+ ('Ḭ', &['ḭ']),
+ ('ḭ', &['Ḭ']),
+ ('Ḯ', &['ḯ']),
+ ('ḯ', &['Ḯ']),
+ ('Ḱ', &['ḱ']),
+ ('ḱ', &['Ḱ']),
+ ('Ḳ', &['ḳ']),
+ ('ḳ', &['Ḳ']),
+ ('Ḵ', &['ḵ']),
+ ('ḵ', &['Ḵ']),
+ ('Ḷ', &['ḷ']),
+ ('ḷ', &['Ḷ']),
+ ('Ḹ', &['ḹ']),
+ ('ḹ', &['Ḹ']),
+ ('Ḻ', &['ḻ']),
+ ('ḻ', &['Ḻ']),
+ ('Ḽ', &['ḽ']),
+ ('ḽ', &['Ḽ']),
+ ('Ḿ', &['ḿ']),
+ ('ḿ', &['Ḿ']),
+ ('Ṁ', &['ṁ']),
+ ('ṁ', &['Ṁ']),
+ ('Ṃ', &['ṃ']),
+ ('ṃ', &['Ṃ']),
+ ('Ṅ', &['ṅ']),
+ ('ṅ', &['Ṅ']),
+ ('Ṇ', &['ṇ']),
+ ('ṇ', &['Ṇ']),
+ ('Ṉ', &['ṉ']),
+ ('ṉ', &['Ṉ']),
+ ('Ṋ', &['ṋ']),
+ ('ṋ', &['Ṋ']),
+ ('Ṍ', &['ṍ']),
+ ('ṍ', &['Ṍ']),
+ ('Ṏ', &['ṏ']),
+ ('ṏ', &['Ṏ']),
+ ('Ṑ', &['ṑ']),
+ ('ṑ', &['Ṑ']),
+ ('Ṓ', &['ṓ']),
+ ('ṓ', &['Ṓ']),
+ ('Ṕ', &['ṕ']),
+ ('ṕ', &['Ṕ']),
+ ('Ṗ', &['ṗ']),
+ ('ṗ', &['Ṗ']),
+ ('Ṙ', &['ṙ']),
+ ('ṙ', &['Ṙ']),
+ ('Ṛ', &['ṛ']),
+ ('ṛ', &['Ṛ']),
+ ('Ṝ', &['ṝ']),
+ ('ṝ', &['Ṝ']),
+ ('Ṟ', &['ṟ']),
+ ('ṟ', &['Ṟ']),
+ ('Ṡ', &['ṡ', 'ẛ']),
+ ('ṡ', &['Ṡ', 'ẛ']),
+ ('Ṣ', &['ṣ']),
+ ('ṣ', &['Ṣ']),
+ ('Ṥ', &['ṥ']),
+ ('ṥ', &['Ṥ']),
+ ('Ṧ', &['ṧ']),
+ ('ṧ', &['Ṧ']),
+ ('Ṩ', &['ṩ']),
+ ('ṩ', &['Ṩ']),
+ ('Ṫ', &['ṫ']),
+ ('ṫ', &['Ṫ']),
+ ('Ṭ', &['ṭ']),
+ ('ṭ', &['Ṭ']),
+ ('Ṯ', &['ṯ']),
+ ('ṯ', &['Ṯ']),
+ ('Ṱ', &['ṱ']),
+ ('ṱ', &['Ṱ']),
+ ('Ṳ', &['ṳ']),
+ ('ṳ', &['Ṳ']),
+ ('Ṵ', &['ṵ']),
+ ('ṵ', &['Ṵ']),
+ ('Ṷ', &['ṷ']),
+ ('ṷ', &['Ṷ']),
+ ('Ṹ', &['ṹ']),
+ ('ṹ', &['Ṹ']),
+ ('Ṻ', &['ṻ']),
+ ('ṻ', &['Ṻ']),
+ ('Ṽ', &['ṽ']),
+ ('ṽ', &['Ṽ']),
+ ('Ṿ', &['ṿ']),
+ ('ṿ', &['Ṿ']),
+ ('Ẁ', &['ẁ']),
+ ('ẁ', &['Ẁ']),
+ ('Ẃ', &['ẃ']),
+ ('ẃ', &['Ẃ']),
+ ('Ẅ', &['ẅ']),
+ ('ẅ', &['Ẅ']),
+ ('Ẇ', &['ẇ']),
+ ('ẇ', &['Ẇ']),
+ ('Ẉ', &['ẉ']),
+ ('ẉ', &['Ẉ']),
+ ('Ẋ', &['ẋ']),
+ ('ẋ', &['Ẋ']),
+ ('Ẍ', &['ẍ']),
+ ('ẍ', &['Ẍ']),
+ ('Ẏ', &['ẏ']),
+ ('ẏ', &['Ẏ']),
+ ('Ẑ', &['ẑ']),
+ ('ẑ', &['Ẑ']),
+ ('Ẓ', &['ẓ']),
+ ('ẓ', &['Ẓ']),
+ ('Ẕ', &['ẕ']),
+ ('ẕ', &['Ẕ']),
+ ('ẛ', &['Ṡ', 'ṡ']),
+ ('ẞ', &['ß']),
+ ('Ạ', &['ạ']),
+ ('ạ', &['Ạ']),
+ ('Ả', &['ả']),
+ ('ả', &['Ả']),
+ ('Ấ', &['ấ']),
+ ('ấ', &['Ấ']),
+ ('Ầ', &['ầ']),
+ ('ầ', &['Ầ']),
+ ('Ẩ', &['ẩ']),
+ ('ẩ', &['Ẩ']),
+ ('Ẫ', &['ẫ']),
+ ('ẫ', &['Ẫ']),
+ ('Ậ', &['ậ']),
+ ('ậ', &['Ậ']),
+ ('Ắ', &['ắ']),
+ ('ắ', &['Ắ']),
+ ('Ằ', &['ằ']),
+ ('ằ', &['Ằ']),
+ ('Ẳ', &['ẳ']),
+ ('ẳ', &['Ẳ']),
+ ('Ẵ', &['ẵ']),
+ ('ẵ', &['Ẵ']),
+ ('Ặ', &['ặ']),
+ ('ặ', &['Ặ']),
+ ('Ẹ', &['ẹ']),
+ ('ẹ', &['Ẹ']),
+ ('Ẻ', &['ẻ']),
+ ('ẻ', &['Ẻ']),
+ ('Ẽ', &['ẽ']),
+ ('ẽ', &['Ẽ']),
+ ('Ế', &['ế']),
+ ('ế', &['Ế']),
+ ('Ề', &['ề']),
+ ('ề', &['Ề']),
+ ('Ể', &['ể']),
+ ('ể', &['Ể']),
+ ('Ễ', &['ễ']),
+ ('ễ', &['Ễ']),
+ ('Ệ', &['ệ']),
+ ('ệ', &['Ệ']),
+ ('Ỉ', &['ỉ']),
+ ('ỉ', &['Ỉ']),
+ ('Ị', &['ị']),
+ ('ị', &['Ị']),
+ ('Ọ', &['ọ']),
+ ('ọ', &['Ọ']),
+ ('Ỏ', &['ỏ']),
+ ('ỏ', &['Ỏ']),
+ ('Ố', &['ố']),
+ ('ố', &['Ố']),
+ ('Ồ', &['ồ']),
+ ('ồ', &['Ồ']),
+ ('Ổ', &['ổ']),
+ ('ổ', &['Ổ']),
+ ('Ỗ', &['ỗ']),
+ ('ỗ', &['Ỗ']),
+ ('Ộ', &['ộ']),
+ ('ộ', &['Ộ']),
+ ('Ớ', &['ớ']),
+ ('ớ', &['Ớ']),
+ ('Ờ', &['ờ']),
+ ('ờ', &['Ờ']),
+ ('Ở', &['ở']),
+ ('ở', &['Ở']),
+ ('Ỡ', &['ỡ']),
+ ('ỡ', &['Ỡ']),
+ ('Ợ', &['ợ']),
+ ('ợ', &['Ợ']),
+ ('Ụ', &['ụ']),
+ ('ụ', &['Ụ']),
+ ('Ủ', &['ủ']),
+ ('ủ', &['Ủ']),
+ ('Ứ', &['ứ']),
+ ('ứ', &['Ứ']),
+ ('Ừ', &['ừ']),
+ ('ừ', &['Ừ']),
+ ('Ử', &['ử']),
+ ('ử', &['Ử']),
+ ('Ữ', &['ữ']),
+ ('ữ', &['Ữ']),
+ ('Ự', &['ự']),
+ ('ự', &['Ự']),
+ ('Ỳ', &['ỳ']),
+ ('ỳ', &['Ỳ']),
+ ('Ỵ', &['ỵ']),
+ ('ỵ', &['Ỵ']),
+ ('Ỷ', &['ỷ']),
+ ('ỷ', &['Ỷ']),
+ ('Ỹ', &['ỹ']),
+ ('ỹ', &['Ỹ']),
+ ('Ỻ', &['ỻ']),
+ ('ỻ', &['Ỻ']),
+ ('Ỽ', &['ỽ']),
+ ('ỽ', &['Ỽ']),
+ ('Ỿ', &['ỿ']),
+ ('ỿ', &['Ỿ']),
+ ('ἀ', &['Ἀ']),
+ ('ἁ', &['Ἁ']),
+ ('ἂ', &['Ἂ']),
+ ('ἃ', &['Ἃ']),
+ ('ἄ', &['Ἄ']),
+ ('ἅ', &['Ἅ']),
+ ('ἆ', &['Ἆ']),
+ ('ἇ', &['Ἇ']),
+ ('Ἀ', &['ἀ']),
+ ('Ἁ', &['ἁ']),
+ ('Ἂ', &['ἂ']),
+ ('Ἃ', &['ἃ']),
+ ('Ἄ', &['ἄ']),
+ ('Ἅ', &['ἅ']),
+ ('Ἆ', &['ἆ']),
+ ('Ἇ', &['ἇ']),
+ ('ἐ', &['Ἐ']),
+ ('ἑ', &['Ἑ']),
+ ('ἒ', &['Ἒ']),
+ ('ἓ', &['Ἓ']),
+ ('ἔ', &['Ἔ']),
+ ('ἕ', &['Ἕ']),
+ ('Ἐ', &['ἐ']),
+ ('Ἑ', &['ἑ']),
+ ('Ἒ', &['ἒ']),
+ ('Ἓ', &['ἓ']),
+ ('Ἔ', &['ἔ']),
+ ('Ἕ', &['ἕ']),
+ ('ἠ', &['Ἠ']),
+ ('ἡ', &['Ἡ']),
+ ('ἢ', &['Ἢ']),
+ ('ἣ', &['Ἣ']),
+ ('ἤ', &['Ἤ']),
+ ('ἥ', &['Ἥ']),
+ ('ἦ', &['Ἦ']),
+ ('ἧ', &['Ἧ']),
+ ('Ἠ', &['ἠ']),
+ ('Ἡ', &['ἡ']),
+ ('Ἢ', &['ἢ']),
+ ('Ἣ', &['ἣ']),
+ ('Ἤ', &['ἤ']),
+ ('Ἥ', &['ἥ']),
+ ('Ἦ', &['ἦ']),
+ ('Ἧ', &['ἧ']),
+ ('ἰ', &['Ἰ']),
+ ('ἱ', &['Ἱ']),
+ ('ἲ', &['Ἲ']),
+ ('ἳ', &['Ἳ']),
+ ('ἴ', &['Ἴ']),
+ ('ἵ', &['Ἵ']),
+ ('ἶ', &['Ἶ']),
+ ('ἷ', &['Ἷ']),
+ ('Ἰ', &['ἰ']),
+ ('Ἱ', &['ἱ']),
+ ('Ἲ', &['ἲ']),
+ ('Ἳ', &['ἳ']),
+ ('Ἴ', &['ἴ']),
+ ('Ἵ', &['ἵ']),
+ ('Ἶ', &['ἶ']),
+ ('Ἷ', &['ἷ']),
+ ('ὀ', &['Ὀ']),
+ ('ὁ', &['Ὁ']),
+ ('ὂ', &['Ὂ']),
+ ('ὃ', &['Ὃ']),
+ ('ὄ', &['Ὄ']),
+ ('ὅ', &['Ὅ']),
+ ('Ὀ', &['ὀ']),
+ ('Ὁ', &['ὁ']),
+ ('Ὂ', &['ὂ']),
+ ('Ὃ', &['ὃ']),
+ ('Ὄ', &['ὄ']),
+ ('Ὅ', &['ὅ']),
+ ('ὑ', &['Ὑ']),
+ ('ὓ', &['Ὓ']),
+ ('ὕ', &['Ὕ']),
+ ('ὗ', &['Ὗ']),
+ ('Ὑ', &['ὑ']),
+ ('Ὓ', &['ὓ']),
+ ('Ὕ', &['ὕ']),
+ ('Ὗ', &['ὗ']),
+ ('ὠ', &['Ὠ']),
+ ('ὡ', &['Ὡ']),
+ ('ὢ', &['Ὢ']),
+ ('ὣ', &['Ὣ']),
+ ('ὤ', &['Ὤ']),
+ ('ὥ', &['Ὥ']),
+ ('ὦ', &['Ὦ']),
+ ('ὧ', &['Ὧ']),
+ ('Ὠ', &['ὠ']),
+ ('Ὡ', &['ὡ']),
+ ('Ὢ', &['ὢ']),
+ ('Ὣ', &['ὣ']),
+ ('Ὤ', &['ὤ']),
+ ('Ὥ', &['ὥ']),
+ ('Ὦ', &['ὦ']),
+ ('Ὧ', &['ὧ']),
+ ('ὰ', &['Ὰ']),
+ ('ά', &['Ά']),
+ ('ὲ', &['Ὲ']),
+ ('έ', &['Έ']),
+ ('ὴ', &['Ὴ']),
+ ('ή', &['Ή']),
+ ('ὶ', &['Ὶ']),
+ ('ί', &['Ί']),
+ ('ὸ', &['Ὸ']),
+ ('ό', &['Ό']),
+ ('ὺ', &['Ὺ']),
+ ('ύ', &['Ύ']),
+ ('ὼ', &['Ὼ']),
+ ('ώ', &['Ώ']),
+ ('ᾀ', &['ᾈ']),
+ ('ᾁ', &['ᾉ']),
+ ('ᾂ', &['ᾊ']),
+ ('ᾃ', &['ᾋ']),
+ ('ᾄ', &['ᾌ']),
+ ('ᾅ', &['ᾍ']),
+ ('ᾆ', &['ᾎ']),
+ ('ᾇ', &['ᾏ']),
+ ('ᾈ', &['ᾀ']),
+ ('ᾉ', &['ᾁ']),
+ ('ᾊ', &['ᾂ']),
+ ('ᾋ', &['ᾃ']),
+ ('ᾌ', &['ᾄ']),
+ ('ᾍ', &['ᾅ']),
+ ('ᾎ', &['ᾆ']),
+ ('ᾏ', &['ᾇ']),
+ ('ᾐ', &['ᾘ']),
+ ('ᾑ', &['ᾙ']),
+ ('ᾒ', &['ᾚ']),
+ ('ᾓ', &['ᾛ']),
+ ('ᾔ', &['ᾜ']),
+ ('ᾕ', &['ᾝ']),
+ ('ᾖ', &['ᾞ']),
+ ('ᾗ', &['ᾟ']),
+ ('ᾘ', &['ᾐ']),
+ ('ᾙ', &['ᾑ']),
+ ('ᾚ', &['ᾒ']),
+ ('ᾛ', &['ᾓ']),
+ ('ᾜ', &['ᾔ']),
+ ('ᾝ', &['ᾕ']),
+ ('ᾞ', &['ᾖ']),
+ ('ᾟ', &['ᾗ']),
+ ('ᾠ', &['ᾨ']),
+ ('ᾡ', &['ᾩ']),
+ ('ᾢ', &['ᾪ']),
+ ('ᾣ', &['ᾫ']),
+ ('ᾤ', &['ᾬ']),
+ ('ᾥ', &['ᾭ']),
+ ('ᾦ', &['ᾮ']),
+ ('ᾧ', &['ᾯ']),
+ ('ᾨ', &['ᾠ']),
+ ('ᾩ', &['ᾡ']),
+ ('ᾪ', &['ᾢ']),
+ ('ᾫ', &['ᾣ']),
+ ('ᾬ', &['ᾤ']),
+ ('ᾭ', &['ᾥ']),
+ ('ᾮ', &['ᾦ']),
+ ('ᾯ', &['ᾧ']),
+ ('ᾰ', &['Ᾰ']),
+ ('ᾱ', &['Ᾱ']),
+ ('ᾳ', &['ᾼ']),
+ ('Ᾰ', &['ᾰ']),
+ ('Ᾱ', &['ᾱ']),
+ ('Ὰ', &['ὰ']),
+ ('Ά', &['ά']),
+ ('ᾼ', &['ᾳ']),
+ ('ι', &['\u{345}', 'Ι', 'ι']),
+ ('ῃ', &['ῌ']),
+ ('Ὲ', &['ὲ']),
+ ('Έ', &['έ']),
+ ('Ὴ', &['ὴ']),
+ ('Ή', &['ή']),
+ ('ῌ', &['ῃ']),
+ ('ῐ', &['Ῐ']),
+ ('ῑ', &['Ῑ']),
+ ('Ῐ', &['ῐ']),
+ ('Ῑ', &['ῑ']),
+ ('Ὶ', &['ὶ']),
+ ('Ί', &['ί']),
+ ('ῠ', &['Ῠ']),
+ ('ῡ', &['Ῡ']),
+ ('ῥ', &['Ῥ']),
+ ('Ῠ', &['ῠ']),
+ ('Ῡ', &['ῡ']),
+ ('Ὺ', &['ὺ']),
+ ('Ύ', &['ύ']),
+ ('Ῥ', &['ῥ']),
+ ('ῳ', &['ῼ']),
+ ('Ὸ', &['ὸ']),
+ ('Ό', &['ό']),
+ ('Ὼ', &['ὼ']),
+ ('Ώ', &['ώ']),
+ ('ῼ', &['ῳ']),
+ ('Ω', &['Ω', 'ω']),
+ ('K', &['K', 'k']),
+ ('Å', &['Å', 'å']),
+ ('Ⅎ', &['ⅎ']),
+ ('ⅎ', &['Ⅎ']),
+ ('Ⅰ', &['ⅰ']),
+ ('Ⅱ', &['ⅱ']),
+ ('Ⅲ', &['ⅲ']),
+ ('Ⅳ', &['ⅳ']),
+ ('Ⅴ', &['ⅴ']),
+ ('Ⅵ', &['ⅵ']),
+ ('Ⅶ', &['ⅶ']),
+ ('Ⅷ', &['ⅷ']),
+ ('Ⅸ', &['ⅸ']),
+ ('Ⅹ', &['ⅹ']),
+ ('Ⅺ', &['ⅺ']),
+ ('Ⅻ', &['ⅻ']),
+ ('Ⅼ', &['ⅼ']),
+ ('Ⅽ', &['ⅽ']),
+ ('Ⅾ', &['ⅾ']),
+ ('Ⅿ', &['ⅿ']),
+ ('ⅰ', &['Ⅰ']),
+ ('ⅱ', &['Ⅱ']),
+ ('ⅲ', &['Ⅲ']),
+ ('ⅳ', &['Ⅳ']),
+ ('ⅴ', &['Ⅴ']),
+ ('ⅵ', &['Ⅵ']),
+ ('ⅶ', &['Ⅶ']),
+ ('ⅷ', &['Ⅷ']),
+ ('ⅸ', &['Ⅸ']),
+ ('ⅹ', &['Ⅹ']),
+ ('ⅺ', &['Ⅺ']),
+ ('ⅻ', &['Ⅻ']),
+ ('ⅼ', &['Ⅼ']),
+ ('ⅽ', &['Ⅽ']),
+ ('ⅾ', &['Ⅾ']),
+ ('ⅿ', &['Ⅿ']),
+ ('Ↄ', &['ↄ']),
+ ('ↄ', &['Ↄ']),
+ ('Ⓐ', &['ⓐ']),
+ ('Ⓑ', &['ⓑ']),
+ ('Ⓒ', &['ⓒ']),
+ ('Ⓓ', &['ⓓ']),
+ ('Ⓔ', &['ⓔ']),
+ ('Ⓕ', &['ⓕ']),
+ ('Ⓖ', &['ⓖ']),
+ ('Ⓗ', &['ⓗ']),
+ ('Ⓘ', &['ⓘ']),
+ ('Ⓙ', &['ⓙ']),
+ ('Ⓚ', &['ⓚ']),
+ ('Ⓛ', &['ⓛ']),
+ ('Ⓜ', &['ⓜ']),
+ ('Ⓝ', &['ⓝ']),
+ ('Ⓞ', &['ⓞ']),
+ ('Ⓟ', &['ⓟ']),
+ ('Ⓠ', &['ⓠ']),
+ ('Ⓡ', &['ⓡ']),
+ ('Ⓢ', &['ⓢ']),
+ ('Ⓣ', &['ⓣ']),
+ ('Ⓤ', &['ⓤ']),
+ ('Ⓥ', &['ⓥ']),
+ ('Ⓦ', &['ⓦ']),
+ ('Ⓧ', &['ⓧ']),
+ ('Ⓨ', &['ⓨ']),
+ ('Ⓩ', &['ⓩ']),
+ ('ⓐ', &['Ⓐ']),
+ ('ⓑ', &['Ⓑ']),
+ ('ⓒ', &['Ⓒ']),
+ ('ⓓ', &['Ⓓ']),
+ ('ⓔ', &['Ⓔ']),
+ ('ⓕ', &['Ⓕ']),
+ ('ⓖ', &['Ⓖ']),
+ ('ⓗ', &['Ⓗ']),
+ ('ⓘ', &['Ⓘ']),
+ ('ⓙ', &['Ⓙ']),
+ ('ⓚ', &['Ⓚ']),
+ ('ⓛ', &['Ⓛ']),
+ ('ⓜ', &['Ⓜ']),
+ ('ⓝ', &['Ⓝ']),
+ ('ⓞ', &['Ⓞ']),
+ ('ⓟ', &['Ⓟ']),
+ ('ⓠ', &['Ⓠ']),
+ ('ⓡ', &['Ⓡ']),
+ ('ⓢ', &['Ⓢ']),
+ ('ⓣ', &['Ⓣ']),
+ ('ⓤ', &['Ⓤ']),
+ ('ⓥ', &['Ⓥ']),
+ ('ⓦ', &['Ⓦ']),
+ ('ⓧ', &['Ⓧ']),
+ ('ⓨ', &['Ⓨ']),
+ ('ⓩ', &['Ⓩ']),
+ ('Ⰰ', &['ⰰ']),
+ ('Ⰱ', &['ⰱ']),
+ ('Ⰲ', &['ⰲ']),
+ ('Ⰳ', &['ⰳ']),
+ ('Ⰴ', &['ⰴ']),
+ ('Ⰵ', &['ⰵ']),
+ ('Ⰶ', &['ⰶ']),
+ ('Ⰷ', &['ⰷ']),
+ ('Ⰸ', &['ⰸ']),
+ ('Ⰹ', &['ⰹ']),
+ ('Ⰺ', &['ⰺ']),
+ ('Ⰻ', &['ⰻ']),
+ ('Ⰼ', &['ⰼ']),
+ ('Ⰽ', &['ⰽ']),
+ ('Ⰾ', &['ⰾ']),
+ ('Ⰿ', &['ⰿ']),
+ ('Ⱀ', &['ⱀ']),
+ ('Ⱁ', &['ⱁ']),
+ ('Ⱂ', &['ⱂ']),
+ ('Ⱃ', &['ⱃ']),
+ ('Ⱄ', &['ⱄ']),
+ ('Ⱅ', &['ⱅ']),
+ ('Ⱆ', &['ⱆ']),
+ ('Ⱇ', &['ⱇ']),
+ ('Ⱈ', &['ⱈ']),
+ ('Ⱉ', &['ⱉ']),
+ ('Ⱊ', &['ⱊ']),
+ ('Ⱋ', &['ⱋ']),
+ ('Ⱌ', &['ⱌ']),
+ ('Ⱍ', &['ⱍ']),
+ ('Ⱎ', &['ⱎ']),
+ ('Ⱏ', &['ⱏ']),
+ ('Ⱐ', &['ⱐ']),
+ ('Ⱑ', &['ⱑ']),
+ ('Ⱒ', &['ⱒ']),
+ ('Ⱓ', &['ⱓ']),
+ ('Ⱔ', &['ⱔ']),
+ ('Ⱕ', &['ⱕ']),
+ ('Ⱖ', &['ⱖ']),
+ ('Ⱗ', &['ⱗ']),
+ ('Ⱘ', &['ⱘ']),
+ ('Ⱙ', &['ⱙ']),
+ ('Ⱚ', &['ⱚ']),
+ ('Ⱛ', &['ⱛ']),
+ ('Ⱜ', &['ⱜ']),
+ ('Ⱝ', &['ⱝ']),
+ ('Ⱞ', &['ⱞ']),
+ ('Ⱟ', &['ⱟ']),
+ ('ⰰ', &['Ⰰ']),
+ ('ⰱ', &['Ⰱ']),
+ ('ⰲ', &['Ⰲ']),
+ ('ⰳ', &['Ⰳ']),
+ ('ⰴ', &['Ⰴ']),
+ ('ⰵ', &['Ⰵ']),
+ ('ⰶ', &['Ⰶ']),
+ ('ⰷ', &['Ⰷ']),
+ ('ⰸ', &['Ⰸ']),
+ ('ⰹ', &['Ⰹ']),
+ ('ⰺ', &['Ⰺ']),
+ ('ⰻ', &['Ⰻ']),
+ ('ⰼ', &['Ⰼ']),
+ ('ⰽ', &['Ⰽ']),
+ ('ⰾ', &['Ⰾ']),
+ ('ⰿ', &['Ⰿ']),
+ ('ⱀ', &['Ⱀ']),
+ ('ⱁ', &['Ⱁ']),
+ ('ⱂ', &['Ⱂ']),
+ ('ⱃ', &['Ⱃ']),
+ ('ⱄ', &['Ⱄ']),
+ ('ⱅ', &['Ⱅ']),
+ ('ⱆ', &['Ⱆ']),
+ ('ⱇ', &['Ⱇ']),
+ ('ⱈ', &['Ⱈ']),
+ ('ⱉ', &['Ⱉ']),
+ ('ⱊ', &['Ⱊ']),
+ ('ⱋ', &['Ⱋ']),
+ ('ⱌ', &['Ⱌ']),
+ ('ⱍ', &['Ⱍ']),
+ ('ⱎ', &['Ⱎ']),
+ ('ⱏ', &['Ⱏ']),
+ ('ⱐ', &['Ⱐ']),
+ ('ⱑ', &['Ⱑ']),
+ ('ⱒ', &['Ⱒ']),
+ ('ⱓ', &['Ⱓ']),
+ ('ⱔ', &['Ⱔ']),
+ ('ⱕ', &['Ⱕ']),
+ ('ⱖ', &['Ⱖ']),
+ ('ⱗ', &['Ⱗ']),
+ ('ⱘ', &['Ⱘ']),
+ ('ⱙ', &['Ⱙ']),
+ ('ⱚ', &['Ⱚ']),
+ ('ⱛ', &['Ⱛ']),
+ ('ⱜ', &['Ⱜ']),
+ ('ⱝ', &['Ⱝ']),
+ ('ⱞ', &['Ⱞ']),
+ ('ⱟ', &['Ⱟ']),
+ ('Ⱡ', &['ⱡ']),
+ ('ⱡ', &['Ⱡ']),
+ ('Ɫ', &['ɫ']),
+ ('Ᵽ', &['ᵽ']),
+ ('Ɽ', &['ɽ']),
+ ('ⱥ', &['Ⱥ']),
+ ('ⱦ', &['Ⱦ']),
+ ('Ⱨ', &['ⱨ']),
+ ('ⱨ', &['Ⱨ']),
+ ('Ⱪ', &['ⱪ']),
+ ('ⱪ', &['Ⱪ']),
+ ('Ⱬ', &['ⱬ']),
+ ('ⱬ', &['Ⱬ']),
+ ('Ɑ', &['ɑ']),
+ ('Ɱ', &['ɱ']),
+ ('Ɐ', &['ɐ']),
+ ('Ɒ', &['ɒ']),
+ ('Ⱳ', &['ⱳ']),
+ ('ⱳ', &['Ⱳ']),
+ ('Ⱶ', &['ⱶ']),
+ ('ⱶ', &['Ⱶ']),
+ ('Ȿ', &['ȿ']),
+ ('Ɀ', &['ɀ']),
+ ('Ⲁ', &['ⲁ']),
+ ('ⲁ', &['Ⲁ']),
+ ('Ⲃ', &['ⲃ']),
+ ('ⲃ', &['Ⲃ']),
+ ('Ⲅ', &['ⲅ']),
+ ('ⲅ', &['Ⲅ']),
+ ('Ⲇ', &['ⲇ']),
+ ('ⲇ', &['Ⲇ']),
+ ('Ⲉ', &['ⲉ']),
+ ('ⲉ', &['Ⲉ']),
+ ('Ⲋ', &['ⲋ']),
+ ('ⲋ', &['Ⲋ']),
+ ('Ⲍ', &['ⲍ']),
+ ('ⲍ', &['Ⲍ']),
+ ('Ⲏ', &['ⲏ']),
+ ('ⲏ', &['Ⲏ']),
+ ('Ⲑ', &['ⲑ']),
+ ('ⲑ', &['Ⲑ']),
+ ('Ⲓ', &['ⲓ']),
+ ('ⲓ', &['Ⲓ']),
+ ('Ⲕ', &['ⲕ']),
+ ('ⲕ', &['Ⲕ']),
+ ('Ⲗ', &['ⲗ']),
+ ('ⲗ', &['Ⲗ']),
+ ('Ⲙ', &['ⲙ']),
+ ('ⲙ', &['Ⲙ']),
+ ('Ⲛ', &['ⲛ']),
+ ('ⲛ', &['Ⲛ']),
+ ('Ⲝ', &['ⲝ']),
+ ('ⲝ', &['Ⲝ']),
+ ('Ⲟ', &['ⲟ']),
+ ('ⲟ', &['Ⲟ']),
+ ('Ⲡ', &['ⲡ']),
+ ('ⲡ', &['Ⲡ']),
+ ('Ⲣ', &['ⲣ']),
+ ('ⲣ', &['Ⲣ']),
+ ('Ⲥ', &['ⲥ']),
+ ('ⲥ', &['Ⲥ']),
+ ('Ⲧ', &['ⲧ']),
+ ('ⲧ', &['Ⲧ']),
+ ('Ⲩ', &['ⲩ']),
+ ('ⲩ', &['Ⲩ']),
+ ('Ⲫ', &['ⲫ']),
+ ('ⲫ', &['Ⲫ']),
+ ('Ⲭ', &['ⲭ']),
+ ('ⲭ', &['Ⲭ']),
+ ('Ⲯ', &['ⲯ']),
+ ('ⲯ', &['Ⲯ']),
+ ('Ⲱ', &['ⲱ']),
+ ('ⲱ', &['Ⲱ']),
+ ('Ⲳ', &['ⲳ']),
+ ('ⲳ', &['Ⲳ']),
+ ('Ⲵ', &['ⲵ']),
+ ('ⲵ', &['Ⲵ']),
+ ('Ⲷ', &['ⲷ']),
+ ('ⲷ', &['Ⲷ']),
+ ('Ⲹ', &['ⲹ']),
+ ('ⲹ', &['Ⲹ']),
+ ('Ⲻ', &['ⲻ']),
+ ('ⲻ', &['Ⲻ']),
+ ('Ⲽ', &['ⲽ']),
+ ('ⲽ', &['Ⲽ']),
+ ('Ⲿ', &['ⲿ']),
+ ('ⲿ', &['Ⲿ']),
+ ('Ⳁ', &['ⳁ']),
+ ('ⳁ', &['Ⳁ']),
+ ('Ⳃ', &['ⳃ']),
+ ('ⳃ', &['Ⳃ']),
+ ('Ⳅ', &['ⳅ']),
+ ('ⳅ', &['Ⳅ']),
+ ('Ⳇ', &['ⳇ']),
+ ('ⳇ', &['Ⳇ']),
+ ('Ⳉ', &['ⳉ']),
+ ('ⳉ', &['Ⳉ']),
+ ('Ⳋ', &['ⳋ']),
+ ('ⳋ', &['Ⳋ']),
+ ('Ⳍ', &['ⳍ']),
+ ('ⳍ', &['Ⳍ']),
+ ('Ⳏ', &['ⳏ']),
+ ('ⳏ', &['Ⳏ']),
+ ('Ⳑ', &['ⳑ']),
+ ('ⳑ', &['Ⳑ']),
+ ('Ⳓ', &['ⳓ']),
+ ('ⳓ', &['Ⳓ']),
+ ('Ⳕ', &['ⳕ']),
+ ('ⳕ', &['Ⳕ']),
+ ('Ⳗ', &['ⳗ']),
+ ('ⳗ', &['Ⳗ']),
+ ('Ⳙ', &['ⳙ']),
+ ('ⳙ', &['Ⳙ']),
+ ('Ⳛ', &['ⳛ']),
+ ('ⳛ', &['Ⳛ']),
+ ('Ⳝ', &['ⳝ']),
+ ('ⳝ', &['Ⳝ']),
+ ('Ⳟ', &['ⳟ']),
+ ('ⳟ', &['Ⳟ']),
+ ('Ⳡ', &['ⳡ']),
+ ('ⳡ', &['Ⳡ']),
+ ('Ⳣ', &['ⳣ']),
+ ('ⳣ', &['Ⳣ']),
+ ('Ⳬ', &['ⳬ']),
+ ('ⳬ', &['Ⳬ']),
+ ('Ⳮ', &['ⳮ']),
+ ('ⳮ', &['Ⳮ']),
+ ('Ⳳ', &['ⳳ']),
+ ('ⳳ', &['Ⳳ']),
+ ('ⴀ', &['Ⴀ']),
+ ('ⴁ', &['Ⴁ']),
+ ('ⴂ', &['Ⴂ']),
+ ('ⴃ', &['Ⴃ']),
+ ('ⴄ', &['Ⴄ']),
+ ('ⴅ', &['Ⴅ']),
+ ('ⴆ', &['Ⴆ']),
+ ('ⴇ', &['Ⴇ']),
+ ('ⴈ', &['Ⴈ']),
+ ('ⴉ', &['Ⴉ']),
+ ('ⴊ', &['Ⴊ']),
+ ('ⴋ', &['Ⴋ']),
+ ('ⴌ', &['Ⴌ']),
+ ('ⴍ', &['Ⴍ']),
+ ('ⴎ', &['Ⴎ']),
+ ('ⴏ', &['Ⴏ']),
+ ('ⴐ', &['Ⴐ']),
+ ('ⴑ', &['Ⴑ']),
+ ('ⴒ', &['Ⴒ']),
+ ('ⴓ', &['Ⴓ']),
+ ('ⴔ', &['Ⴔ']),
+ ('ⴕ', &['Ⴕ']),
+ ('ⴖ', &['Ⴖ']),
+ ('ⴗ', &['Ⴗ']),
+ ('ⴘ', &['Ⴘ']),
+ ('ⴙ', &['Ⴙ']),
+ ('ⴚ', &['Ⴚ']),
+ ('ⴛ', &['Ⴛ']),
+ ('ⴜ', &['Ⴜ']),
+ ('ⴝ', &['Ⴝ']),
+ ('ⴞ', &['Ⴞ']),
+ ('ⴟ', &['Ⴟ']),
+ ('ⴠ', &['Ⴠ']),
+ ('ⴡ', &['Ⴡ']),
+ ('ⴢ', &['Ⴢ']),
+ ('ⴣ', &['Ⴣ']),
+ ('ⴤ', &['Ⴤ']),
+ ('ⴥ', &['Ⴥ']),
+ ('ⴧ', &['Ⴧ']),
+ ('ⴭ', &['Ⴭ']),
+ ('Ꙁ', &['ꙁ']),
+ ('ꙁ', &['Ꙁ']),
+ ('Ꙃ', &['ꙃ']),
+ ('ꙃ', &['Ꙃ']),
+ ('Ꙅ', &['ꙅ']),
+ ('ꙅ', &['Ꙅ']),
+ ('Ꙇ', &['ꙇ']),
+ ('ꙇ', &['Ꙇ']),
+ ('Ꙉ', &['ꙉ']),
+ ('ꙉ', &['Ꙉ']),
+ ('Ꙋ', &['ᲈ', 'ꙋ']),
+ ('ꙋ', &['ᲈ', 'Ꙋ']),
+ ('Ꙍ', &['ꙍ']),
+ ('ꙍ', &['Ꙍ']),
+ ('Ꙏ', &['ꙏ']),
+ ('ꙏ', &['Ꙏ']),
+ ('Ꙑ', &['ꙑ']),
+ ('ꙑ', &['Ꙑ']),
+ ('Ꙓ', &['ꙓ']),
+ ('ꙓ', &['Ꙓ']),
+ ('Ꙕ', &['ꙕ']),
+ ('ꙕ', &['Ꙕ']),
+ ('Ꙗ', &['ꙗ']),
+ ('ꙗ', &['Ꙗ']),
+ ('Ꙙ', &['ꙙ']),
+ ('ꙙ', &['Ꙙ']),
+ ('Ꙛ', &['ꙛ']),
+ ('ꙛ', &['Ꙛ']),
+ ('Ꙝ', &['ꙝ']),
+ ('ꙝ', &['Ꙝ']),
+ ('Ꙟ', &['ꙟ']),
+ ('ꙟ', &['Ꙟ']),
+ ('Ꙡ', &['ꙡ']),
+ ('ꙡ', &['Ꙡ']),
+ ('Ꙣ', &['ꙣ']),
+ ('ꙣ', &['Ꙣ']),
+ ('Ꙥ', &['ꙥ']),
+ ('ꙥ', &['Ꙥ']),
+ ('Ꙧ', &['ꙧ']),
+ ('ꙧ', &['Ꙧ']),
+ ('Ꙩ', &['ꙩ']),
+ ('ꙩ', &['Ꙩ']),
+ ('Ꙫ', &['ꙫ']),
+ ('ꙫ', &['Ꙫ']),
+ ('Ꙭ', &['ꙭ']),
+ ('ꙭ', &['Ꙭ']),
+ ('Ꚁ', &['ꚁ']),
+ ('ꚁ', &['Ꚁ']),
+ ('Ꚃ', &['ꚃ']),
+ ('ꚃ', &['Ꚃ']),
+ ('Ꚅ', &['ꚅ']),
+ ('ꚅ', &['Ꚅ']),
+ ('Ꚇ', &['ꚇ']),
+ ('ꚇ', &['Ꚇ']),
+ ('Ꚉ', &['ꚉ']),
+ ('ꚉ', &['Ꚉ']),
+ ('Ꚋ', &['ꚋ']),
+ ('ꚋ', &['Ꚋ']),
+ ('Ꚍ', &['ꚍ']),
+ ('ꚍ', &['Ꚍ']),
+ ('Ꚏ', &['ꚏ']),
+ ('ꚏ', &['Ꚏ']),
+ ('Ꚑ', &['ꚑ']),
+ ('ꚑ', &['Ꚑ']),
+ ('Ꚓ', &['ꚓ']),
+ ('ꚓ', &['Ꚓ']),
+ ('Ꚕ', &['ꚕ']),
+ ('ꚕ', &['Ꚕ']),
+ ('Ꚗ', &['ꚗ']),
+ ('ꚗ', &['Ꚗ']),
+ ('Ꚙ', &['ꚙ']),
+ ('ꚙ', &['Ꚙ']),
+ ('Ꚛ', &['ꚛ']),
+ ('ꚛ', &['Ꚛ']),
+ ('Ꜣ', &['ꜣ']),
+ ('ꜣ', &['Ꜣ']),
+ ('Ꜥ', &['ꜥ']),
+ ('ꜥ', &['Ꜥ']),
+ ('Ꜧ', &['ꜧ']),
+ ('ꜧ', &['Ꜧ']),
+ ('Ꜩ', &['ꜩ']),
+ ('ꜩ', &['Ꜩ']),
+ ('Ꜫ', &['ꜫ']),
+ ('ꜫ', &['Ꜫ']),
+ ('Ꜭ', &['ꜭ']),
+ ('ꜭ', &['Ꜭ']),
+ ('Ꜯ', &['ꜯ']),
+ ('ꜯ', &['Ꜯ']),
+ ('Ꜳ', &['ꜳ']),
+ ('ꜳ', &['Ꜳ']),
+ ('Ꜵ', &['ꜵ']),
+ ('ꜵ', &['Ꜵ']),
+ ('Ꜷ', &['ꜷ']),
+ ('ꜷ', &['Ꜷ']),
+ ('Ꜹ', &['ꜹ']),
+ ('ꜹ', &['Ꜹ']),
+ ('Ꜻ', &['ꜻ']),
+ ('ꜻ', &['Ꜻ']),
+ ('Ꜽ', &['ꜽ']),
+ ('ꜽ', &['Ꜽ']),
+ ('Ꜿ', &['ꜿ']),
+ ('ꜿ', &['Ꜿ']),
+ ('Ꝁ', &['ꝁ']),
+ ('ꝁ', &['Ꝁ']),
+ ('Ꝃ', &['ꝃ']),
+ ('ꝃ', &['Ꝃ']),
+ ('Ꝅ', &['ꝅ']),
+ ('ꝅ', &['Ꝅ']),
+ ('Ꝇ', &['ꝇ']),
+ ('ꝇ', &['Ꝇ']),
+ ('Ꝉ', &['ꝉ']),
+ ('ꝉ', &['Ꝉ']),
+ ('Ꝋ', &['ꝋ']),
+ ('ꝋ', &['Ꝋ']),
+ ('Ꝍ', &['ꝍ']),
+ ('ꝍ', &['Ꝍ']),
+ ('Ꝏ', &['ꝏ']),
+ ('ꝏ', &['Ꝏ']),
+ ('Ꝑ', &['ꝑ']),
+ ('ꝑ', &['Ꝑ']),
+ ('Ꝓ', &['ꝓ']),
+ ('ꝓ', &['Ꝓ']),
+ ('Ꝕ', &['ꝕ']),
+ ('ꝕ', &['Ꝕ']),
+ ('Ꝗ', &['ꝗ']),
+ ('ꝗ', &['Ꝗ']),
+ ('Ꝙ', &['ꝙ']),
+ ('ꝙ', &['Ꝙ']),
+ ('Ꝛ', &['ꝛ']),
+ ('ꝛ', &['Ꝛ']),
+ ('Ꝝ', &['ꝝ']),
+ ('ꝝ', &['Ꝝ']),
+ ('Ꝟ', &['ꝟ']),
+ ('ꝟ', &['Ꝟ']),
+ ('Ꝡ', &['ꝡ']),
+ ('ꝡ', &['Ꝡ']),
+ ('Ꝣ', &['ꝣ']),
+ ('ꝣ', &['Ꝣ']),
+ ('Ꝥ', &['ꝥ']),
+ ('ꝥ', &['Ꝥ']),
+ ('Ꝧ', &['ꝧ']),
+ ('ꝧ', &['Ꝧ']),
+ ('Ꝩ', &['ꝩ']),
+ ('ꝩ', &['Ꝩ']),
+ ('Ꝫ', &['ꝫ']),
+ ('ꝫ', &['Ꝫ']),
+ ('Ꝭ', &['ꝭ']),
+ ('ꝭ', &['Ꝭ']),
+ ('Ꝯ', &['ꝯ']),
+ ('ꝯ', &['Ꝯ']),
+ ('Ꝺ', &['ꝺ']),
+ ('ꝺ', &['Ꝺ']),
+ ('Ꝼ', &['ꝼ']),
+ ('ꝼ', &['Ꝼ']),
+ ('Ᵹ', &['ᵹ']),
+ ('Ꝿ', &['ꝿ']),
+ ('ꝿ', &['Ꝿ']),
+ ('Ꞁ', &['ꞁ']),
+ ('ꞁ', &['Ꞁ']),
+ ('Ꞃ', &['ꞃ']),
+ ('ꞃ', &['Ꞃ']),
+ ('Ꞅ', &['ꞅ']),
+ ('ꞅ', &['Ꞅ']),
+ ('Ꞇ', &['ꞇ']),
+ ('ꞇ', &['Ꞇ']),
+ ('Ꞌ', &['ꞌ']),
+ ('ꞌ', &['Ꞌ']),
+ ('Ɥ', &['ɥ']),
+ ('Ꞑ', &['ꞑ']),
+ ('ꞑ', &['Ꞑ']),
+ ('Ꞓ', &['ꞓ']),
+ ('ꞓ', &['Ꞓ']),
+ ('ꞔ', &['Ꞔ']),
+ ('Ꞗ', &['ꞗ']),
+ ('ꞗ', &['Ꞗ']),
+ ('Ꞙ', &['ꞙ']),
+ ('ꞙ', &['Ꞙ']),
+ ('Ꞛ', &['ꞛ']),
+ ('ꞛ', &['Ꞛ']),
+ ('Ꞝ', &['ꞝ']),
+ ('ꞝ', &['Ꞝ']),
+ ('Ꞟ', &['ꞟ']),
+ ('ꞟ', &['Ꞟ']),
+ ('Ꞡ', &['ꞡ']),
+ ('ꞡ', &['Ꞡ']),
+ ('Ꞣ', &['ꞣ']),
+ ('ꞣ', &['Ꞣ']),
+ ('Ꞥ', &['ꞥ']),
+ ('ꞥ', &['Ꞥ']),
+ ('Ꞧ', &['ꞧ']),
+ ('ꞧ', &['Ꞧ']),
+ ('Ꞩ', &['ꞩ']),
+ ('ꞩ', &['Ꞩ']),
+ ('Ɦ', &['ɦ']),
+ ('Ɜ', &['ɜ']),
+ ('Ɡ', &['ɡ']),
+ ('Ɬ', &['ɬ']),
+ ('Ɪ', &['ɪ']),
+ ('Ʞ', &['ʞ']),
+ ('Ʇ', &['ʇ']),
+ ('Ʝ', &['ʝ']),
+ ('Ꭓ', &['ꭓ']),
+ ('Ꞵ', &['ꞵ']),
+ ('ꞵ', &['Ꞵ']),
+ ('Ꞷ', &['ꞷ']),
+ ('ꞷ', &['Ꞷ']),
+ ('Ꞹ', &['ꞹ']),
+ ('ꞹ', &['Ꞹ']),
+ ('Ꞻ', &['ꞻ']),
+ ('ꞻ', &['Ꞻ']),
+ ('Ꞽ', &['ꞽ']),
+ ('ꞽ', &['Ꞽ']),
+ ('Ꞿ', &['ꞿ']),
+ ('ꞿ', &['Ꞿ']),
+ ('Ꟁ', &['ꟁ']),
+ ('ꟁ', &['Ꟁ']),
+ ('Ꟃ', &['ꟃ']),
+ ('ꟃ', &['Ꟃ']),
+ ('Ꞔ', &['ꞔ']),
+ ('Ʂ', &['ʂ']),
+ ('Ᶎ', &['ᶎ']),
+ ('Ꟈ', &['ꟈ']),
+ ('ꟈ', &['Ꟈ']),
+ ('Ꟊ', &['ꟊ']),
+ ('ꟊ', &['Ꟊ']),
+ ('Ꟑ', &['ꟑ']),
+ ('ꟑ', &['Ꟑ']),
+ ('Ꟗ', &['ꟗ']),
+ ('ꟗ', &['Ꟗ']),
+ ('Ꟙ', &['ꟙ']),
+ ('ꟙ', &['Ꟙ']),
+ ('Ꟶ', &['ꟶ']),
+ ('ꟶ', &['Ꟶ']),
+ ('ꭓ', &['Ꭓ']),
+ ('ꭰ', &['Ꭰ']),
+ ('ꭱ', &['Ꭱ']),
+ ('ꭲ', &['Ꭲ']),
+ ('ꭳ', &['Ꭳ']),
+ ('ꭴ', &['Ꭴ']),
+ ('ꭵ', &['Ꭵ']),
+ ('ꭶ', &['Ꭶ']),
+ ('ꭷ', &['Ꭷ']),
+ ('ꭸ', &['Ꭸ']),
+ ('ꭹ', &['Ꭹ']),
+ ('ꭺ', &['Ꭺ']),
+ ('ꭻ', &['Ꭻ']),
+ ('ꭼ', &['Ꭼ']),
+ ('ꭽ', &['Ꭽ']),
+ ('ꭾ', &['Ꭾ']),
+ ('ꭿ', &['Ꭿ']),
+ ('ꮀ', &['Ꮀ']),
+ ('ꮁ', &['Ꮁ']),
+ ('ꮂ', &['Ꮂ']),
+ ('ꮃ', &['Ꮃ']),
+ ('ꮄ', &['Ꮄ']),
+ ('ꮅ', &['Ꮅ']),
+ ('ꮆ', &['Ꮆ']),
+ ('ꮇ', &['Ꮇ']),
+ ('ꮈ', &['Ꮈ']),
+ ('ꮉ', &['Ꮉ']),
+ ('ꮊ', &['Ꮊ']),
+ ('ꮋ', &['Ꮋ']),
+ ('ꮌ', &['Ꮌ']),
+ ('ꮍ', &['Ꮍ']),
+ ('ꮎ', &['Ꮎ']),
+ ('ꮏ', &['Ꮏ']),
+ ('ꮐ', &['Ꮐ']),
+ ('ꮑ', &['Ꮑ']),
+ ('ꮒ', &['Ꮒ']),
+ ('ꮓ', &['Ꮓ']),
+ ('ꮔ', &['Ꮔ']),
+ ('ꮕ', &['Ꮕ']),
+ ('ꮖ', &['Ꮖ']),
+ ('ꮗ', &['Ꮗ']),
+ ('ꮘ', &['Ꮘ']),
+ ('ꮙ', &['Ꮙ']),
+ ('ꮚ', &['Ꮚ']),
+ ('ꮛ', &['Ꮛ']),
+ ('ꮜ', &['Ꮜ']),
+ ('ꮝ', &['Ꮝ']),
+ ('ꮞ', &['Ꮞ']),
+ ('ꮟ', &['Ꮟ']),
+ ('ꮠ', &['Ꮠ']),
+ ('ꮡ', &['Ꮡ']),
+ ('ꮢ', &['Ꮢ']),
+ ('ꮣ', &['Ꮣ']),
+ ('ꮤ', &['Ꮤ']),
+ ('ꮥ', &['Ꮥ']),
+ ('ꮦ', &['Ꮦ']),
+ ('ꮧ', &['Ꮧ']),
+ ('ꮨ', &['Ꮨ']),
+ ('ꮩ', &['Ꮩ']),
+ ('ꮪ', &['Ꮪ']),
+ ('ꮫ', &['Ꮫ']),
+ ('ꮬ', &['Ꮬ']),
+ ('ꮭ', &['Ꮭ']),
+ ('ꮮ', &['Ꮮ']),
+ ('ꮯ', &['Ꮯ']),
+ ('ꮰ', &['Ꮰ']),
+ ('ꮱ', &['Ꮱ']),
+ ('ꮲ', &['Ꮲ']),
+ ('ꮳ', &['Ꮳ']),
+ ('ꮴ', &['Ꮴ']),
+ ('ꮵ', &['Ꮵ']),
+ ('ꮶ', &['Ꮶ']),
+ ('ꮷ', &['Ꮷ']),
+ ('ꮸ', &['Ꮸ']),
+ ('ꮹ', &['Ꮹ']),
+ ('ꮺ', &['Ꮺ']),
+ ('ꮻ', &['Ꮻ']),
+ ('ꮼ', &['Ꮼ']),
+ ('ꮽ', &['Ꮽ']),
+ ('ꮾ', &['Ꮾ']),
+ ('ꮿ', &['Ꮿ']),
+ ('A', &['a']),
+ ('B', &['b']),
+ ('C', &['c']),
+ ('D', &['d']),
+ ('E', &['e']),
+ ('F', &['f']),
+ ('G', &['g']),
+ ('H', &['h']),
+ ('I', &['i']),
+ ('J', &['j']),
+ ('K', &['k']),
+ ('L', &['l']),
+ ('M', &['m']),
+ ('N', &['n']),
+ ('O', &['o']),
+ ('P', &['p']),
+ ('Q', &['q']),
+ ('R', &['r']),
+ ('S', &['s']),
+ ('T', &['t']),
+ ('U', &['u']),
+ ('V', &['v']),
+ ('W', &['w']),
+ ('X', &['x']),
+ ('Y', &['y']),
+ ('Z', &['z']),
+ ('a', &['A']),
+ ('b', &['B']),
+ ('c', &['C']),
+ ('d', &['D']),
+ ('e', &['E']),
+ ('f', &['F']),
+ ('g', &['G']),
+ ('h', &['H']),
+ ('i', &['I']),
+ ('j', &['J']),
+ ('k', &['K']),
+ ('l', &['L']),
+ ('m', &['M']),
+ ('n', &['N']),
+ ('o', &['O']),
+ ('p', &['P']),
+ ('q', &['Q']),
+ ('r', &['R']),
+ ('s', &['S']),
+ ('t', &['T']),
+ ('u', &['U']),
+ ('v', &['V']),
+ ('w', &['W']),
+ ('x', &['X']),
+ ('y', &['Y']),
+ ('z', &['Z']),
+ ('𐐀', &['𐐨']),
+ ('𐐁', &['𐐩']),
+ ('𐐂', &['𐐪']),
+ ('𐐃', &['𐐫']),
+ ('𐐄', &['𐐬']),
+ ('𐐅', &['𐐭']),
+ ('𐐆', &['𐐮']),
+ ('𐐇', &['𐐯']),
+ ('𐐈', &['𐐰']),
+ ('𐐉', &['𐐱']),
+ ('𐐊', &['𐐲']),
+ ('𐐋', &['𐐳']),
+ ('𐐌', &['𐐴']),
+ ('𐐍', &['𐐵']),
+ ('𐐎', &['𐐶']),
+ ('𐐏', &['𐐷']),
+ ('𐐐', &['𐐸']),
+ ('𐐑', &['𐐹']),
+ ('𐐒', &['𐐺']),
+ ('𐐓', &['𐐻']),
+ ('𐐔', &['𐐼']),
+ ('𐐕', &['𐐽']),
+ ('𐐖', &['𐐾']),
+ ('𐐗', &['𐐿']),
+ ('𐐘', &['𐑀']),
+ ('𐐙', &['𐑁']),
+ ('𐐚', &['𐑂']),
+ ('𐐛', &['𐑃']),
+ ('𐐜', &['𐑄']),
+ ('𐐝', &['𐑅']),
+ ('𐐞', &['𐑆']),
+ ('𐐟', &['𐑇']),
+ ('𐐠', &['𐑈']),
+ ('𐐡', &['𐑉']),
+ ('𐐢', &['𐑊']),
+ ('𐐣', &['𐑋']),
+ ('𐐤', &['𐑌']),
+ ('𐐥', &['𐑍']),
+ ('𐐦', &['𐑎']),
+ ('𐐧', &['𐑏']),
+ ('𐐨', &['𐐀']),
+ ('𐐩', &['𐐁']),
+ ('𐐪', &['𐐂']),
+ ('𐐫', &['𐐃']),
+ ('𐐬', &['𐐄']),
+ ('𐐭', &['𐐅']),
+ ('𐐮', &['𐐆']),
+ ('𐐯', &['𐐇']),
+ ('𐐰', &['𐐈']),
+ ('𐐱', &['𐐉']),
+ ('𐐲', &['𐐊']),
+ ('𐐳', &['𐐋']),
+ ('𐐴', &['𐐌']),
+ ('𐐵', &['𐐍']),
+ ('𐐶', &['𐐎']),
+ ('𐐷', &['𐐏']),
+ ('𐐸', &['𐐐']),
+ ('𐐹', &['𐐑']),
+ ('𐐺', &['𐐒']),
+ ('𐐻', &['𐐓']),
+ ('𐐼', &['𐐔']),
+ ('𐐽', &['𐐕']),
+ ('𐐾', &['𐐖']),
+ ('𐐿', &['𐐗']),
+ ('𐑀', &['𐐘']),
+ ('𐑁', &['𐐙']),
+ ('𐑂', &['𐐚']),
+ ('𐑃', &['𐐛']),
+ ('𐑄', &['𐐜']),
+ ('𐑅', &['𐐝']),
+ ('𐑆', &['𐐞']),
+ ('𐑇', &['𐐟']),
+ ('𐑈', &['𐐠']),
+ ('𐑉', &['𐐡']),
+ ('𐑊', &['𐐢']),
+ ('𐑋', &['𐐣']),
+ ('𐑌', &['𐐤']),
+ ('𐑍', &['𐐥']),
+ ('𐑎', &['𐐦']),
+ ('𐑏', &['𐐧']),
+ ('𐒰', &['𐓘']),
+ ('𐒱', &['𐓙']),
+ ('𐒲', &['𐓚']),
+ ('𐒳', &['𐓛']),
+ ('𐒴', &['𐓜']),
+ ('𐒵', &['𐓝']),
+ ('𐒶', &['𐓞']),
+ ('𐒷', &['𐓟']),
+ ('𐒸', &['𐓠']),
+ ('𐒹', &['𐓡']),
+ ('𐒺', &['𐓢']),
+ ('𐒻', &['𐓣']),
+ ('𐒼', &['𐓤']),
+ ('𐒽', &['𐓥']),
+ ('𐒾', &['𐓦']),
+ ('𐒿', &['𐓧']),
+ ('𐓀', &['𐓨']),
+ ('𐓁', &['𐓩']),
+ ('𐓂', &['𐓪']),
+ ('𐓃', &['𐓫']),
+ ('𐓄', &['𐓬']),
+ ('𐓅', &['𐓭']),
+ ('𐓆', &['𐓮']),
+ ('𐓇', &['𐓯']),
+ ('𐓈', &['𐓰']),
+ ('𐓉', &['𐓱']),
+ ('𐓊', &['𐓲']),
+ ('𐓋', &['𐓳']),
+ ('𐓌', &['𐓴']),
+ ('𐓍', &['𐓵']),
+ ('𐓎', &['𐓶']),
+ ('𐓏', &['𐓷']),
+ ('𐓐', &['𐓸']),
+ ('𐓑', &['𐓹']),
+ ('𐓒', &['𐓺']),
+ ('𐓓', &['𐓻']),
+ ('𐓘', &['𐒰']),
+ ('𐓙', &['𐒱']),
+ ('𐓚', &['𐒲']),
+ ('𐓛', &['𐒳']),
+ ('𐓜', &['𐒴']),
+ ('𐓝', &['𐒵']),
+ ('𐓞', &['𐒶']),
+ ('𐓟', &['𐒷']),
+ ('𐓠', &['𐒸']),
+ ('𐓡', &['𐒹']),
+ ('𐓢', &['𐒺']),
+ ('𐓣', &['𐒻']),
+ ('𐓤', &['𐒼']),
+ ('𐓥', &['𐒽']),
+ ('𐓦', &['𐒾']),
+ ('𐓧', &['𐒿']),
+ ('𐓨', &['𐓀']),
+ ('𐓩', &['𐓁']),
+ ('𐓪', &['𐓂']),
+ ('𐓫', &['𐓃']),
+ ('𐓬', &['𐓄']),
+ ('𐓭', &['𐓅']),
+ ('𐓮', &['𐓆']),
+ ('𐓯', &['𐓇']),
+ ('𐓰', &['𐓈']),
+ ('𐓱', &['𐓉']),
+ ('𐓲', &['𐓊']),
+ ('𐓳', &['𐓋']),
+ ('𐓴', &['𐓌']),
+ ('𐓵', &['𐓍']),
+ ('𐓶', &['𐓎']),
+ ('𐓷', &['𐓏']),
+ ('𐓸', &['𐓐']),
+ ('𐓹', &['𐓑']),
+ ('𐓺', &['𐓒']),
+ ('𐓻', &['𐓓']),
+ ('𐕰', &['𐖗']),
+ ('𐕱', &['𐖘']),
+ ('𐕲', &['𐖙']),
+ ('𐕳', &['𐖚']),
+ ('𐕴', &['𐖛']),
+ ('𐕵', &['𐖜']),
+ ('𐕶', &['𐖝']),
+ ('𐕷', &['𐖞']),
+ ('𐕸', &['𐖟']),
+ ('𐕹', &['𐖠']),
+ ('𐕺', &['𐖡']),
+ ('𐕼', &['𐖣']),
+ ('𐕽', &['𐖤']),
+ ('𐕾', &['𐖥']),
+ ('𐕿', &['𐖦']),
+ ('𐖀', &['𐖧']),
+ ('𐖁', &['𐖨']),
+ ('𐖂', &['𐖩']),
+ ('𐖃', &['𐖪']),
+ ('𐖄', &['𐖫']),
+ ('𐖅', &['𐖬']),
+ ('𐖆', &['𐖭']),
+ ('𐖇', &['𐖮']),
+ ('𐖈', &['𐖯']),
+ ('𐖉', &['𐖰']),
+ ('𐖊', &['𐖱']),
+ ('𐖌', &['𐖳']),
+ ('𐖍', &['𐖴']),
+ ('𐖎', &['𐖵']),
+ ('𐖏', &['𐖶']),
+ ('𐖐', &['𐖷']),
+ ('𐖑', &['𐖸']),
+ ('𐖒', &['𐖹']),
+ ('𐖔', &['𐖻']),
+ ('𐖕', &['𐖼']),
+ ('𐖗', &['𐕰']),
+ ('𐖘', &['𐕱']),
+ ('𐖙', &['𐕲']),
+ ('𐖚', &['𐕳']),
+ ('𐖛', &['𐕴']),
+ ('𐖜', &['𐕵']),
+ ('𐖝', &['𐕶']),
+ ('𐖞', &['𐕷']),
+ ('𐖟', &['𐕸']),
+ ('𐖠', &['𐕹']),
+ ('𐖡', &['𐕺']),
+ ('𐖣', &['𐕼']),
+ ('𐖤', &['𐕽']),
+ ('𐖥', &['𐕾']),
+ ('𐖦', &['𐕿']),
+ ('𐖧', &['𐖀']),
+ ('𐖨', &['𐖁']),
+ ('𐖩', &['𐖂']),
+ ('𐖪', &['𐖃']),
+ ('𐖫', &['𐖄']),
+ ('𐖬', &['𐖅']),
+ ('𐖭', &['𐖆']),
+ ('𐖮', &['𐖇']),
+ ('𐖯', &['𐖈']),
+ ('𐖰', &['𐖉']),
+ ('𐖱', &['𐖊']),
+ ('𐖳', &['𐖌']),
+ ('𐖴', &['𐖍']),
+ ('𐖵', &['𐖎']),
+ ('𐖶', &['𐖏']),
+ ('𐖷', &['𐖐']),
+ ('𐖸', &['𐖑']),
+ ('𐖹', &['𐖒']),
+ ('𐖻', &['𐖔']),
+ ('𐖼', &['𐖕']),
+ ('𐲀', &['𐳀']),
+ ('𐲁', &['𐳁']),
+ ('𐲂', &['𐳂']),
+ ('𐲃', &['𐳃']),
+ ('𐲄', &['𐳄']),
+ ('𐲅', &['𐳅']),
+ ('𐲆', &['𐳆']),
+ ('𐲇', &['𐳇']),
+ ('𐲈', &['𐳈']),
+ ('𐲉', &['𐳉']),
+ ('𐲊', &['𐳊']),
+ ('𐲋', &['𐳋']),
+ ('𐲌', &['𐳌']),
+ ('𐲍', &['𐳍']),
+ ('𐲎', &['𐳎']),
+ ('𐲏', &['𐳏']),
+ ('𐲐', &['𐳐']),
+ ('𐲑', &['𐳑']),
+ ('𐲒', &['𐳒']),
+ ('𐲓', &['𐳓']),
+ ('𐲔', &['𐳔']),
+ ('𐲕', &['𐳕']),
+ ('𐲖', &['𐳖']),
+ ('𐲗', &['𐳗']),
+ ('𐲘', &['𐳘']),
+ ('𐲙', &['𐳙']),
+ ('𐲚', &['𐳚']),
+ ('𐲛', &['𐳛']),
+ ('𐲜', &['𐳜']),
+ ('𐲝', &['𐳝']),
+ ('𐲞', &['𐳞']),
+ ('𐲟', &['𐳟']),
+ ('𐲠', &['𐳠']),
+ ('𐲡', &['𐳡']),
+ ('𐲢', &['𐳢']),
+ ('𐲣', &['𐳣']),
+ ('𐲤', &['𐳤']),
+ ('𐲥', &['𐳥']),
+ ('𐲦', &['𐳦']),
+ ('𐲧', &['𐳧']),
+ ('𐲨', &['𐳨']),
+ ('𐲩', &['𐳩']),
+ ('𐲪', &['𐳪']),
+ ('𐲫', &['𐳫']),
+ ('𐲬', &['𐳬']),
+ ('𐲭', &['𐳭']),
+ ('𐲮', &['𐳮']),
+ ('𐲯', &['𐳯']),
+ ('𐲰', &['𐳰']),
+ ('𐲱', &['𐳱']),
+ ('𐲲', &['𐳲']),
+ ('𐳀', &['𐲀']),
+ ('𐳁', &['𐲁']),
+ ('𐳂', &['𐲂']),
+ ('𐳃', &['𐲃']),
+ ('𐳄', &['𐲄']),
+ ('𐳅', &['𐲅']),
+ ('𐳆', &['𐲆']),
+ ('𐳇', &['𐲇']),
+ ('𐳈', &['𐲈']),
+ ('𐳉', &['𐲉']),
+ ('𐳊', &['𐲊']),
+ ('𐳋', &['𐲋']),
+ ('𐳌', &['𐲌']),
+ ('𐳍', &['𐲍']),
+ ('𐳎', &['𐲎']),
+ ('𐳏', &['𐲏']),
+ ('𐳐', &['𐲐']),
+ ('𐳑', &['𐲑']),
+ ('𐳒', &['𐲒']),
+ ('𐳓', &['𐲓']),
+ ('𐳔', &['𐲔']),
+ ('𐳕', &['𐲕']),
+ ('𐳖', &['𐲖']),
+ ('𐳗', &['𐲗']),
+ ('𐳘', &['𐲘']),
+ ('𐳙', &['𐲙']),
+ ('𐳚', &['𐲚']),
+ ('𐳛', &['𐲛']),
+ ('𐳜', &['𐲜']),
+ ('𐳝', &['𐲝']),
+ ('𐳞', &['𐲞']),
+ ('𐳟', &['𐲟']),
+ ('𐳠', &['𐲠']),
+ ('𐳡', &['𐲡']),
+ ('𐳢', &['𐲢']),
+ ('𐳣', &['𐲣']),
+ ('𐳤', &['𐲤']),
+ ('𐳥', &['𐲥']),
+ ('𐳦', &['𐲦']),
+ ('𐳧', &['𐲧']),
+ ('𐳨', &['𐲨']),
+ ('𐳩', &['𐲩']),
+ ('𐳪', &['𐲪']),
+ ('𐳫', &['𐲫']),
+ ('𐳬', &['𐲬']),
+ ('𐳭', &['𐲭']),
+ ('𐳮', &['𐲮']),
+ ('𐳯', &['𐲯']),
+ ('𐳰', &['𐲰']),
+ ('𐳱', &['𐲱']),
+ ('𐳲', &['𐲲']),
+ ('𑢠', &['𑣀']),
+ ('𑢡', &['𑣁']),
+ ('𑢢', &['𑣂']),
+ ('𑢣', &['𑣃']),
+ ('𑢤', &['𑣄']),
+ ('𑢥', &['𑣅']),
+ ('𑢦', &['𑣆']),
+ ('𑢧', &['𑣇']),
+ ('𑢨', &['𑣈']),
+ ('𑢩', &['𑣉']),
+ ('𑢪', &['𑣊']),
+ ('𑢫', &['𑣋']),
+ ('𑢬', &['𑣌']),
+ ('𑢭', &['𑣍']),
+ ('𑢮', &['𑣎']),
+ ('𑢯', &['𑣏']),
+ ('𑢰', &['𑣐']),
+ ('𑢱', &['𑣑']),
+ ('𑢲', &['𑣒']),
+ ('𑢳', &['𑣓']),
+ ('𑢴', &['𑣔']),
+ ('𑢵', &['𑣕']),
+ ('𑢶', &['𑣖']),
+ ('𑢷', &['𑣗']),
+ ('𑢸', &['𑣘']),
+ ('𑢹', &['𑣙']),
+ ('𑢺', &['𑣚']),
+ ('𑢻', &['𑣛']),
+ ('𑢼', &['𑣜']),
+ ('𑢽', &['𑣝']),
+ ('𑢾', &['𑣞']),
+ ('𑢿', &['𑣟']),
+ ('𑣀', &['𑢠']),
+ ('𑣁', &['𑢡']),
+ ('𑣂', &['𑢢']),
+ ('𑣃', &['𑢣']),
+ ('𑣄', &['𑢤']),
+ ('𑣅', &['𑢥']),
+ ('𑣆', &['𑢦']),
+ ('𑣇', &['𑢧']),
+ ('𑣈', &['𑢨']),
+ ('𑣉', &['𑢩']),
+ ('𑣊', &['𑢪']),
+ ('𑣋', &['𑢫']),
+ ('𑣌', &['𑢬']),
+ ('𑣍', &['𑢭']),
+ ('𑣎', &['𑢮']),
+ ('𑣏', &['𑢯']),
+ ('𑣐', &['𑢰']),
+ ('𑣑', &['𑢱']),
+ ('𑣒', &['𑢲']),
+ ('𑣓', &['𑢳']),
+ ('𑣔', &['𑢴']),
+ ('𑣕', &['𑢵']),
+ ('𑣖', &['𑢶']),
+ ('𑣗', &['𑢷']),
+ ('𑣘', &['𑢸']),
+ ('𑣙', &['𑢹']),
+ ('𑣚', &['𑢺']),
+ ('𑣛', &['𑢻']),
+ ('𑣜', &['𑢼']),
+ ('𑣝', &['𑢽']),
+ ('𑣞', &['𑢾']),
+ ('𑣟', &['𑢿']),
+ ('𖹀', &['𖹠']),
+ ('𖹁', &['𖹡']),
+ ('𖹂', &['𖹢']),
+ ('𖹃', &['𖹣']),
+ ('𖹄', &['𖹤']),
+ ('𖹅', &['𖹥']),
+ ('𖹆', &['𖹦']),
+ ('𖹇', &['𖹧']),
+ ('𖹈', &['𖹨']),
+ ('𖹉', &['𖹩']),
+ ('𖹊', &['𖹪']),
+ ('𖹋', &['𖹫']),
+ ('𖹌', &['𖹬']),
+ ('𖹍', &['𖹭']),
+ ('𖹎', &['𖹮']),
+ ('𖹏', &['𖹯']),
+ ('𖹐', &['𖹰']),
+ ('𖹑', &['𖹱']),
+ ('𖹒', &['𖹲']),
+ ('𖹓', &['𖹳']),
+ ('𖹔', &['𖹴']),
+ ('𖹕', &['𖹵']),
+ ('𖹖', &['𖹶']),
+ ('𖹗', &['𖹷']),
+ ('𖹘', &['𖹸']),
+ ('𖹙', &['𖹹']),
+ ('𖹚', &['𖹺']),
+ ('𖹛', &['𖹻']),
+ ('𖹜', &['𖹼']),
+ ('𖹝', &['𖹽']),
+ ('𖹞', &['𖹾']),
+ ('𖹟', &['𖹿']),
+ ('𖹠', &['𖹀']),
+ ('𖹡', &['𖹁']),
+ ('𖹢', &['𖹂']),
+ ('𖹣', &['𖹃']),
+ ('𖹤', &['𖹄']),
+ ('𖹥', &['𖹅']),
+ ('𖹦', &['𖹆']),
+ ('𖹧', &['𖹇']),
+ ('𖹨', &['𖹈']),
+ ('𖹩', &['𖹉']),
+ ('𖹪', &['𖹊']),
+ ('𖹫', &['𖹋']),
+ ('𖹬', &['𖹌']),
+ ('𖹭', &['𖹍']),
+ ('𖹮', &['𖹎']),
+ ('𖹯', &['𖹏']),
+ ('𖹰', &['𖹐']),
+ ('𖹱', &['𖹑']),
+ ('𖹲', &['𖹒']),
+ ('𖹳', &['𖹓']),
+ ('𖹴', &['𖹔']),
+ ('𖹵', &['𖹕']),
+ ('𖹶', &['𖹖']),
+ ('𖹷', &['𖹗']),
+ ('𖹸', &['𖹘']),
+ ('𖹹', &['𖹙']),
+ ('𖹺', &['𖹚']),
+ ('𖹻', &['𖹛']),
+ ('𖹼', &['𖹜']),
+ ('𖹽', &['𖹝']),
+ ('𖹾', &['𖹞']),
+ ('𖹿', &['𖹟']),
+ ('𞤀', &['𞤢']),
+ ('𞤁', &['𞤣']),
+ ('𞤂', &['𞤤']),
+ ('𞤃', &['𞤥']),
+ ('𞤄', &['𞤦']),
+ ('𞤅', &['𞤧']),
+ ('𞤆', &['𞤨']),
+ ('𞤇', &['𞤩']),
+ ('𞤈', &['𞤪']),
+ ('𞤉', &['𞤫']),
+ ('𞤊', &['𞤬']),
+ ('𞤋', &['𞤭']),
+ ('𞤌', &['𞤮']),
+ ('𞤍', &['𞤯']),
+ ('𞤎', &['𞤰']),
+ ('𞤏', &['𞤱']),
+ ('𞤐', &['𞤲']),
+ ('𞤑', &['𞤳']),
+ ('𞤒', &['𞤴']),
+ ('𞤓', &['𞤵']),
+ ('𞤔', &['𞤶']),
+ ('𞤕', &['𞤷']),
+ ('𞤖', &['𞤸']),
+ ('𞤗', &['𞤹']),
+ ('𞤘', &['𞤺']),
+ ('𞤙', &['𞤻']),
+ ('𞤚', &['𞤼']),
+ ('𞤛', &['𞤽']),
+ ('𞤜', &['𞤾']),
+ ('𞤝', &['𞤿']),
+ ('𞤞', &['𞥀']),
+ ('𞤟', &['𞥁']),
+ ('𞤠', &['𞥂']),
+ ('𞤡', &['𞥃']),
+ ('𞤢', &['𞤀']),
+ ('𞤣', &['𞤁']),
+ ('𞤤', &['𞤂']),
+ ('𞤥', &['𞤃']),
+ ('𞤦', &['𞤄']),
+ ('𞤧', &['𞤅']),
+ ('𞤨', &['𞤆']),
+ ('𞤩', &['𞤇']),
+ ('𞤪', &['𞤈']),
+ ('𞤫', &['𞤉']),
+ ('𞤬', &['𞤊']),
+ ('𞤭', &['𞤋']),
+ ('𞤮', &['𞤌']),
+ ('𞤯', &['𞤍']),
+ ('𞤰', &['𞤎']),
+ ('𞤱', &['𞤏']),
+ ('𞤲', &['𞤐']),
+ ('𞤳', &['𞤑']),
+ ('𞤴', &['𞤒']),
+ ('𞤵', &['𞤓']),
+ ('𞤶', &['𞤔']),
+ ('𞤷', &['𞤕']),
+ ('𞤸', &['𞤖']),
+ ('𞤹', &['𞤗']),
+ ('𞤺', &['𞤘']),
+ ('𞤻', &['𞤙']),
+ ('𞤼', &['𞤚']),
+ ('𞤽', &['𞤛']),
+ ('𞤾', &['𞤜']),
+ ('𞤿', &['𞤝']),
+ ('𞥀', &['𞤞']),
+ ('𞥁', &['𞤟']),
+ ('𞥂', &['𞤠']),
+ ('𞥃', &['𞤡']),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/general_category.rs b/third_party/rust/regex-syntax/src/unicode_tables/general_category.rs
new file mode 100644
index 0000000000..8fc9289127
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/general_category.rs
@@ -0,0 +1,6552 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate general-category ucd-15.0.0 --chars --exclude surrogate
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("Cased_Letter", CASED_LETTER),
+ ("Close_Punctuation", CLOSE_PUNCTUATION),
+ ("Connector_Punctuation", CONNECTOR_PUNCTUATION),
+ ("Control", CONTROL),
+ ("Currency_Symbol", CURRENCY_SYMBOL),
+ ("Dash_Punctuation", DASH_PUNCTUATION),
+ ("Decimal_Number", DECIMAL_NUMBER),
+ ("Enclosing_Mark", ENCLOSING_MARK),
+ ("Final_Punctuation", FINAL_PUNCTUATION),
+ ("Format", FORMAT),
+ ("Initial_Punctuation", INITIAL_PUNCTUATION),
+ ("Letter", LETTER),
+ ("Letter_Number", LETTER_NUMBER),
+ ("Line_Separator", LINE_SEPARATOR),
+ ("Lowercase_Letter", LOWERCASE_LETTER),
+ ("Mark", MARK),
+ ("Math_Symbol", MATH_SYMBOL),
+ ("Modifier_Letter", MODIFIER_LETTER),
+ ("Modifier_Symbol", MODIFIER_SYMBOL),
+ ("Nonspacing_Mark", NONSPACING_MARK),
+ ("Number", NUMBER),
+ ("Open_Punctuation", OPEN_PUNCTUATION),
+ ("Other", OTHER),
+ ("Other_Letter", OTHER_LETTER),
+ ("Other_Number", OTHER_NUMBER),
+ ("Other_Punctuation", OTHER_PUNCTUATION),
+ ("Other_Symbol", OTHER_SYMBOL),
+ ("Paragraph_Separator", PARAGRAPH_SEPARATOR),
+ ("Private_Use", PRIVATE_USE),
+ ("Punctuation", PUNCTUATION),
+ ("Separator", SEPARATOR),
+ ("Space_Separator", SPACE_SEPARATOR),
+ ("Spacing_Mark", SPACING_MARK),
+ ("Symbol", SYMBOL),
+ ("Titlecase_Letter", TITLECASE_LETTER),
+ ("Unassigned", UNASSIGNED),
+ ("Uppercase_Letter", UPPERCASE_LETTER),
+];
+
+pub const CASED_LETTER: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('µ', 'µ'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ƺ'),
+ ('Ƽ', 'ƿ'),
+ ('DŽ', 'ʓ'),
+ ('ʕ', 'ʯ'),
+ ('Ͱ', 'ͳ'),
+ ('Ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՠ', 'ֈ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჽ', 'ჿ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᴀ', 'ᴫ'),
+ ('ᵫ', 'ᵷ'),
+ ('ᵹ', 'ᶚ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℴ'),
+ ('ℹ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ↄ', 'ↄ'),
+ ('Ⰰ', 'ⱻ'),
+ ('Ȿ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('Ꙁ', 'ꙭ'),
+ ('Ꚁ', 'ꚛ'),
+ ('Ꜣ', 'ꝯ'),
+ ('ꝱ', 'ꞇ'),
+ ('Ꞌ', 'ꞎ'),
+ ('Ꞑ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('Ꟶ', 'ꟶ'),
+ ('ꟺ', 'ꟺ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭠ', 'ꭨ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('𐐀', '𐑏'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𑢠', '𑣟'),
+ ('𖹀', '𖹿'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼉'),
+ ('𝼋', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞤀', '𞥃'),
+];
+
+pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[
+ (')', ')'),
+ (']', ']'),
+ ('}', '}'),
+ ('༻', '༻'),
+ ('༽', '༽'),
+ ('᚜', '᚜'),
+ ('⁆', '⁆'),
+ ('⁾', '⁾'),
+ ('₎', '₎'),
+ ('⌉', '⌉'),
+ ('⌋', '⌋'),
+ ('〉', '〉'),
+ ('❩', '❩'),
+ ('❫', '❫'),
+ ('❭', '❭'),
+ ('❯', '❯'),
+ ('❱', '❱'),
+ ('❳', '❳'),
+ ('❵', '❵'),
+ ('⟆', '⟆'),
+ ('⟧', '⟧'),
+ ('⟩', '⟩'),
+ ('⟫', '⟫'),
+ ('⟭', '⟭'),
+ ('⟯', '⟯'),
+ ('⦄', '⦄'),
+ ('⦆', '⦆'),
+ ('⦈', '⦈'),
+ ('⦊', '⦊'),
+ ('⦌', '⦌'),
+ ('⦎', '⦎'),
+ ('⦐', '⦐'),
+ ('⦒', '⦒'),
+ ('⦔', '⦔'),
+ ('⦖', '⦖'),
+ ('⦘', '⦘'),
+ ('⧙', '⧙'),
+ ('⧛', '⧛'),
+ ('⧽', '⧽'),
+ ('⸣', '⸣'),
+ ('⸥', '⸥'),
+ ('⸧', '⸧'),
+ ('⸩', '⸩'),
+ ('⹖', '⹖'),
+ ('⹘', '⹘'),
+ ('⹚', '⹚'),
+ ('⹜', '⹜'),
+ ('〉', '〉'),
+ ('》', '》'),
+ ('」', '」'),
+ ('』', '』'),
+ ('】', '】'),
+ ('〕', '〕'),
+ ('〗', '〗'),
+ ('〙', '〙'),
+ ('〛', '〛'),
+ ('〞', '〟'),
+ ('﴾', '﴾'),
+ ('︘', '︘'),
+ ('︶', '︶'),
+ ('︸', '︸'),
+ ('︺', '︺'),
+ ('︼', '︼'),
+ ('︾', '︾'),
+ ('﹀', '﹀'),
+ ('﹂', '﹂'),
+ ('﹄', '﹄'),
+ ('﹈', '﹈'),
+ ('﹚', '﹚'),
+ ('﹜', '﹜'),
+ ('﹞', '﹞'),
+ (')', ')'),
+ (']', ']'),
+ ('}', '}'),
+ ('⦆', '⦆'),
+ ('」', '」'),
+];
+
+pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[
+ ('_', '_'),
+ ('‿', '⁀'),
+ ('⁔', '⁔'),
+ ('︳', '︴'),
+ ('﹍', '﹏'),
+ ('_', '_'),
+];
+
+pub const CONTROL: &'static [(char, char)] =
+ &[('\0', '\u{1f}'), ('\u{7f}', '\u{9f}')];
+
+pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[
+ ('$', '$'),
+ ('¢', '¥'),
+ ('֏', '֏'),
+ ('؋', '؋'),
+ ('߾', '߿'),
+ ('৲', '৳'),
+ ('৻', '৻'),
+ ('૱', '૱'),
+ ('௹', '௹'),
+ ('฿', '฿'),
+ ('៛', '៛'),
+ ('₠', '⃀'),
+ ('꠸', '꠸'),
+ ('﷼', '﷼'),
+ ('﹩', '﹩'),
+ ('$', '$'),
+ ('¢', '£'),
+ ('¥', '₩'),
+ ('𑿝', '𑿠'),
+ ('𞋿', '𞋿'),
+ ('𞲰', '𞲰'),
+];
+
+pub const DASH_PUNCTUATION: &'static [(char, char)] = &[
+ ('-', '-'),
+ ('֊', '֊'),
+ ('־', '־'),
+ ('᐀', '᐀'),
+ ('᠆', '᠆'),
+ ('‐', '―'),
+ ('⸗', '⸗'),
+ ('⸚', '⸚'),
+ ('⸺', '⸻'),
+ ('⹀', '⹀'),
+ ('⹝', '⹝'),
+ ('〜', '〜'),
+ ('〰', '〰'),
+ ('゠', '゠'),
+ ('︱', '︲'),
+ ('﹘', '﹘'),
+ ('﹣', '﹣'),
+ ('-', '-'),
+ ('𐺭', '𐺭'),
+];
+
+pub const DECIMAL_NUMBER: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('٠', '٩'),
+ ('۰', '۹'),
+ ('߀', '߉'),
+ ('०', '९'),
+ ('০', '৯'),
+ ('੦', '੯'),
+ ('૦', '૯'),
+ ('୦', '୯'),
+ ('௦', '௯'),
+ ('౦', '౯'),
+ ('೦', '೯'),
+ ('൦', '൯'),
+ ('෦', '෯'),
+ ('๐', '๙'),
+ ('໐', '໙'),
+ ('༠', '༩'),
+ ('၀', '၉'),
+ ('႐', '႙'),
+ ('០', '៩'),
+ ('᠐', '᠙'),
+ ('᥆', '᥏'),
+ ('᧐', '᧙'),
+ ('᪀', '᪉'),
+ ('᪐', '᪙'),
+ ('᭐', '᭙'),
+ ('᮰', '᮹'),
+ ('᱀', '᱉'),
+ ('᱐', '᱙'),
+ ('꘠', '꘩'),
+ ('꣐', '꣙'),
+ ('꤀', '꤉'),
+ ('꧐', '꧙'),
+ ('꧰', '꧹'),
+ ('꩐', '꩙'),
+ ('꯰', '꯹'),
+ ('0', '9'),
+ ('𐒠', '𐒩'),
+ ('𐴰', '𐴹'),
+ ('𑁦', '𑁯'),
+ ('𑃰', '𑃹'),
+ ('𑄶', '𑄿'),
+ ('𑇐', '𑇙'),
+ ('𑋰', '𑋹'),
+ ('𑑐', '𑑙'),
+ ('𑓐', '𑓙'),
+ ('𑙐', '𑙙'),
+ ('𑛀', '𑛉'),
+ ('𑜰', '𑜹'),
+ ('𑣠', '𑣩'),
+ ('𑥐', '𑥙'),
+ ('𑱐', '𑱙'),
+ ('𑵐', '𑵙'),
+ ('𑶠', '𑶩'),
+ ('𑽐', '𑽙'),
+ ('𖩠', '𖩩'),
+ ('𖫀', '𖫉'),
+ ('𖭐', '𖭙'),
+ ('𝟎', '𝟿'),
+ ('𞅀', '𞅉'),
+ ('𞋰', '𞋹'),
+ ('𞓰', '𞓹'),
+ ('𞥐', '𞥙'),
+ ('🯰', '🯹'),
+];
+
+pub const ENCLOSING_MARK: &'static [(char, char)] = &[
+ ('\u{488}', '\u{489}'),
+ ('\u{1abe}', '\u{1abe}'),
+ ('\u{20dd}', '\u{20e0}'),
+ ('\u{20e2}', '\u{20e4}'),
+ ('\u{a670}', '\u{a672}'),
+];
+
+pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[
+ ('»', '»'),
+ ('’', '’'),
+ ('”', '”'),
+ ('›', '›'),
+ ('⸃', '⸃'),
+ ('⸅', '⸅'),
+ ('⸊', '⸊'),
+ ('⸍', '⸍'),
+ ('⸝', '⸝'),
+ ('⸡', '⸡'),
+];
+
+pub const FORMAT: &'static [(char, char)] = &[
+ ('\u{ad}', '\u{ad}'),
+ ('\u{600}', '\u{605}'),
+ ('\u{61c}', '\u{61c}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{70f}', '\u{70f}'),
+ ('\u{890}', '\u{891}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('\u{180e}', '\u{180e}'),
+ ('\u{200b}', '\u{200f}'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2060}', '\u{2064}'),
+ ('\u{2066}', '\u{206f}'),
+ ('\u{feff}', '\u{feff}'),
+ ('\u{fff9}', '\u{fffb}'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110cd}', '\u{110cd}'),
+ ('\u{13430}', '\u{1343f}'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('\u{1d173}', '\u{1d17a}'),
+ ('\u{e0001}', '\u{e0001}'),
+ ('\u{e0020}', '\u{e007f}'),
+];
+
+pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[
+ ('«', '«'),
+ ('‘', '‘'),
+ ('‛', '“'),
+ ('‟', '‟'),
+ ('‹', '‹'),
+ ('⸂', '⸂'),
+ ('⸄', '⸄'),
+ ('⸉', '⸉'),
+ ('⸌', '⸌'),
+ ('⸜', '⸜'),
+ ('⸠', '⸠'),
+];
+
+pub const LETTER: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('Ͱ', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('ؠ', 'ي'),
+ ('ٮ', 'ٯ'),
+ ('ٱ', 'ۓ'),
+ ('ە', 'ە'),
+ ('ۥ', 'ۦ'),
+ ('ۮ', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠀ', 'ࠕ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('ऄ', 'ह'),
+ ('ऽ', 'ऽ'),
+ ('ॐ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('ॱ', 'ঀ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('ੲ', 'ੴ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ઽ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('ૹ', 'ૹ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('ୱ', 'ୱ'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ௐ', 'ௐ'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('ಀ', 'ಀ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಽ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('ೱ', 'ೲ'),
+ ('ഄ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', 'ൖ'),
+ ('ൟ', 'ൡ'),
+ ('ൺ', 'ൿ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ก', 'ะ'),
+ ('า', 'ำ'),
+ ('เ', 'ๆ'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ະ'),
+ ('າ', 'ຳ'),
+ ('ຽ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ྈ', 'ྌ'),
+ ('က', 'ဪ'),
+ ('ဿ', 'ဿ'),
+ ('ၐ', 'ၕ'),
+ ('ၚ', 'ၝ'),
+ ('ၡ', 'ၡ'),
+ ('ၥ', 'ၦ'),
+ ('ၮ', 'ၰ'),
+ ('ၵ', 'ႁ'),
+ ('ႎ', 'ႎ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛱ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('ᜟ', 'ᜱ'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ក', 'ឳ'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', 'ៜ'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢄ'),
+ ('ᢇ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᥐ', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('ᨀ', 'ᨖ'),
+ ('ᨠ', 'ᩔ'),
+ ('ᪧ', 'ᪧ'),
+ ('ᬅ', 'ᬳ'),
+ ('ᭅ', 'ᭌ'),
+ ('ᮃ', 'ᮠ'),
+ ('ᮮ', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᰀ', 'ᰣ'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ᴀ', 'ᶿ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ↄ', 'ↄ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '〆'),
+ ('〱', '〵'),
+ ('〻', '〼'),
+ ('ぁ', 'ゖ'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('Ꙁ', 'ꙮ'),
+ ('ꙿ', 'ꚝ'),
+ ('ꚠ', 'ꛥ'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠢ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢂ', 'ꢳ'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', 'ꣾ'),
+ ('ꤊ', 'ꤥ'),
+ ('ꤰ', 'ꥆ'),
+ ('ꥠ', 'ꥼ'),
+ ('ꦄ', 'ꦲ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧠ', 'ꧤ'),
+ ('ꧦ', 'ꧯ'),
+ ('ꧺ', 'ꧾ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꩺ'),
+ ('ꩾ', 'ꪯ'),
+ ('ꪱ', 'ꪱ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪽ'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫪ'),
+ ('ꫲ', 'ꫴ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯢ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍀'),
+ ('𐍂', '𐍉'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐐀', '𐒝'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '𐴣'),
+ ('𐺀', '𐺩'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀃', '𑀷'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂃', '𑂯'),
+ ('𑃐', '𑃨'),
+ ('𑄃', '𑄦'),
+ ('𑅄', '𑅄'),
+ ('𑅇', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('𑆃', '𑆲'),
+ ('𑇁', '𑇄'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈫'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '𑋞'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍡'),
+ ('𑐀', '𑐴'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '𑖮'),
+ ('𑗘', '𑗛'),
+ ('𑘀', '𑘯'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '𑚪'),
+ ('𑚸', '𑚸'),
+ ('𑜀', '𑜚'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '𑠫'),
+ ('𑢠', '𑣟'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧐'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧣'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨺', '𑨺'),
+ ('𑩐', '𑩐'),
+ ('𑩜', '𑪉'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰮'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶉'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻲'),
+ ('𑼂', '𑼂'),
+ ('𑼄', '𑼐'),
+ ('𑼒', '𑼳'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭀', '𖭃'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖽐'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓫'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞤀', '𞥃'),
+ ('𞥋', '𞥋'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const LETTER_NUMBER: &'static [(char, char)] = &[
+ ('ᛮ', 'ᛰ'),
+ ('Ⅰ', 'ↂ'),
+ ('ↅ', 'ↈ'),
+ ('〇', '〇'),
+ ('〡', '〩'),
+ ('〸', '〺'),
+ ('ꛦ', 'ꛯ'),
+ ('𐅀', '𐅴'),
+ ('𐍁', '𐍁'),
+ ('𐍊', '𐍊'),
+ ('𐏑', '𐏕'),
+ ('𒐀', '𒑮'),
+];
+
+pub const LINE_SEPARATOR: &'static [(char, char)] =
+ &[('\u{2028}', '\u{2028}')];
+
+pub const LOWERCASE_LETTER: &'static [(char, char)] = &[
+ ('a', 'z'),
+ ('µ', 'µ'),
+ ('ß', 'ö'),
+ ('ø', 'ÿ'),
+ ('ā', 'ā'),
+ ('ă', 'ă'),
+ ('ą', 'ą'),
+ ('ć', 'ć'),
+ ('ĉ', 'ĉ'),
+ ('ċ', 'ċ'),
+ ('č', 'č'),
+ ('ď', 'ď'),
+ ('đ', 'đ'),
+ ('ē', 'ē'),
+ ('ĕ', 'ĕ'),
+ ('ė', 'ė'),
+ ('ę', 'ę'),
+ ('ě', 'ě'),
+ ('ĝ', 'ĝ'),
+ ('ğ', 'ğ'),
+ ('ġ', 'ġ'),
+ ('ģ', 'ģ'),
+ ('ĥ', 'ĥ'),
+ ('ħ', 'ħ'),
+ ('ĩ', 'ĩ'),
+ ('ī', 'ī'),
+ ('ĭ', 'ĭ'),
+ ('į', 'į'),
+ ('ı', 'ı'),
+ ('ij', 'ij'),
+ ('ĵ', 'ĵ'),
+ ('ķ', 'ĸ'),
+ ('ĺ', 'ĺ'),
+ ('ļ', 'ļ'),
+ ('ľ', 'ľ'),
+ ('ŀ', 'ŀ'),
+ ('ł', 'ł'),
+ ('ń', 'ń'),
+ ('ņ', 'ņ'),
+ ('ň', 'ʼn'),
+ ('ŋ', 'ŋ'),
+ ('ō', 'ō'),
+ ('ŏ', 'ŏ'),
+ ('ő', 'ő'),
+ ('œ', 'œ'),
+ ('ŕ', 'ŕ'),
+ ('ŗ', 'ŗ'),
+ ('ř', 'ř'),
+ ('ś', 'ś'),
+ ('ŝ', 'ŝ'),
+ ('ş', 'ş'),
+ ('š', 'š'),
+ ('ţ', 'ţ'),
+ ('ť', 'ť'),
+ ('ŧ', 'ŧ'),
+ ('ũ', 'ũ'),
+ ('ū', 'ū'),
+ ('ŭ', 'ŭ'),
+ ('ů', 'ů'),
+ ('ű', 'ű'),
+ ('ų', 'ų'),
+ ('ŵ', 'ŵ'),
+ ('ŷ', 'ŷ'),
+ ('ź', 'ź'),
+ ('ż', 'ż'),
+ ('ž', 'ƀ'),
+ ('ƃ', 'ƃ'),
+ ('ƅ', 'ƅ'),
+ ('ƈ', 'ƈ'),
+ ('ƌ', 'ƍ'),
+ ('ƒ', 'ƒ'),
+ ('ƕ', 'ƕ'),
+ ('ƙ', 'ƛ'),
+ ('ƞ', 'ƞ'),
+ ('ơ', 'ơ'),
+ ('ƣ', 'ƣ'),
+ ('ƥ', 'ƥ'),
+ ('ƨ', 'ƨ'),
+ ('ƪ', 'ƫ'),
+ ('ƭ', 'ƭ'),
+ ('ư', 'ư'),
+ ('ƴ', 'ƴ'),
+ ('ƶ', 'ƶ'),
+ ('ƹ', 'ƺ'),
+ ('ƽ', 'ƿ'),
+ ('dž', 'dž'),
+ ('lj', 'lj'),
+ ('nj', 'nj'),
+ ('ǎ', 'ǎ'),
+ ('ǐ', 'ǐ'),
+ ('ǒ', 'ǒ'),
+ ('ǔ', 'ǔ'),
+ ('ǖ', 'ǖ'),
+ ('ǘ', 'ǘ'),
+ ('ǚ', 'ǚ'),
+ ('ǜ', 'ǝ'),
+ ('ǟ', 'ǟ'),
+ ('ǡ', 'ǡ'),
+ ('ǣ', 'ǣ'),
+ ('ǥ', 'ǥ'),
+ ('ǧ', 'ǧ'),
+ ('ǩ', 'ǩ'),
+ ('ǫ', 'ǫ'),
+ ('ǭ', 'ǭ'),
+ ('ǯ', 'ǰ'),
+ ('dz', 'dz'),
+ ('ǵ', 'ǵ'),
+ ('ǹ', 'ǹ'),
+ ('ǻ', 'ǻ'),
+ ('ǽ', 'ǽ'),
+ ('ǿ', 'ǿ'),
+ ('ȁ', 'ȁ'),
+ ('ȃ', 'ȃ'),
+ ('ȅ', 'ȅ'),
+ ('ȇ', 'ȇ'),
+ ('ȉ', 'ȉ'),
+ ('ȋ', 'ȋ'),
+ ('ȍ', 'ȍ'),
+ ('ȏ', 'ȏ'),
+ ('ȑ', 'ȑ'),
+ ('ȓ', 'ȓ'),
+ ('ȕ', 'ȕ'),
+ ('ȗ', 'ȗ'),
+ ('ș', 'ș'),
+ ('ț', 'ț'),
+ ('ȝ', 'ȝ'),
+ ('ȟ', 'ȟ'),
+ ('ȡ', 'ȡ'),
+ ('ȣ', 'ȣ'),
+ ('ȥ', 'ȥ'),
+ ('ȧ', 'ȧ'),
+ ('ȩ', 'ȩ'),
+ ('ȫ', 'ȫ'),
+ ('ȭ', 'ȭ'),
+ ('ȯ', 'ȯ'),
+ ('ȱ', 'ȱ'),
+ ('ȳ', 'ȹ'),
+ ('ȼ', 'ȼ'),
+ ('ȿ', 'ɀ'),
+ ('ɂ', 'ɂ'),
+ ('ɇ', 'ɇ'),
+ ('ɉ', 'ɉ'),
+ ('ɋ', 'ɋ'),
+ ('ɍ', 'ɍ'),
+ ('ɏ', 'ʓ'),
+ ('ʕ', 'ʯ'),
+ ('ͱ', 'ͱ'),
+ ('ͳ', 'ͳ'),
+ ('ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('ΐ', 'ΐ'),
+ ('ά', 'ώ'),
+ ('ϐ', 'ϑ'),
+ ('ϕ', 'ϗ'),
+ ('ϙ', 'ϙ'),
+ ('ϛ', 'ϛ'),
+ ('ϝ', 'ϝ'),
+ ('ϟ', 'ϟ'),
+ ('ϡ', 'ϡ'),
+ ('ϣ', 'ϣ'),
+ ('ϥ', 'ϥ'),
+ ('ϧ', 'ϧ'),
+ ('ϩ', 'ϩ'),
+ ('ϫ', 'ϫ'),
+ ('ϭ', 'ϭ'),
+ ('ϯ', 'ϳ'),
+ ('ϵ', 'ϵ'),
+ ('ϸ', 'ϸ'),
+ ('ϻ', 'ϼ'),
+ ('а', 'џ'),
+ ('ѡ', 'ѡ'),
+ ('ѣ', 'ѣ'),
+ ('ѥ', 'ѥ'),
+ ('ѧ', 'ѧ'),
+ ('ѩ', 'ѩ'),
+ ('ѫ', 'ѫ'),
+ ('ѭ', 'ѭ'),
+ ('ѯ', 'ѯ'),
+ ('ѱ', 'ѱ'),
+ ('ѳ', 'ѳ'),
+ ('ѵ', 'ѵ'),
+ ('ѷ', 'ѷ'),
+ ('ѹ', 'ѹ'),
+ ('ѻ', 'ѻ'),
+ ('ѽ', 'ѽ'),
+ ('ѿ', 'ѿ'),
+ ('ҁ', 'ҁ'),
+ ('ҋ', 'ҋ'),
+ ('ҍ', 'ҍ'),
+ ('ҏ', 'ҏ'),
+ ('ґ', 'ґ'),
+ ('ғ', 'ғ'),
+ ('ҕ', 'ҕ'),
+ ('җ', 'җ'),
+ ('ҙ', 'ҙ'),
+ ('қ', 'қ'),
+ ('ҝ', 'ҝ'),
+ ('ҟ', 'ҟ'),
+ ('ҡ', 'ҡ'),
+ ('ң', 'ң'),
+ ('ҥ', 'ҥ'),
+ ('ҧ', 'ҧ'),
+ ('ҩ', 'ҩ'),
+ ('ҫ', 'ҫ'),
+ ('ҭ', 'ҭ'),
+ ('ү', 'ү'),
+ ('ұ', 'ұ'),
+ ('ҳ', 'ҳ'),
+ ('ҵ', 'ҵ'),
+ ('ҷ', 'ҷ'),
+ ('ҹ', 'ҹ'),
+ ('һ', 'һ'),
+ ('ҽ', 'ҽ'),
+ ('ҿ', 'ҿ'),
+ ('ӂ', 'ӂ'),
+ ('ӄ', 'ӄ'),
+ ('ӆ', 'ӆ'),
+ ('ӈ', 'ӈ'),
+ ('ӊ', 'ӊ'),
+ ('ӌ', 'ӌ'),
+ ('ӎ', 'ӏ'),
+ ('ӑ', 'ӑ'),
+ ('ӓ', 'ӓ'),
+ ('ӕ', 'ӕ'),
+ ('ӗ', 'ӗ'),
+ ('ә', 'ә'),
+ ('ӛ', 'ӛ'),
+ ('ӝ', 'ӝ'),
+ ('ӟ', 'ӟ'),
+ ('ӡ', 'ӡ'),
+ ('ӣ', 'ӣ'),
+ ('ӥ', 'ӥ'),
+ ('ӧ', 'ӧ'),
+ ('ө', 'ө'),
+ ('ӫ', 'ӫ'),
+ ('ӭ', 'ӭ'),
+ ('ӯ', 'ӯ'),
+ ('ӱ', 'ӱ'),
+ ('ӳ', 'ӳ'),
+ ('ӵ', 'ӵ'),
+ ('ӷ', 'ӷ'),
+ ('ӹ', 'ӹ'),
+ ('ӻ', 'ӻ'),
+ ('ӽ', 'ӽ'),
+ ('ӿ', 'ӿ'),
+ ('ԁ', 'ԁ'),
+ ('ԃ', 'ԃ'),
+ ('ԅ', 'ԅ'),
+ ('ԇ', 'ԇ'),
+ ('ԉ', 'ԉ'),
+ ('ԋ', 'ԋ'),
+ ('ԍ', 'ԍ'),
+ ('ԏ', 'ԏ'),
+ ('ԑ', 'ԑ'),
+ ('ԓ', 'ԓ'),
+ ('ԕ', 'ԕ'),
+ ('ԗ', 'ԗ'),
+ ('ԙ', 'ԙ'),
+ ('ԛ', 'ԛ'),
+ ('ԝ', 'ԝ'),
+ ('ԟ', 'ԟ'),
+ ('ԡ', 'ԡ'),
+ ('ԣ', 'ԣ'),
+ ('ԥ', 'ԥ'),
+ ('ԧ', 'ԧ'),
+ ('ԩ', 'ԩ'),
+ ('ԫ', 'ԫ'),
+ ('ԭ', 'ԭ'),
+ ('ԯ', 'ԯ'),
+ ('ՠ', 'ֈ'),
+ ('ა', 'ჺ'),
+ ('ჽ', 'ჿ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᴀ', 'ᴫ'),
+ ('ᵫ', 'ᵷ'),
+ ('ᵹ', 'ᶚ'),
+ ('ḁ', 'ḁ'),
+ ('ḃ', 'ḃ'),
+ ('ḅ', 'ḅ'),
+ ('ḇ', 'ḇ'),
+ ('ḉ', 'ḉ'),
+ ('ḋ', 'ḋ'),
+ ('ḍ', 'ḍ'),
+ ('ḏ', 'ḏ'),
+ ('ḑ', 'ḑ'),
+ ('ḓ', 'ḓ'),
+ ('ḕ', 'ḕ'),
+ ('ḗ', 'ḗ'),
+ ('ḙ', 'ḙ'),
+ ('ḛ', 'ḛ'),
+ ('ḝ', 'ḝ'),
+ ('ḟ', 'ḟ'),
+ ('ḡ', 'ḡ'),
+ ('ḣ', 'ḣ'),
+ ('ḥ', 'ḥ'),
+ ('ḧ', 'ḧ'),
+ ('ḩ', 'ḩ'),
+ ('ḫ', 'ḫ'),
+ ('ḭ', 'ḭ'),
+ ('ḯ', 'ḯ'),
+ ('ḱ', 'ḱ'),
+ ('ḳ', 'ḳ'),
+ ('ḵ', 'ḵ'),
+ ('ḷ', 'ḷ'),
+ ('ḹ', 'ḹ'),
+ ('ḻ', 'ḻ'),
+ ('ḽ', 'ḽ'),
+ ('ḿ', 'ḿ'),
+ ('ṁ', 'ṁ'),
+ ('ṃ', 'ṃ'),
+ ('ṅ', 'ṅ'),
+ ('ṇ', 'ṇ'),
+ ('ṉ', 'ṉ'),
+ ('ṋ', 'ṋ'),
+ ('ṍ', 'ṍ'),
+ ('ṏ', 'ṏ'),
+ ('ṑ', 'ṑ'),
+ ('ṓ', 'ṓ'),
+ ('ṕ', 'ṕ'),
+ ('ṗ', 'ṗ'),
+ ('ṙ', 'ṙ'),
+ ('ṛ', 'ṛ'),
+ ('ṝ', 'ṝ'),
+ ('ṟ', 'ṟ'),
+ ('ṡ', 'ṡ'),
+ ('ṣ', 'ṣ'),
+ ('ṥ', 'ṥ'),
+ ('ṧ', 'ṧ'),
+ ('ṩ', 'ṩ'),
+ ('ṫ', 'ṫ'),
+ ('ṭ', 'ṭ'),
+ ('ṯ', 'ṯ'),
+ ('ṱ', 'ṱ'),
+ ('ṳ', 'ṳ'),
+ ('ṵ', 'ṵ'),
+ ('ṷ', 'ṷ'),
+ ('ṹ', 'ṹ'),
+ ('ṻ', 'ṻ'),
+ ('ṽ', 'ṽ'),
+ ('ṿ', 'ṿ'),
+ ('ẁ', 'ẁ'),
+ ('ẃ', 'ẃ'),
+ ('ẅ', 'ẅ'),
+ ('ẇ', 'ẇ'),
+ ('ẉ', 'ẉ'),
+ ('ẋ', 'ẋ'),
+ ('ẍ', 'ẍ'),
+ ('ẏ', 'ẏ'),
+ ('ẑ', 'ẑ'),
+ ('ẓ', 'ẓ'),
+ ('ẕ', 'ẝ'),
+ ('ẟ', 'ẟ'),
+ ('ạ', 'ạ'),
+ ('ả', 'ả'),
+ ('ấ', 'ấ'),
+ ('ầ', 'ầ'),
+ ('ẩ', 'ẩ'),
+ ('ẫ', 'ẫ'),
+ ('ậ', 'ậ'),
+ ('ắ', 'ắ'),
+ ('ằ', 'ằ'),
+ ('ẳ', 'ẳ'),
+ ('ẵ', 'ẵ'),
+ ('ặ', 'ặ'),
+ ('ẹ', 'ẹ'),
+ ('ẻ', 'ẻ'),
+ ('ẽ', 'ẽ'),
+ ('ế', 'ế'),
+ ('ề', 'ề'),
+ ('ể', 'ể'),
+ ('ễ', 'ễ'),
+ ('ệ', 'ệ'),
+ ('ỉ', 'ỉ'),
+ ('ị', 'ị'),
+ ('ọ', 'ọ'),
+ ('ỏ', 'ỏ'),
+ ('ố', 'ố'),
+ ('ồ', 'ồ'),
+ ('ổ', 'ổ'),
+ ('ỗ', 'ỗ'),
+ ('ộ', 'ộ'),
+ ('ớ', 'ớ'),
+ ('ờ', 'ờ'),
+ ('ở', 'ở'),
+ ('ỡ', 'ỡ'),
+ ('ợ', 'ợ'),
+ ('ụ', 'ụ'),
+ ('ủ', 'ủ'),
+ ('ứ', 'ứ'),
+ ('ừ', 'ừ'),
+ ('ử', 'ử'),
+ ('ữ', 'ữ'),
+ ('ự', 'ự'),
+ ('ỳ', 'ỳ'),
+ ('ỵ', 'ỵ'),
+ ('ỷ', 'ỷ'),
+ ('ỹ', 'ỹ'),
+ ('ỻ', 'ỻ'),
+ ('ỽ', 'ỽ'),
+ ('ỿ', 'ἇ'),
+ ('ἐ', 'ἕ'),
+ ('ἠ', 'ἧ'),
+ ('ἰ', 'ἷ'),
+ ('ὀ', 'ὅ'),
+ ('ὐ', 'ὗ'),
+ ('ὠ', 'ὧ'),
+ ('ὰ', 'ώ'),
+ ('ᾀ', 'ᾇ'),
+ ('ᾐ', 'ᾗ'),
+ ('ᾠ', 'ᾧ'),
+ ('ᾰ', 'ᾴ'),
+ ('ᾶ', 'ᾷ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῇ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'ῗ'),
+ ('ῠ', 'ῧ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῷ'),
+ ('ℊ', 'ℊ'),
+ ('ℎ', 'ℏ'),
+ ('ℓ', 'ℓ'),
+ ('ℯ', 'ℯ'),
+ ('ℴ', 'ℴ'),
+ ('ℹ', 'ℹ'),
+ ('ℼ', 'ℽ'),
+ ('ⅆ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('ↄ', 'ↄ'),
+ ('ⰰ', 'ⱟ'),
+ ('ⱡ', 'ⱡ'),
+ ('ⱥ', 'ⱦ'),
+ ('ⱨ', 'ⱨ'),
+ ('ⱪ', 'ⱪ'),
+ ('ⱬ', 'ⱬ'),
+ ('ⱱ', 'ⱱ'),
+ ('ⱳ', 'ⱴ'),
+ ('ⱶ', 'ⱻ'),
+ ('ⲁ', 'ⲁ'),
+ ('ⲃ', 'ⲃ'),
+ ('ⲅ', 'ⲅ'),
+ ('ⲇ', 'ⲇ'),
+ ('ⲉ', 'ⲉ'),
+ ('ⲋ', 'ⲋ'),
+ ('ⲍ', 'ⲍ'),
+ ('ⲏ', 'ⲏ'),
+ ('ⲑ', 'ⲑ'),
+ ('ⲓ', 'ⲓ'),
+ ('ⲕ', 'ⲕ'),
+ ('ⲗ', 'ⲗ'),
+ ('ⲙ', 'ⲙ'),
+ ('ⲛ', 'ⲛ'),
+ ('ⲝ', 'ⲝ'),
+ ('ⲟ', 'ⲟ'),
+ ('ⲡ', 'ⲡ'),
+ ('ⲣ', 'ⲣ'),
+ ('ⲥ', 'ⲥ'),
+ ('ⲧ', 'ⲧ'),
+ ('ⲩ', 'ⲩ'),
+ ('ⲫ', 'ⲫ'),
+ ('ⲭ', 'ⲭ'),
+ ('ⲯ', 'ⲯ'),
+ ('ⲱ', 'ⲱ'),
+ ('ⲳ', 'ⲳ'),
+ ('ⲵ', 'ⲵ'),
+ ('ⲷ', 'ⲷ'),
+ ('ⲹ', 'ⲹ'),
+ ('ⲻ', 'ⲻ'),
+ ('ⲽ', 'ⲽ'),
+ ('ⲿ', 'ⲿ'),
+ ('ⳁ', 'ⳁ'),
+ ('ⳃ', 'ⳃ'),
+ ('ⳅ', 'ⳅ'),
+ ('ⳇ', 'ⳇ'),
+ ('ⳉ', 'ⳉ'),
+ ('ⳋ', 'ⳋ'),
+ ('ⳍ', 'ⳍ'),
+ ('ⳏ', 'ⳏ'),
+ ('ⳑ', 'ⳑ'),
+ ('ⳓ', 'ⳓ'),
+ ('ⳕ', 'ⳕ'),
+ ('ⳗ', 'ⳗ'),
+ ('ⳙ', 'ⳙ'),
+ ('ⳛ', 'ⳛ'),
+ ('ⳝ', 'ⳝ'),
+ ('ⳟ', 'ⳟ'),
+ ('ⳡ', 'ⳡ'),
+ ('ⳣ', 'ⳤ'),
+ ('ⳬ', 'ⳬ'),
+ ('ⳮ', 'ⳮ'),
+ ('ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ꙁ', 'ꙁ'),
+ ('ꙃ', 'ꙃ'),
+ ('ꙅ', 'ꙅ'),
+ ('ꙇ', 'ꙇ'),
+ ('ꙉ', 'ꙉ'),
+ ('ꙋ', 'ꙋ'),
+ ('ꙍ', 'ꙍ'),
+ ('ꙏ', 'ꙏ'),
+ ('ꙑ', 'ꙑ'),
+ ('ꙓ', 'ꙓ'),
+ ('ꙕ', 'ꙕ'),
+ ('ꙗ', 'ꙗ'),
+ ('ꙙ', 'ꙙ'),
+ ('ꙛ', 'ꙛ'),
+ ('ꙝ', 'ꙝ'),
+ ('ꙟ', 'ꙟ'),
+ ('ꙡ', 'ꙡ'),
+ ('ꙣ', 'ꙣ'),
+ ('ꙥ', 'ꙥ'),
+ ('ꙧ', 'ꙧ'),
+ ('ꙩ', 'ꙩ'),
+ ('ꙫ', 'ꙫ'),
+ ('ꙭ', 'ꙭ'),
+ ('ꚁ', 'ꚁ'),
+ ('ꚃ', 'ꚃ'),
+ ('ꚅ', 'ꚅ'),
+ ('ꚇ', 'ꚇ'),
+ ('ꚉ', 'ꚉ'),
+ ('ꚋ', 'ꚋ'),
+ ('ꚍ', 'ꚍ'),
+ ('ꚏ', 'ꚏ'),
+ ('ꚑ', 'ꚑ'),
+ ('ꚓ', 'ꚓ'),
+ ('ꚕ', 'ꚕ'),
+ ('ꚗ', 'ꚗ'),
+ ('ꚙ', 'ꚙ'),
+ ('ꚛ', 'ꚛ'),
+ ('ꜣ', 'ꜣ'),
+ ('ꜥ', 'ꜥ'),
+ ('ꜧ', 'ꜧ'),
+ ('ꜩ', 'ꜩ'),
+ ('ꜫ', 'ꜫ'),
+ ('ꜭ', 'ꜭ'),
+ ('ꜯ', 'ꜱ'),
+ ('ꜳ', 'ꜳ'),
+ ('ꜵ', 'ꜵ'),
+ ('ꜷ', 'ꜷ'),
+ ('ꜹ', 'ꜹ'),
+ ('ꜻ', 'ꜻ'),
+ ('ꜽ', 'ꜽ'),
+ ('ꜿ', 'ꜿ'),
+ ('ꝁ', 'ꝁ'),
+ ('ꝃ', 'ꝃ'),
+ ('ꝅ', 'ꝅ'),
+ ('ꝇ', 'ꝇ'),
+ ('ꝉ', 'ꝉ'),
+ ('ꝋ', 'ꝋ'),
+ ('ꝍ', 'ꝍ'),
+ ('ꝏ', 'ꝏ'),
+ ('ꝑ', 'ꝑ'),
+ ('ꝓ', 'ꝓ'),
+ ('ꝕ', 'ꝕ'),
+ ('ꝗ', 'ꝗ'),
+ ('ꝙ', 'ꝙ'),
+ ('ꝛ', 'ꝛ'),
+ ('ꝝ', 'ꝝ'),
+ ('ꝟ', 'ꝟ'),
+ ('ꝡ', 'ꝡ'),
+ ('ꝣ', 'ꝣ'),
+ ('ꝥ', 'ꝥ'),
+ ('ꝧ', 'ꝧ'),
+ ('ꝩ', 'ꝩ'),
+ ('ꝫ', 'ꝫ'),
+ ('ꝭ', 'ꝭ'),
+ ('ꝯ', 'ꝯ'),
+ ('ꝱ', 'ꝸ'),
+ ('ꝺ', 'ꝺ'),
+ ('ꝼ', 'ꝼ'),
+ ('ꝿ', 'ꝿ'),
+ ('ꞁ', 'ꞁ'),
+ ('ꞃ', 'ꞃ'),
+ ('ꞅ', 'ꞅ'),
+ ('ꞇ', 'ꞇ'),
+ ('ꞌ', 'ꞌ'),
+ ('ꞎ', 'ꞎ'),
+ ('ꞑ', 'ꞑ'),
+ ('ꞓ', 'ꞕ'),
+ ('ꞗ', 'ꞗ'),
+ ('ꞙ', 'ꞙ'),
+ ('ꞛ', 'ꞛ'),
+ ('ꞝ', 'ꞝ'),
+ ('ꞟ', 'ꞟ'),
+ ('ꞡ', 'ꞡ'),
+ ('ꞣ', 'ꞣ'),
+ ('ꞥ', 'ꞥ'),
+ ('ꞧ', 'ꞧ'),
+ ('ꞩ', 'ꞩ'),
+ ('ꞯ', 'ꞯ'),
+ ('ꞵ', 'ꞵ'),
+ ('ꞷ', 'ꞷ'),
+ ('ꞹ', 'ꞹ'),
+ ('ꞻ', 'ꞻ'),
+ ('ꞽ', 'ꞽ'),
+ ('ꞿ', 'ꞿ'),
+ ('ꟁ', 'ꟁ'),
+ ('ꟃ', 'ꟃ'),
+ ('ꟈ', 'ꟈ'),
+ ('ꟊ', 'ꟊ'),
+ ('ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟕ'),
+ ('ꟗ', 'ꟗ'),
+ ('ꟙ', 'ꟙ'),
+ ('ꟶ', 'ꟶ'),
+ ('ꟺ', 'ꟺ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭠ', 'ꭨ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('a', 'z'),
+ ('𐐨', '𐑏'),
+ ('𐓘', '𐓻'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐳀', '𐳲'),
+ ('𑣀', '𑣟'),
+ ('𖹠', '𖹿'),
+ ('𝐚', '𝐳'),
+ ('𝑎', '𝑔'),
+ ('𝑖', '𝑧'),
+ ('𝒂', '𝒛'),
+ ('𝒶', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝓏'),
+ ('𝓪', '𝔃'),
+ ('𝔞', '𝔷'),
+ ('𝕒', '𝕫'),
+ ('𝖆', '𝖟'),
+ ('𝖺', '𝗓'),
+ ('𝗮', '𝘇'),
+ ('𝘢', '𝘻'),
+ ('𝙖', '𝙯'),
+ ('𝚊', '𝚥'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛡'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜛'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝕'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞏'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟉'),
+ ('𝟋', '𝟋'),
+ ('𝼀', '𝼉'),
+ ('𝼋', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞤢', '𞥃'),
+];
+
+pub const MARK: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{483}', '\u{489}'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6df}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', '\u{7f3}'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{819}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('\u{898}', '\u{89f}'),
+ ('\u{8ca}', '\u{8e1}'),
+ ('\u{8e3}', 'ः'),
+ ('\u{93a}', '\u{93c}'),
+ ('ा', 'ॏ'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', 'ঃ'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9be}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', '\u{9cd}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('\u{abc}', '\u{abc}'),
+ ('ા', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3e}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', 'ಃ'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('ಾ', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('ೳ', 'ೳ'),
+ ('\u{d00}', 'ഃ'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d3e}', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', '\u{d4d}'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', 'ඃ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', 'ෳ'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e47}', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', '༿'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('ါ', '\u{103e}'),
+ ('ၖ', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('ၢ', 'ၤ'),
+ ('ၧ', 'ၭ'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{108d}'),
+ ('ႏ', 'ႏ'),
+ ('ႚ', '\u{109d}'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '᜕'),
+ ('\u{1732}', '᜴'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('\u{1a17}', '\u{1a1b}'),
+ ('ᩕ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', 'ᬄ'),
+ ('\u{1b34}', '᭄'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', 'ᮂ'),
+ ('ᮡ', '\u{1bad}'),
+ ('\u{1be6}', '᯳'),
+ ('ᰤ', '\u{1c37}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('᳷', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{302a}', '\u{302f}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{a66f}', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('ꠣ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꢀ', 'ꢁ'),
+ ('ꢴ', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '꥓'),
+ ('\u{a980}', 'ꦃ'),
+ ('\u{a9b3}', '꧀'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', 'ꩍ'),
+ ('ꩻ', 'ꩽ'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('ꫫ', 'ꫯ'),
+ ('ꫵ', '\u{aaf6}'),
+ ('ꯣ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('𑀀', '𑀂'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '𑂂'),
+ ('𑂰', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{11134}'),
+ ('𑅅', '𑅆'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '𑆂'),
+ ('𑆳', '𑇀'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '\u{111cf}'),
+ ('𑈬', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112ea}'),
+ ('\u{11300}', '𑌃'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{1133e}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍢', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐵', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b0}', '\u{114c3}'),
+ ('\u{115af}', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('𑘰', '\u{11640}'),
+ ('\u{116ab}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑠬', '\u{1183a}'),
+ ('\u{11930}', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{1193e}'),
+ ('𑥀', '𑥀'),
+ ('𑥂', '\u{11943}'),
+ ('𑧑', '\u{119d7}'),
+ ('\u{119da}', '\u{119e0}'),
+ ('𑧤', '𑧤'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '𑨹'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a99}'),
+ ('𑰯', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('𑶊', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '\u{11d97}'),
+ ('\u{11ef3}', '𑻶'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('𑼃', '𑼃'),
+ ('𑼴', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('\u{13440}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('𖽑', '𖾇'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e4ec}', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e94a}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const MATH_SYMBOL: &'static [(char, char)] = &[
+ ('+', '+'),
+ ('<', '>'),
+ ('|', '|'),
+ ('~', '~'),
+ ('¬', '¬'),
+ ('±', '±'),
+ ('×', '×'),
+ ('÷', '÷'),
+ ('϶', '϶'),
+ ('؆', '؈'),
+ ('⁄', '⁄'),
+ ('⁒', '⁒'),
+ ('⁺', '⁼'),
+ ('₊', '₌'),
+ ('℘', '℘'),
+ ('⅀', '⅄'),
+ ('⅋', '⅋'),
+ ('←', '↔'),
+ ('↚', '↛'),
+ ('↠', '↠'),
+ ('↣', '↣'),
+ ('↦', '↦'),
+ ('↮', '↮'),
+ ('⇎', '⇏'),
+ ('⇒', '⇒'),
+ ('⇔', '⇔'),
+ ('⇴', '⋿'),
+ ('⌠', '⌡'),
+ ('⍼', '⍼'),
+ ('⎛', '⎳'),
+ ('⏜', '⏡'),
+ ('▷', '▷'),
+ ('◁', '◁'),
+ ('◸', '◿'),
+ ('♯', '♯'),
+ ('⟀', '⟄'),
+ ('⟇', '⟥'),
+ ('⟰', '⟿'),
+ ('⤀', '⦂'),
+ ('⦙', '⧗'),
+ ('⧜', '⧻'),
+ ('⧾', '⫿'),
+ ('⬰', '⭄'),
+ ('⭇', '⭌'),
+ ('﬩', '﬩'),
+ ('﹢', '﹢'),
+ ('﹤', '﹦'),
+ ('+', '+'),
+ ('<', '>'),
+ ('|', '|'),
+ ('~', '~'),
+ ('¬', '¬'),
+ ('←', '↓'),
+ ('𝛁', '𝛁'),
+ ('𝛛', '𝛛'),
+ ('𝛻', '𝛻'),
+ ('𝜕', '𝜕'),
+ ('𝜵', '𝜵'),
+ ('𝝏', '𝝏'),
+ ('𝝯', '𝝯'),
+ ('𝞉', '𝞉'),
+ ('𝞩', '𝞩'),
+ ('𝟃', '𝟃'),
+ ('𞻰', '𞻱'),
+];
+
+pub const MODIFIER_LETTER: &'static [(char, char)] = &[
+ ('ʰ', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('ʹ', 'ʹ'),
+ ('ͺ', 'ͺ'),
+ ('ՙ', 'ՙ'),
+ ('ـ', 'ـ'),
+ ('ۥ', 'ۦ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('ࣉ', 'ࣉ'),
+ ('ॱ', 'ॱ'),
+ ('ๆ', 'ๆ'),
+ ('ໆ', 'ໆ'),
+ ('ჼ', 'ჼ'),
+ ('ៗ', 'ៗ'),
+ ('ᡃ', 'ᡃ'),
+ ('ᪧ', 'ᪧ'),
+ ('ᱸ', 'ᱽ'),
+ ('ᴬ', 'ᵪ'),
+ ('ᵸ', 'ᵸ'),
+ ('ᶛ', 'ᶿ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ⱼ', 'ⱽ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '々'),
+ ('〱', '〵'),
+ ('〻', '〻'),
+ ('ゝ', 'ゞ'),
+ ('ー', 'ヾ'),
+ ('ꀕ', 'ꀕ'),
+ ('ꓸ', 'ꓽ'),
+ ('ꘌ', 'ꘌ'),
+ ('ꙿ', 'ꙿ'),
+ ('ꚜ', 'ꚝ'),
+ ('ꜗ', 'ꜟ'),
+ ('ꝰ', 'ꝰ'),
+ ('ꞈ', 'ꞈ'),
+ ('ꟲ', 'ꟴ'),
+ ('ꟸ', 'ꟹ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧦ', 'ꧦ'),
+ ('ꩰ', 'ꩰ'),
+ ('ꫝ', 'ꫝ'),
+ ('ꫳ', 'ꫴ'),
+ ('ꭜ', 'ꭟ'),
+ ('ꭩ', 'ꭩ'),
+ ('ー', 'ー'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𖭀', '𖭃'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𞀰', '𞁭'),
+ ('𞄷', '𞄽'),
+ ('𞓫', '𞓫'),
+ ('𞥋', '𞥋'),
+];
+
+pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[
+ ('^', '^'),
+ ('`', '`'),
+ ('¨', '¨'),
+ ('¯', '¯'),
+ ('´', '´'),
+ ('¸', '¸'),
+ ('˂', '˅'),
+ ('˒', '˟'),
+ ('˥', '˫'),
+ ('˭', '˭'),
+ ('˯', '˿'),
+ ('͵', '͵'),
+ ('΄', '΅'),
+ ('࢈', '࢈'),
+ ('᾽', '᾽'),
+ ('᾿', '῁'),
+ ('῍', '῏'),
+ ('῝', '῟'),
+ ('῭', '`'),
+ ('´', '῾'),
+ ('゛', '゜'),
+ ('꜀', '꜖'),
+ ('꜠', '꜡'),
+ ('꞉', '꞊'),
+ ('꭛', '꭛'),
+ ('꭪', '꭫'),
+ ('﮲', '﯂'),
+ ('^', '^'),
+ ('`', '`'),
+ (' ̄', ' ̄'),
+ ('🏻', '🏿'),
+];
+
+pub const NONSPACING_MARK: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{483}', '\u{487}'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6df}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', '\u{7f3}'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{819}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('\u{898}', '\u{89f}'),
+ ('\u{8ca}', '\u{8e1}'),
+ ('\u{8e3}', '\u{902}'),
+ ('\u{93a}', '\u{93a}'),
+ ('\u{93c}', '\u{93c}'),
+ ('\u{941}', '\u{948}'),
+ ('\u{94d}', '\u{94d}'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', '\u{981}'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9c1}', '\u{9c4}'),
+ ('\u{9cd}', '\u{9cd}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', '\u{a02}'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('\u{a41}', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', '\u{a82}'),
+ ('\u{abc}', '\u{abc}'),
+ ('\u{ac1}', '\u{ac5}'),
+ ('\u{ac7}', '\u{ac8}'),
+ ('\u{acd}', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', '\u{b01}'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3f}', '\u{b3f}'),
+ ('\u{b41}', '\u{b44}'),
+ ('\u{b4d}', '\u{b4d}'),
+ ('\u{b55}', '\u{b56}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bc0}', '\u{bc0}'),
+ ('\u{bcd}', '\u{bcd}'),
+ ('\u{c00}', '\u{c00}'),
+ ('\u{c04}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', '\u{c40}'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', '\u{c81}'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('\u{cbf}', '\u{cbf}'),
+ ('\u{cc6}', '\u{cc6}'),
+ ('\u{ccc}', '\u{ccd}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('\u{d00}', '\u{d01}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d41}', '\u{d44}'),
+ ('\u{d4d}', '\u{d4d}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', '\u{d81}'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dd2}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e47}', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('\u{f71}', '\u{f7e}'),
+ ('\u{f80}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('\u{102d}', '\u{1030}'),
+ ('\u{1032}', '\u{1037}'),
+ ('\u{1039}', '\u{103a}'),
+ ('\u{103d}', '\u{103e}'),
+ ('\u{1058}', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{1082}'),
+ ('\u{1085}', '\u{1086}'),
+ ('\u{108d}', '\u{108d}'),
+ ('\u{109d}', '\u{109d}'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '\u{1714}'),
+ ('\u{1732}', '\u{1733}'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17b5}'),
+ ('\u{17b7}', '\u{17bd}'),
+ ('\u{17c6}', '\u{17c6}'),
+ ('\u{17c9}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', '\u{1922}'),
+ ('\u{1927}', '\u{1928}'),
+ ('\u{1932}', '\u{1932}'),
+ ('\u{1939}', '\u{193b}'),
+ ('\u{1a17}', '\u{1a18}'),
+ ('\u{1a1b}', '\u{1a1b}'),
+ ('\u{1a56}', '\u{1a56}'),
+ ('\u{1a58}', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a60}'),
+ ('\u{1a62}', '\u{1a62}'),
+ ('\u{1a65}', '\u{1a6c}'),
+ ('\u{1a73}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1abd}'),
+ ('\u{1abf}', '\u{1ace}'),
+ ('\u{1b00}', '\u{1b03}'),
+ ('\u{1b34}', '\u{1b34}'),
+ ('\u{1b36}', '\u{1b3a}'),
+ ('\u{1b3c}', '\u{1b3c}'),
+ ('\u{1b42}', '\u{1b42}'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '\u{1b81}'),
+ ('\u{1ba2}', '\u{1ba5}'),
+ ('\u{1ba8}', '\u{1ba9}'),
+ ('\u{1bab}', '\u{1bad}'),
+ ('\u{1be6}', '\u{1be6}'),
+ ('\u{1be8}', '\u{1be9}'),
+ ('\u{1bed}', '\u{1bed}'),
+ ('\u{1bef}', '\u{1bf1}'),
+ ('\u{1c2c}', '\u{1c33}'),
+ ('\u{1c36}', '\u{1c37}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce0}'),
+ ('\u{1ce2}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{20d0}', '\u{20dc}'),
+ ('\u{20e1}', '\u{20e1}'),
+ ('\u{20e5}', '\u{20f0}'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{302a}', '\u{302d}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{a66f}', '\u{a66f}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('\u{a825}', '\u{a826}'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('\u{a8c4}', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '\u{a951}'),
+ ('\u{a980}', '\u{a982}'),
+ ('\u{a9b3}', '\u{a9b3}'),
+ ('\u{a9b6}', '\u{a9b9}'),
+ ('\u{a9bc}', '\u{a9bd}'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa2e}'),
+ ('\u{aa31}', '\u{aa32}'),
+ ('\u{aa35}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', '\u{aa4c}'),
+ ('\u{aa7c}', '\u{aa7c}'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('\u{aaec}', '\u{aaed}'),
+ ('\u{aaf6}', '\u{aaf6}'),
+ ('\u{abe5}', '\u{abe5}'),
+ ('\u{abe8}', '\u{abe8}'),
+ ('\u{abed}', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('\u{11001}', '\u{11001}'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '\u{11081}'),
+ ('\u{110b3}', '\u{110b6}'),
+ ('\u{110b9}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{1112b}'),
+ ('\u{1112d}', '\u{11134}'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '\u{11181}'),
+ ('\u{111b6}', '\u{111be}'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('\u{111cf}', '\u{111cf}'),
+ ('\u{1122f}', '\u{11231}'),
+ ('\u{11234}', '\u{11234}'),
+ ('\u{11236}', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112df}'),
+ ('\u{112e3}', '\u{112ea}'),
+ ('\u{11300}', '\u{11301}'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{11340}', '\u{11340}'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('\u{11438}', '\u{1143f}'),
+ ('\u{11442}', '\u{11444}'),
+ ('\u{11446}', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b3}', '\u{114b8}'),
+ ('\u{114ba}', '\u{114ba}'),
+ ('\u{114bf}', '\u{114c0}'),
+ ('\u{114c2}', '\u{114c3}'),
+ ('\u{115b2}', '\u{115b5}'),
+ ('\u{115bc}', '\u{115bd}'),
+ ('\u{115bf}', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('\u{11633}', '\u{1163a}'),
+ ('\u{1163d}', '\u{1163d}'),
+ ('\u{1163f}', '\u{11640}'),
+ ('\u{116ab}', '\u{116ab}'),
+ ('\u{116ad}', '\u{116ad}'),
+ ('\u{116b0}', '\u{116b5}'),
+ ('\u{116b7}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1171f}'),
+ ('\u{11722}', '\u{11725}'),
+ ('\u{11727}', '\u{1172b}'),
+ ('\u{1182f}', '\u{11837}'),
+ ('\u{11839}', '\u{1183a}'),
+ ('\u{1193b}', '\u{1193c}'),
+ ('\u{1193e}', '\u{1193e}'),
+ ('\u{11943}', '\u{11943}'),
+ ('\u{119d4}', '\u{119d7}'),
+ ('\u{119da}', '\u{119db}'),
+ ('\u{119e0}', '\u{119e0}'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '\u{11a38}'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a56}'),
+ ('\u{11a59}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a96}'),
+ ('\u{11a98}', '\u{11a99}'),
+ ('\u{11c30}', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3d}'),
+ ('\u{11c3f}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('\u{11caa}', '\u{11cb0}'),
+ ('\u{11cb2}', '\u{11cb3}'),
+ ('\u{11cb5}', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('\u{11d95}', '\u{11d95}'),
+ ('\u{11d97}', '\u{11d97}'),
+ ('\u{11ef3}', '\u{11ef4}'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('\u{11f36}', '\u{11f3a}'),
+ ('\u{11f40}', '\u{11f40}'),
+ ('\u{11f42}', '\u{11f42}'),
+ ('\u{13440}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e4ec}', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e94a}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const NUMBER: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('²', '³'),
+ ('¹', '¹'),
+ ('¼', '¾'),
+ ('٠', '٩'),
+ ('۰', '۹'),
+ ('߀', '߉'),
+ ('०', '९'),
+ ('০', '৯'),
+ ('৴', '৹'),
+ ('੦', '੯'),
+ ('૦', '૯'),
+ ('୦', '୯'),
+ ('୲', '୷'),
+ ('௦', '௲'),
+ ('౦', '౯'),
+ ('౸', '౾'),
+ ('೦', '೯'),
+ ('൘', '൞'),
+ ('൦', '൸'),
+ ('෦', '෯'),
+ ('๐', '๙'),
+ ('໐', '໙'),
+ ('༠', '༳'),
+ ('၀', '၉'),
+ ('႐', '႙'),
+ ('፩', '፼'),
+ ('ᛮ', 'ᛰ'),
+ ('០', '៩'),
+ ('៰', '៹'),
+ ('᠐', '᠙'),
+ ('᥆', '᥏'),
+ ('᧐', '᧚'),
+ ('᪀', '᪉'),
+ ('᪐', '᪙'),
+ ('᭐', '᭙'),
+ ('᮰', '᮹'),
+ ('᱀', '᱉'),
+ ('᱐', '᱙'),
+ ('⁰', '⁰'),
+ ('⁴', '⁹'),
+ ('₀', '₉'),
+ ('⅐', 'ↂ'),
+ ('ↅ', '↉'),
+ ('①', '⒛'),
+ ('⓪', '⓿'),
+ ('❶', '➓'),
+ ('⳽', '⳽'),
+ ('〇', '〇'),
+ ('〡', '〩'),
+ ('〸', '〺'),
+ ('㆒', '㆕'),
+ ('㈠', '㈩'),
+ ('㉈', '㉏'),
+ ('㉑', '㉟'),
+ ('㊀', '㊉'),
+ ('㊱', '㊿'),
+ ('꘠', '꘩'),
+ ('ꛦ', 'ꛯ'),
+ ('꠰', '꠵'),
+ ('꣐', '꣙'),
+ ('꤀', '꤉'),
+ ('꧐', '꧙'),
+ ('꧰', '꧹'),
+ ('꩐', '꩙'),
+ ('꯰', '꯹'),
+ ('0', '9'),
+ ('𐄇', '𐄳'),
+ ('𐅀', '𐅸'),
+ ('𐆊', '𐆋'),
+ ('𐋡', '𐋻'),
+ ('𐌠', '𐌣'),
+ ('𐍁', '𐍁'),
+ ('𐍊', '𐍊'),
+ ('𐏑', '𐏕'),
+ ('𐒠', '𐒩'),
+ ('𐡘', '𐡟'),
+ ('𐡹', '𐡿'),
+ ('𐢧', '𐢯'),
+ ('𐣻', '𐣿'),
+ ('𐤖', '𐤛'),
+ ('𐦼', '𐦽'),
+ ('𐧀', '𐧏'),
+ ('𐧒', '𐧿'),
+ ('𐩀', '𐩈'),
+ ('𐩽', '𐩾'),
+ ('𐪝', '𐪟'),
+ ('𐫫', '𐫯'),
+ ('𐭘', '𐭟'),
+ ('𐭸', '𐭿'),
+ ('𐮩', '𐮯'),
+ ('𐳺', '𐳿'),
+ ('𐴰', '𐴹'),
+ ('𐹠', '𐹾'),
+ ('𐼝', '𐼦'),
+ ('𐽑', '𐽔'),
+ ('𐿅', '𐿋'),
+ ('𑁒', '𑁯'),
+ ('𑃰', '𑃹'),
+ ('𑄶', '𑄿'),
+ ('𑇐', '𑇙'),
+ ('𑇡', '𑇴'),
+ ('𑋰', '𑋹'),
+ ('𑑐', '𑑙'),
+ ('𑓐', '𑓙'),
+ ('𑙐', '𑙙'),
+ ('𑛀', '𑛉'),
+ ('𑜰', '𑜻'),
+ ('𑣠', '𑣲'),
+ ('𑥐', '𑥙'),
+ ('𑱐', '𑱬'),
+ ('𑵐', '𑵙'),
+ ('𑶠', '𑶩'),
+ ('𑽐', '𑽙'),
+ ('𑿀', '𑿔'),
+ ('𒐀', '𒑮'),
+ ('𖩠', '𖩩'),
+ ('𖫀', '𖫉'),
+ ('𖭐', '𖭙'),
+ ('𖭛', '𖭡'),
+ ('𖺀', '𖺖'),
+ ('𝋀', '𝋓'),
+ ('𝋠', '𝋳'),
+ ('𝍠', '𝍸'),
+ ('𝟎', '𝟿'),
+ ('𞅀', '𞅉'),
+ ('𞋰', '𞋹'),
+ ('𞓰', '𞓹'),
+ ('𞣇', '𞣏'),
+ ('𞥐', '𞥙'),
+ ('𞱱', '𞲫'),
+ ('𞲭', '𞲯'),
+ ('𞲱', '𞲴'),
+ ('𞴁', '𞴭'),
+ ('𞴯', '𞴽'),
+ ('🄀', '🄌'),
+ ('🯰', '🯹'),
+];
+
+pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[
+ ('(', '('),
+ ('[', '['),
+ ('{', '{'),
+ ('༺', '༺'),
+ ('༼', '༼'),
+ ('᚛', '᚛'),
+ ('‚', '‚'),
+ ('„', '„'),
+ ('⁅', '⁅'),
+ ('⁽', '⁽'),
+ ('₍', '₍'),
+ ('⌈', '⌈'),
+ ('⌊', '⌊'),
+ ('〈', '〈'),
+ ('❨', '❨'),
+ ('❪', '❪'),
+ ('❬', '❬'),
+ ('❮', '❮'),
+ ('❰', '❰'),
+ ('❲', '❲'),
+ ('❴', '❴'),
+ ('⟅', '⟅'),
+ ('⟦', '⟦'),
+ ('⟨', '⟨'),
+ ('⟪', '⟪'),
+ ('⟬', '⟬'),
+ ('⟮', '⟮'),
+ ('⦃', '⦃'),
+ ('⦅', '⦅'),
+ ('⦇', '⦇'),
+ ('⦉', '⦉'),
+ ('⦋', '⦋'),
+ ('⦍', '⦍'),
+ ('⦏', '⦏'),
+ ('⦑', '⦑'),
+ ('⦓', '⦓'),
+ ('⦕', '⦕'),
+ ('⦗', '⦗'),
+ ('⧘', '⧘'),
+ ('⧚', '⧚'),
+ ('⧼', '⧼'),
+ ('⸢', '⸢'),
+ ('⸤', '⸤'),
+ ('⸦', '⸦'),
+ ('⸨', '⸨'),
+ ('⹂', '⹂'),
+ ('⹕', '⹕'),
+ ('⹗', '⹗'),
+ ('⹙', '⹙'),
+ ('⹛', '⹛'),
+ ('〈', '〈'),
+ ('《', '《'),
+ ('「', '「'),
+ ('『', '『'),
+ ('【', '【'),
+ ('〔', '〔'),
+ ('〖', '〖'),
+ ('〘', '〘'),
+ ('〚', '〚'),
+ ('〝', '〝'),
+ ('﴿', '﴿'),
+ ('︗', '︗'),
+ ('︵', '︵'),
+ ('︷', '︷'),
+ ('︹', '︹'),
+ ('︻', '︻'),
+ ('︽', '︽'),
+ ('︿', '︿'),
+ ('﹁', '﹁'),
+ ('﹃', '﹃'),
+ ('﹇', '﹇'),
+ ('﹙', '﹙'),
+ ('﹛', '﹛'),
+ ('﹝', '﹝'),
+ ('(', '('),
+ ('[', '['),
+ ('{', '{'),
+ ('⦅', '⦅'),
+ ('「', '「'),
+];
+
+pub const OTHER: &'static [(char, char)] = &[
+ ('\0', '\u{1f}'),
+ ('\u{7f}', '\u{9f}'),
+ ('\u{ad}', '\u{ad}'),
+ ('\u{378}', '\u{379}'),
+ ('\u{380}', '\u{383}'),
+ ('\u{38b}', '\u{38b}'),
+ ('\u{38d}', '\u{38d}'),
+ ('\u{3a2}', '\u{3a2}'),
+ ('\u{530}', '\u{530}'),
+ ('\u{557}', '\u{558}'),
+ ('\u{58b}', '\u{58c}'),
+ ('\u{590}', '\u{590}'),
+ ('\u{5c8}', '\u{5cf}'),
+ ('\u{5eb}', '\u{5ee}'),
+ ('\u{5f5}', '\u{605}'),
+ ('\u{61c}', '\u{61c}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{70e}', '\u{70f}'),
+ ('\u{74b}', '\u{74c}'),
+ ('\u{7b2}', '\u{7bf}'),
+ ('\u{7fb}', '\u{7fc}'),
+ ('\u{82e}', '\u{82f}'),
+ ('\u{83f}', '\u{83f}'),
+ ('\u{85c}', '\u{85d}'),
+ ('\u{85f}', '\u{85f}'),
+ ('\u{86b}', '\u{86f}'),
+ ('\u{88f}', '\u{897}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('\u{984}', '\u{984}'),
+ ('\u{98d}', '\u{98e}'),
+ ('\u{991}', '\u{992}'),
+ ('\u{9a9}', '\u{9a9}'),
+ ('\u{9b1}', '\u{9b1}'),
+ ('\u{9b3}', '\u{9b5}'),
+ ('\u{9ba}', '\u{9bb}'),
+ ('\u{9c5}', '\u{9c6}'),
+ ('\u{9c9}', '\u{9ca}'),
+ ('\u{9cf}', '\u{9d6}'),
+ ('\u{9d8}', '\u{9db}'),
+ ('\u{9de}', '\u{9de}'),
+ ('\u{9e4}', '\u{9e5}'),
+ ('\u{9ff}', '\u{a00}'),
+ ('\u{a04}', '\u{a04}'),
+ ('\u{a0b}', '\u{a0e}'),
+ ('\u{a11}', '\u{a12}'),
+ ('\u{a29}', '\u{a29}'),
+ ('\u{a31}', '\u{a31}'),
+ ('\u{a34}', '\u{a34}'),
+ ('\u{a37}', '\u{a37}'),
+ ('\u{a3a}', '\u{a3b}'),
+ ('\u{a3d}', '\u{a3d}'),
+ ('\u{a43}', '\u{a46}'),
+ ('\u{a49}', '\u{a4a}'),
+ ('\u{a4e}', '\u{a50}'),
+ ('\u{a52}', '\u{a58}'),
+ ('\u{a5d}', '\u{a5d}'),
+ ('\u{a5f}', '\u{a65}'),
+ ('\u{a77}', '\u{a80}'),
+ ('\u{a84}', '\u{a84}'),
+ ('\u{a8e}', '\u{a8e}'),
+ ('\u{a92}', '\u{a92}'),
+ ('\u{aa9}', '\u{aa9}'),
+ ('\u{ab1}', '\u{ab1}'),
+ ('\u{ab4}', '\u{ab4}'),
+ ('\u{aba}', '\u{abb}'),
+ ('\u{ac6}', '\u{ac6}'),
+ ('\u{aca}', '\u{aca}'),
+ ('\u{ace}', '\u{acf}'),
+ ('\u{ad1}', '\u{adf}'),
+ ('\u{ae4}', '\u{ae5}'),
+ ('\u{af2}', '\u{af8}'),
+ ('\u{b00}', '\u{b00}'),
+ ('\u{b04}', '\u{b04}'),
+ ('\u{b0d}', '\u{b0e}'),
+ ('\u{b11}', '\u{b12}'),
+ ('\u{b29}', '\u{b29}'),
+ ('\u{b31}', '\u{b31}'),
+ ('\u{b34}', '\u{b34}'),
+ ('\u{b3a}', '\u{b3b}'),
+ ('\u{b45}', '\u{b46}'),
+ ('\u{b49}', '\u{b4a}'),
+ ('\u{b4e}', '\u{b54}'),
+ ('\u{b58}', '\u{b5b}'),
+ ('\u{b5e}', '\u{b5e}'),
+ ('\u{b64}', '\u{b65}'),
+ ('\u{b78}', '\u{b81}'),
+ ('\u{b84}', '\u{b84}'),
+ ('\u{b8b}', '\u{b8d}'),
+ ('\u{b91}', '\u{b91}'),
+ ('\u{b96}', '\u{b98}'),
+ ('\u{b9b}', '\u{b9b}'),
+ ('\u{b9d}', '\u{b9d}'),
+ ('\u{ba0}', '\u{ba2}'),
+ ('\u{ba5}', '\u{ba7}'),
+ ('\u{bab}', '\u{bad}'),
+ ('\u{bba}', '\u{bbd}'),
+ ('\u{bc3}', '\u{bc5}'),
+ ('\u{bc9}', '\u{bc9}'),
+ ('\u{bce}', '\u{bcf}'),
+ ('\u{bd1}', '\u{bd6}'),
+ ('\u{bd8}', '\u{be5}'),
+ ('\u{bfb}', '\u{bff}'),
+ ('\u{c0d}', '\u{c0d}'),
+ ('\u{c11}', '\u{c11}'),
+ ('\u{c29}', '\u{c29}'),
+ ('\u{c3a}', '\u{c3b}'),
+ ('\u{c45}', '\u{c45}'),
+ ('\u{c49}', '\u{c49}'),
+ ('\u{c4e}', '\u{c54}'),
+ ('\u{c57}', '\u{c57}'),
+ ('\u{c5b}', '\u{c5c}'),
+ ('\u{c5e}', '\u{c5f}'),
+ ('\u{c64}', '\u{c65}'),
+ ('\u{c70}', '\u{c76}'),
+ ('\u{c8d}', '\u{c8d}'),
+ ('\u{c91}', '\u{c91}'),
+ ('\u{ca9}', '\u{ca9}'),
+ ('\u{cb4}', '\u{cb4}'),
+ ('\u{cba}', '\u{cbb}'),
+ ('\u{cc5}', '\u{cc5}'),
+ ('\u{cc9}', '\u{cc9}'),
+ ('\u{cce}', '\u{cd4}'),
+ ('\u{cd7}', '\u{cdc}'),
+ ('\u{cdf}', '\u{cdf}'),
+ ('\u{ce4}', '\u{ce5}'),
+ ('\u{cf0}', '\u{cf0}'),
+ ('\u{cf4}', '\u{cff}'),
+ ('\u{d0d}', '\u{d0d}'),
+ ('\u{d11}', '\u{d11}'),
+ ('\u{d45}', '\u{d45}'),
+ ('\u{d49}', '\u{d49}'),
+ ('\u{d50}', '\u{d53}'),
+ ('\u{d64}', '\u{d65}'),
+ ('\u{d80}', '\u{d80}'),
+ ('\u{d84}', '\u{d84}'),
+ ('\u{d97}', '\u{d99}'),
+ ('\u{db2}', '\u{db2}'),
+ ('\u{dbc}', '\u{dbc}'),
+ ('\u{dbe}', '\u{dbf}'),
+ ('\u{dc7}', '\u{dc9}'),
+ ('\u{dcb}', '\u{dce}'),
+ ('\u{dd5}', '\u{dd5}'),
+ ('\u{dd7}', '\u{dd7}'),
+ ('\u{de0}', '\u{de5}'),
+ ('\u{df0}', '\u{df1}'),
+ ('\u{df5}', '\u{e00}'),
+ ('\u{e3b}', '\u{e3e}'),
+ ('\u{e5c}', '\u{e80}'),
+ ('\u{e83}', '\u{e83}'),
+ ('\u{e85}', '\u{e85}'),
+ ('\u{e8b}', '\u{e8b}'),
+ ('\u{ea4}', '\u{ea4}'),
+ ('\u{ea6}', '\u{ea6}'),
+ ('\u{ebe}', '\u{ebf}'),
+ ('\u{ec5}', '\u{ec5}'),
+ ('\u{ec7}', '\u{ec7}'),
+ ('\u{ecf}', '\u{ecf}'),
+ ('\u{eda}', '\u{edb}'),
+ ('\u{ee0}', '\u{eff}'),
+ ('\u{f48}', '\u{f48}'),
+ ('\u{f6d}', '\u{f70}'),
+ ('\u{f98}', '\u{f98}'),
+ ('\u{fbd}', '\u{fbd}'),
+ ('\u{fcd}', '\u{fcd}'),
+ ('\u{fdb}', '\u{fff}'),
+ ('\u{10c6}', '\u{10c6}'),
+ ('\u{10c8}', '\u{10cc}'),
+ ('\u{10ce}', '\u{10cf}'),
+ ('\u{1249}', '\u{1249}'),
+ ('\u{124e}', '\u{124f}'),
+ ('\u{1257}', '\u{1257}'),
+ ('\u{1259}', '\u{1259}'),
+ ('\u{125e}', '\u{125f}'),
+ ('\u{1289}', '\u{1289}'),
+ ('\u{128e}', '\u{128f}'),
+ ('\u{12b1}', '\u{12b1}'),
+ ('\u{12b6}', '\u{12b7}'),
+ ('\u{12bf}', '\u{12bf}'),
+ ('\u{12c1}', '\u{12c1}'),
+ ('\u{12c6}', '\u{12c7}'),
+ ('\u{12d7}', '\u{12d7}'),
+ ('\u{1311}', '\u{1311}'),
+ ('\u{1316}', '\u{1317}'),
+ ('\u{135b}', '\u{135c}'),
+ ('\u{137d}', '\u{137f}'),
+ ('\u{139a}', '\u{139f}'),
+ ('\u{13f6}', '\u{13f7}'),
+ ('\u{13fe}', '\u{13ff}'),
+ ('\u{169d}', '\u{169f}'),
+ ('\u{16f9}', '\u{16ff}'),
+ ('\u{1716}', '\u{171e}'),
+ ('\u{1737}', '\u{173f}'),
+ ('\u{1754}', '\u{175f}'),
+ ('\u{176d}', '\u{176d}'),
+ ('\u{1771}', '\u{1771}'),
+ ('\u{1774}', '\u{177f}'),
+ ('\u{17de}', '\u{17df}'),
+ ('\u{17ea}', '\u{17ef}'),
+ ('\u{17fa}', '\u{17ff}'),
+ ('\u{180e}', '\u{180e}'),
+ ('\u{181a}', '\u{181f}'),
+ ('\u{1879}', '\u{187f}'),
+ ('\u{18ab}', '\u{18af}'),
+ ('\u{18f6}', '\u{18ff}'),
+ ('\u{191f}', '\u{191f}'),
+ ('\u{192c}', '\u{192f}'),
+ ('\u{193c}', '\u{193f}'),
+ ('\u{1941}', '\u{1943}'),
+ ('\u{196e}', '\u{196f}'),
+ ('\u{1975}', '\u{197f}'),
+ ('\u{19ac}', '\u{19af}'),
+ ('\u{19ca}', '\u{19cf}'),
+ ('\u{19db}', '\u{19dd}'),
+ ('\u{1a1c}', '\u{1a1d}'),
+ ('\u{1a5f}', '\u{1a5f}'),
+ ('\u{1a7d}', '\u{1a7e}'),
+ ('\u{1a8a}', '\u{1a8f}'),
+ ('\u{1a9a}', '\u{1a9f}'),
+ ('\u{1aae}', '\u{1aaf}'),
+ ('\u{1acf}', '\u{1aff}'),
+ ('\u{1b4d}', '\u{1b4f}'),
+ ('\u{1b7f}', '\u{1b7f}'),
+ ('\u{1bf4}', '\u{1bfb}'),
+ ('\u{1c38}', '\u{1c3a}'),
+ ('\u{1c4a}', '\u{1c4c}'),
+ ('\u{1c89}', '\u{1c8f}'),
+ ('\u{1cbb}', '\u{1cbc}'),
+ ('\u{1cc8}', '\u{1ccf}'),
+ ('\u{1cfb}', '\u{1cff}'),
+ ('\u{1f16}', '\u{1f17}'),
+ ('\u{1f1e}', '\u{1f1f}'),
+ ('\u{1f46}', '\u{1f47}'),
+ ('\u{1f4e}', '\u{1f4f}'),
+ ('\u{1f58}', '\u{1f58}'),
+ ('\u{1f5a}', '\u{1f5a}'),
+ ('\u{1f5c}', '\u{1f5c}'),
+ ('\u{1f5e}', '\u{1f5e}'),
+ ('\u{1f7e}', '\u{1f7f}'),
+ ('\u{1fb5}', '\u{1fb5}'),
+ ('\u{1fc5}', '\u{1fc5}'),
+ ('\u{1fd4}', '\u{1fd5}'),
+ ('\u{1fdc}', '\u{1fdc}'),
+ ('\u{1ff0}', '\u{1ff1}'),
+ ('\u{1ff5}', '\u{1ff5}'),
+ ('\u{1fff}', '\u{1fff}'),
+ ('\u{200b}', '\u{200f}'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2060}', '\u{206f}'),
+ ('\u{2072}', '\u{2073}'),
+ ('\u{208f}', '\u{208f}'),
+ ('\u{209d}', '\u{209f}'),
+ ('\u{20c1}', '\u{20cf}'),
+ ('\u{20f1}', '\u{20ff}'),
+ ('\u{218c}', '\u{218f}'),
+ ('\u{2427}', '\u{243f}'),
+ ('\u{244b}', '\u{245f}'),
+ ('\u{2b74}', '\u{2b75}'),
+ ('\u{2b96}', '\u{2b96}'),
+ ('\u{2cf4}', '\u{2cf8}'),
+ ('\u{2d26}', '\u{2d26}'),
+ ('\u{2d28}', '\u{2d2c}'),
+ ('\u{2d2e}', '\u{2d2f}'),
+ ('\u{2d68}', '\u{2d6e}'),
+ ('\u{2d71}', '\u{2d7e}'),
+ ('\u{2d97}', '\u{2d9f}'),
+ ('\u{2da7}', '\u{2da7}'),
+ ('\u{2daf}', '\u{2daf}'),
+ ('\u{2db7}', '\u{2db7}'),
+ ('\u{2dbf}', '\u{2dbf}'),
+ ('\u{2dc7}', '\u{2dc7}'),
+ ('\u{2dcf}', '\u{2dcf}'),
+ ('\u{2dd7}', '\u{2dd7}'),
+ ('\u{2ddf}', '\u{2ddf}'),
+ ('\u{2e5e}', '\u{2e7f}'),
+ ('\u{2e9a}', '\u{2e9a}'),
+ ('\u{2ef4}', '\u{2eff}'),
+ ('\u{2fd6}', '\u{2fef}'),
+ ('\u{2ffc}', '\u{2fff}'),
+ ('\u{3040}', '\u{3040}'),
+ ('\u{3097}', '\u{3098}'),
+ ('\u{3100}', '\u{3104}'),
+ ('\u{3130}', '\u{3130}'),
+ ('\u{318f}', '\u{318f}'),
+ ('\u{31e4}', '\u{31ef}'),
+ ('\u{321f}', '\u{321f}'),
+ ('\u{a48d}', '\u{a48f}'),
+ ('\u{a4c7}', '\u{a4cf}'),
+ ('\u{a62c}', '\u{a63f}'),
+ ('\u{a6f8}', '\u{a6ff}'),
+ ('\u{a7cb}', '\u{a7cf}'),
+ ('\u{a7d2}', '\u{a7d2}'),
+ ('\u{a7d4}', '\u{a7d4}'),
+ ('\u{a7da}', '\u{a7f1}'),
+ ('\u{a82d}', '\u{a82f}'),
+ ('\u{a83a}', '\u{a83f}'),
+ ('\u{a878}', '\u{a87f}'),
+ ('\u{a8c6}', '\u{a8cd}'),
+ ('\u{a8da}', '\u{a8df}'),
+ ('\u{a954}', '\u{a95e}'),
+ ('\u{a97d}', '\u{a97f}'),
+ ('\u{a9ce}', '\u{a9ce}'),
+ ('\u{a9da}', '\u{a9dd}'),
+ ('\u{a9ff}', '\u{a9ff}'),
+ ('\u{aa37}', '\u{aa3f}'),
+ ('\u{aa4e}', '\u{aa4f}'),
+ ('\u{aa5a}', '\u{aa5b}'),
+ ('\u{aac3}', '\u{aada}'),
+ ('\u{aaf7}', '\u{ab00}'),
+ ('\u{ab07}', '\u{ab08}'),
+ ('\u{ab0f}', '\u{ab10}'),
+ ('\u{ab17}', '\u{ab1f}'),
+ ('\u{ab27}', '\u{ab27}'),
+ ('\u{ab2f}', '\u{ab2f}'),
+ ('\u{ab6c}', '\u{ab6f}'),
+ ('\u{abee}', '\u{abef}'),
+ ('\u{abfa}', '\u{abff}'),
+ ('\u{d7a4}', '\u{d7af}'),
+ ('\u{d7c7}', '\u{d7ca}'),
+ ('\u{d7fc}', '\u{f8ff}'),
+ ('\u{fa6e}', '\u{fa6f}'),
+ ('\u{fada}', '\u{faff}'),
+ ('\u{fb07}', '\u{fb12}'),
+ ('\u{fb18}', '\u{fb1c}'),
+ ('\u{fb37}', '\u{fb37}'),
+ ('\u{fb3d}', '\u{fb3d}'),
+ ('\u{fb3f}', '\u{fb3f}'),
+ ('\u{fb42}', '\u{fb42}'),
+ ('\u{fb45}', '\u{fb45}'),
+ ('\u{fbc3}', '\u{fbd2}'),
+ ('\u{fd90}', '\u{fd91}'),
+ ('\u{fdc8}', '\u{fdce}'),
+ ('\u{fdd0}', '\u{fdef}'),
+ ('\u{fe1a}', '\u{fe1f}'),
+ ('\u{fe53}', '\u{fe53}'),
+ ('\u{fe67}', '\u{fe67}'),
+ ('\u{fe6c}', '\u{fe6f}'),
+ ('\u{fe75}', '\u{fe75}'),
+ ('\u{fefd}', '\u{ff00}'),
+ ('\u{ffbf}', '\u{ffc1}'),
+ ('\u{ffc8}', '\u{ffc9}'),
+ ('\u{ffd0}', '\u{ffd1}'),
+ ('\u{ffd8}', '\u{ffd9}'),
+ ('\u{ffdd}', '\u{ffdf}'),
+ ('\u{ffe7}', '\u{ffe7}'),
+ ('\u{ffef}', '\u{fffb}'),
+ ('\u{fffe}', '\u{ffff}'),
+ ('\u{1000c}', '\u{1000c}'),
+ ('\u{10027}', '\u{10027}'),
+ ('\u{1003b}', '\u{1003b}'),
+ ('\u{1003e}', '\u{1003e}'),
+ ('\u{1004e}', '\u{1004f}'),
+ ('\u{1005e}', '\u{1007f}'),
+ ('\u{100fb}', '\u{100ff}'),
+ ('\u{10103}', '\u{10106}'),
+ ('\u{10134}', '\u{10136}'),
+ ('\u{1018f}', '\u{1018f}'),
+ ('\u{1019d}', '\u{1019f}'),
+ ('\u{101a1}', '\u{101cf}'),
+ ('\u{101fe}', '\u{1027f}'),
+ ('\u{1029d}', '\u{1029f}'),
+ ('\u{102d1}', '\u{102df}'),
+ ('\u{102fc}', '\u{102ff}'),
+ ('\u{10324}', '\u{1032c}'),
+ ('\u{1034b}', '\u{1034f}'),
+ ('\u{1037b}', '\u{1037f}'),
+ ('\u{1039e}', '\u{1039e}'),
+ ('\u{103c4}', '\u{103c7}'),
+ ('\u{103d6}', '\u{103ff}'),
+ ('\u{1049e}', '\u{1049f}'),
+ ('\u{104aa}', '\u{104af}'),
+ ('\u{104d4}', '\u{104d7}'),
+ ('\u{104fc}', '\u{104ff}'),
+ ('\u{10528}', '\u{1052f}'),
+ ('\u{10564}', '\u{1056e}'),
+ ('\u{1057b}', '\u{1057b}'),
+ ('\u{1058b}', '\u{1058b}'),
+ ('\u{10593}', '\u{10593}'),
+ ('\u{10596}', '\u{10596}'),
+ ('\u{105a2}', '\u{105a2}'),
+ ('\u{105b2}', '\u{105b2}'),
+ ('\u{105ba}', '\u{105ba}'),
+ ('\u{105bd}', '\u{105ff}'),
+ ('\u{10737}', '\u{1073f}'),
+ ('\u{10756}', '\u{1075f}'),
+ ('\u{10768}', '\u{1077f}'),
+ ('\u{10786}', '\u{10786}'),
+ ('\u{107b1}', '\u{107b1}'),
+ ('\u{107bb}', '\u{107ff}'),
+ ('\u{10806}', '\u{10807}'),
+ ('\u{10809}', '\u{10809}'),
+ ('\u{10836}', '\u{10836}'),
+ ('\u{10839}', '\u{1083b}'),
+ ('\u{1083d}', '\u{1083e}'),
+ ('\u{10856}', '\u{10856}'),
+ ('\u{1089f}', '\u{108a6}'),
+ ('\u{108b0}', '\u{108df}'),
+ ('\u{108f3}', '\u{108f3}'),
+ ('\u{108f6}', '\u{108fa}'),
+ ('\u{1091c}', '\u{1091e}'),
+ ('\u{1093a}', '\u{1093e}'),
+ ('\u{10940}', '\u{1097f}'),
+ ('\u{109b8}', '\u{109bb}'),
+ ('\u{109d0}', '\u{109d1}'),
+ ('\u{10a04}', '\u{10a04}'),
+ ('\u{10a07}', '\u{10a0b}'),
+ ('\u{10a14}', '\u{10a14}'),
+ ('\u{10a18}', '\u{10a18}'),
+ ('\u{10a36}', '\u{10a37}'),
+ ('\u{10a3b}', '\u{10a3e}'),
+ ('\u{10a49}', '\u{10a4f}'),
+ ('\u{10a59}', '\u{10a5f}'),
+ ('\u{10aa0}', '\u{10abf}'),
+ ('\u{10ae7}', '\u{10aea}'),
+ ('\u{10af7}', '\u{10aff}'),
+ ('\u{10b36}', '\u{10b38}'),
+ ('\u{10b56}', '\u{10b57}'),
+ ('\u{10b73}', '\u{10b77}'),
+ ('\u{10b92}', '\u{10b98}'),
+ ('\u{10b9d}', '\u{10ba8}'),
+ ('\u{10bb0}', '\u{10bff}'),
+ ('\u{10c49}', '\u{10c7f}'),
+ ('\u{10cb3}', '\u{10cbf}'),
+ ('\u{10cf3}', '\u{10cf9}'),
+ ('\u{10d28}', '\u{10d2f}'),
+ ('\u{10d3a}', '\u{10e5f}'),
+ ('\u{10e7f}', '\u{10e7f}'),
+ ('\u{10eaa}', '\u{10eaa}'),
+ ('\u{10eae}', '\u{10eaf}'),
+ ('\u{10eb2}', '\u{10efc}'),
+ ('\u{10f28}', '\u{10f2f}'),
+ ('\u{10f5a}', '\u{10f6f}'),
+ ('\u{10f8a}', '\u{10faf}'),
+ ('\u{10fcc}', '\u{10fdf}'),
+ ('\u{10ff7}', '\u{10fff}'),
+ ('\u{1104e}', '\u{11051}'),
+ ('\u{11076}', '\u{1107e}'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110c3}', '\u{110cf}'),
+ ('\u{110e9}', '\u{110ef}'),
+ ('\u{110fa}', '\u{110ff}'),
+ ('\u{11135}', '\u{11135}'),
+ ('\u{11148}', '\u{1114f}'),
+ ('\u{11177}', '\u{1117f}'),
+ ('\u{111e0}', '\u{111e0}'),
+ ('\u{111f5}', '\u{111ff}'),
+ ('\u{11212}', '\u{11212}'),
+ ('\u{11242}', '\u{1127f}'),
+ ('\u{11287}', '\u{11287}'),
+ ('\u{11289}', '\u{11289}'),
+ ('\u{1128e}', '\u{1128e}'),
+ ('\u{1129e}', '\u{1129e}'),
+ ('\u{112aa}', '\u{112af}'),
+ ('\u{112eb}', '\u{112ef}'),
+ ('\u{112fa}', '\u{112ff}'),
+ ('\u{11304}', '\u{11304}'),
+ ('\u{1130d}', '\u{1130e}'),
+ ('\u{11311}', '\u{11312}'),
+ ('\u{11329}', '\u{11329}'),
+ ('\u{11331}', '\u{11331}'),
+ ('\u{11334}', '\u{11334}'),
+ ('\u{1133a}', '\u{1133a}'),
+ ('\u{11345}', '\u{11346}'),
+ ('\u{11349}', '\u{1134a}'),
+ ('\u{1134e}', '\u{1134f}'),
+ ('\u{11351}', '\u{11356}'),
+ ('\u{11358}', '\u{1135c}'),
+ ('\u{11364}', '\u{11365}'),
+ ('\u{1136d}', '\u{1136f}'),
+ ('\u{11375}', '\u{113ff}'),
+ ('\u{1145c}', '\u{1145c}'),
+ ('\u{11462}', '\u{1147f}'),
+ ('\u{114c8}', '\u{114cf}'),
+ ('\u{114da}', '\u{1157f}'),
+ ('\u{115b6}', '\u{115b7}'),
+ ('\u{115de}', '\u{115ff}'),
+ ('\u{11645}', '\u{1164f}'),
+ ('\u{1165a}', '\u{1165f}'),
+ ('\u{1166d}', '\u{1167f}'),
+ ('\u{116ba}', '\u{116bf}'),
+ ('\u{116ca}', '\u{116ff}'),
+ ('\u{1171b}', '\u{1171c}'),
+ ('\u{1172c}', '\u{1172f}'),
+ ('\u{11747}', '\u{117ff}'),
+ ('\u{1183c}', '\u{1189f}'),
+ ('\u{118f3}', '\u{118fe}'),
+ ('\u{11907}', '\u{11908}'),
+ ('\u{1190a}', '\u{1190b}'),
+ ('\u{11914}', '\u{11914}'),
+ ('\u{11917}', '\u{11917}'),
+ ('\u{11936}', '\u{11936}'),
+ ('\u{11939}', '\u{1193a}'),
+ ('\u{11947}', '\u{1194f}'),
+ ('\u{1195a}', '\u{1199f}'),
+ ('\u{119a8}', '\u{119a9}'),
+ ('\u{119d8}', '\u{119d9}'),
+ ('\u{119e5}', '\u{119ff}'),
+ ('\u{11a48}', '\u{11a4f}'),
+ ('\u{11aa3}', '\u{11aaf}'),
+ ('\u{11af9}', '\u{11aff}'),
+ ('\u{11b0a}', '\u{11bff}'),
+ ('\u{11c09}', '\u{11c09}'),
+ ('\u{11c37}', '\u{11c37}'),
+ ('\u{11c46}', '\u{11c4f}'),
+ ('\u{11c6d}', '\u{11c6f}'),
+ ('\u{11c90}', '\u{11c91}'),
+ ('\u{11ca8}', '\u{11ca8}'),
+ ('\u{11cb7}', '\u{11cff}'),
+ ('\u{11d07}', '\u{11d07}'),
+ ('\u{11d0a}', '\u{11d0a}'),
+ ('\u{11d37}', '\u{11d39}'),
+ ('\u{11d3b}', '\u{11d3b}'),
+ ('\u{11d3e}', '\u{11d3e}'),
+ ('\u{11d48}', '\u{11d4f}'),
+ ('\u{11d5a}', '\u{11d5f}'),
+ ('\u{11d66}', '\u{11d66}'),
+ ('\u{11d69}', '\u{11d69}'),
+ ('\u{11d8f}', '\u{11d8f}'),
+ ('\u{11d92}', '\u{11d92}'),
+ ('\u{11d99}', '\u{11d9f}'),
+ ('\u{11daa}', '\u{11edf}'),
+ ('\u{11ef9}', '\u{11eff}'),
+ ('\u{11f11}', '\u{11f11}'),
+ ('\u{11f3b}', '\u{11f3d}'),
+ ('\u{11f5a}', '\u{11faf}'),
+ ('\u{11fb1}', '\u{11fbf}'),
+ ('\u{11ff2}', '\u{11ffe}'),
+ ('\u{1239a}', '\u{123ff}'),
+ ('\u{1246f}', '\u{1246f}'),
+ ('\u{12475}', '\u{1247f}'),
+ ('\u{12544}', '\u{12f8f}'),
+ ('\u{12ff3}', '\u{12fff}'),
+ ('\u{13430}', '\u{1343f}'),
+ ('\u{13456}', '\u{143ff}'),
+ ('\u{14647}', '\u{167ff}'),
+ ('\u{16a39}', '\u{16a3f}'),
+ ('\u{16a5f}', '\u{16a5f}'),
+ ('\u{16a6a}', '\u{16a6d}'),
+ ('\u{16abf}', '\u{16abf}'),
+ ('\u{16aca}', '\u{16acf}'),
+ ('\u{16aee}', '\u{16aef}'),
+ ('\u{16af6}', '\u{16aff}'),
+ ('\u{16b46}', '\u{16b4f}'),
+ ('\u{16b5a}', '\u{16b5a}'),
+ ('\u{16b62}', '\u{16b62}'),
+ ('\u{16b78}', '\u{16b7c}'),
+ ('\u{16b90}', '\u{16e3f}'),
+ ('\u{16e9b}', '\u{16eff}'),
+ ('\u{16f4b}', '\u{16f4e}'),
+ ('\u{16f88}', '\u{16f8e}'),
+ ('\u{16fa0}', '\u{16fdf}'),
+ ('\u{16fe5}', '\u{16fef}'),
+ ('\u{16ff2}', '\u{16fff}'),
+ ('\u{187f8}', '\u{187ff}'),
+ ('\u{18cd6}', '\u{18cff}'),
+ ('\u{18d09}', '\u{1afef}'),
+ ('\u{1aff4}', '\u{1aff4}'),
+ ('\u{1affc}', '\u{1affc}'),
+ ('\u{1afff}', '\u{1afff}'),
+ ('\u{1b123}', '\u{1b131}'),
+ ('\u{1b133}', '\u{1b14f}'),
+ ('\u{1b153}', '\u{1b154}'),
+ ('\u{1b156}', '\u{1b163}'),
+ ('\u{1b168}', '\u{1b16f}'),
+ ('\u{1b2fc}', '\u{1bbff}'),
+ ('\u{1bc6b}', '\u{1bc6f}'),
+ ('\u{1bc7d}', '\u{1bc7f}'),
+ ('\u{1bc89}', '\u{1bc8f}'),
+ ('\u{1bc9a}', '\u{1bc9b}'),
+ ('\u{1bca0}', '\u{1ceff}'),
+ ('\u{1cf2e}', '\u{1cf2f}'),
+ ('\u{1cf47}', '\u{1cf4f}'),
+ ('\u{1cfc4}', '\u{1cfff}'),
+ ('\u{1d0f6}', '\u{1d0ff}'),
+ ('\u{1d127}', '\u{1d128}'),
+ ('\u{1d173}', '\u{1d17a}'),
+ ('\u{1d1eb}', '\u{1d1ff}'),
+ ('\u{1d246}', '\u{1d2bf}'),
+ ('\u{1d2d4}', '\u{1d2df}'),
+ ('\u{1d2f4}', '\u{1d2ff}'),
+ ('\u{1d357}', '\u{1d35f}'),
+ ('\u{1d379}', '\u{1d3ff}'),
+ ('\u{1d455}', '\u{1d455}'),
+ ('\u{1d49d}', '\u{1d49d}'),
+ ('\u{1d4a0}', '\u{1d4a1}'),
+ ('\u{1d4a3}', '\u{1d4a4}'),
+ ('\u{1d4a7}', '\u{1d4a8}'),
+ ('\u{1d4ad}', '\u{1d4ad}'),
+ ('\u{1d4ba}', '\u{1d4ba}'),
+ ('\u{1d4bc}', '\u{1d4bc}'),
+ ('\u{1d4c4}', '\u{1d4c4}'),
+ ('\u{1d506}', '\u{1d506}'),
+ ('\u{1d50b}', '\u{1d50c}'),
+ ('\u{1d515}', '\u{1d515}'),
+ ('\u{1d51d}', '\u{1d51d}'),
+ ('\u{1d53a}', '\u{1d53a}'),
+ ('\u{1d53f}', '\u{1d53f}'),
+ ('\u{1d545}', '\u{1d545}'),
+ ('\u{1d547}', '\u{1d549}'),
+ ('\u{1d551}', '\u{1d551}'),
+ ('\u{1d6a6}', '\u{1d6a7}'),
+ ('\u{1d7cc}', '\u{1d7cd}'),
+ ('\u{1da8c}', '\u{1da9a}'),
+ ('\u{1daa0}', '\u{1daa0}'),
+ ('\u{1dab0}', '\u{1deff}'),
+ ('\u{1df1f}', '\u{1df24}'),
+ ('\u{1df2b}', '\u{1dfff}'),
+ ('\u{1e007}', '\u{1e007}'),
+ ('\u{1e019}', '\u{1e01a}'),
+ ('\u{1e022}', '\u{1e022}'),
+ ('\u{1e025}', '\u{1e025}'),
+ ('\u{1e02b}', '\u{1e02f}'),
+ ('\u{1e06e}', '\u{1e08e}'),
+ ('\u{1e090}', '\u{1e0ff}'),
+ ('\u{1e12d}', '\u{1e12f}'),
+ ('\u{1e13e}', '\u{1e13f}'),
+ ('\u{1e14a}', '\u{1e14d}'),
+ ('\u{1e150}', '\u{1e28f}'),
+ ('\u{1e2af}', '\u{1e2bf}'),
+ ('\u{1e2fa}', '\u{1e2fe}'),
+ ('\u{1e300}', '\u{1e4cf}'),
+ ('\u{1e4fa}', '\u{1e7df}'),
+ ('\u{1e7e7}', '\u{1e7e7}'),
+ ('\u{1e7ec}', '\u{1e7ec}'),
+ ('\u{1e7ef}', '\u{1e7ef}'),
+ ('\u{1e7ff}', '\u{1e7ff}'),
+ ('\u{1e8c5}', '\u{1e8c6}'),
+ ('\u{1e8d7}', '\u{1e8ff}'),
+ ('\u{1e94c}', '\u{1e94f}'),
+ ('\u{1e95a}', '\u{1e95d}'),
+ ('\u{1e960}', '\u{1ec70}'),
+ ('\u{1ecb5}', '\u{1ed00}'),
+ ('\u{1ed3e}', '\u{1edff}'),
+ ('\u{1ee04}', '\u{1ee04}'),
+ ('\u{1ee20}', '\u{1ee20}'),
+ ('\u{1ee23}', '\u{1ee23}'),
+ ('\u{1ee25}', '\u{1ee26}'),
+ ('\u{1ee28}', '\u{1ee28}'),
+ ('\u{1ee33}', '\u{1ee33}'),
+ ('\u{1ee38}', '\u{1ee38}'),
+ ('\u{1ee3a}', '\u{1ee3a}'),
+ ('\u{1ee3c}', '\u{1ee41}'),
+ ('\u{1ee43}', '\u{1ee46}'),
+ ('\u{1ee48}', '\u{1ee48}'),
+ ('\u{1ee4a}', '\u{1ee4a}'),
+ ('\u{1ee4c}', '\u{1ee4c}'),
+ ('\u{1ee50}', '\u{1ee50}'),
+ ('\u{1ee53}', '\u{1ee53}'),
+ ('\u{1ee55}', '\u{1ee56}'),
+ ('\u{1ee58}', '\u{1ee58}'),
+ ('\u{1ee5a}', '\u{1ee5a}'),
+ ('\u{1ee5c}', '\u{1ee5c}'),
+ ('\u{1ee5e}', '\u{1ee5e}'),
+ ('\u{1ee60}', '\u{1ee60}'),
+ ('\u{1ee63}', '\u{1ee63}'),
+ ('\u{1ee65}', '\u{1ee66}'),
+ ('\u{1ee6b}', '\u{1ee6b}'),
+ ('\u{1ee73}', '\u{1ee73}'),
+ ('\u{1ee78}', '\u{1ee78}'),
+ ('\u{1ee7d}', '\u{1ee7d}'),
+ ('\u{1ee7f}', '\u{1ee7f}'),
+ ('\u{1ee8a}', '\u{1ee8a}'),
+ ('\u{1ee9c}', '\u{1eea0}'),
+ ('\u{1eea4}', '\u{1eea4}'),
+ ('\u{1eeaa}', '\u{1eeaa}'),
+ ('\u{1eebc}', '\u{1eeef}'),
+ ('\u{1eef2}', '\u{1efff}'),
+ ('\u{1f02c}', '\u{1f02f}'),
+ ('\u{1f094}', '\u{1f09f}'),
+ ('\u{1f0af}', '\u{1f0b0}'),
+ ('\u{1f0c0}', '\u{1f0c0}'),
+ ('\u{1f0d0}', '\u{1f0d0}'),
+ ('\u{1f0f6}', '\u{1f0ff}'),
+ ('\u{1f1ae}', '\u{1f1e5}'),
+ ('\u{1f203}', '\u{1f20f}'),
+ ('\u{1f23c}', '\u{1f23f}'),
+ ('\u{1f249}', '\u{1f24f}'),
+ ('\u{1f252}', '\u{1f25f}'),
+ ('\u{1f266}', '\u{1f2ff}'),
+ ('\u{1f6d8}', '\u{1f6db}'),
+ ('\u{1f6ed}', '\u{1f6ef}'),
+ ('\u{1f6fd}', '\u{1f6ff}'),
+ ('\u{1f777}', '\u{1f77a}'),
+ ('\u{1f7da}', '\u{1f7df}'),
+ ('\u{1f7ec}', '\u{1f7ef}'),
+ ('\u{1f7f1}', '\u{1f7ff}'),
+ ('\u{1f80c}', '\u{1f80f}'),
+ ('\u{1f848}', '\u{1f84f}'),
+ ('\u{1f85a}', '\u{1f85f}'),
+ ('\u{1f888}', '\u{1f88f}'),
+ ('\u{1f8ae}', '\u{1f8af}'),
+ ('\u{1f8b2}', '\u{1f8ff}'),
+ ('\u{1fa54}', '\u{1fa5f}'),
+ ('\u{1fa6e}', '\u{1fa6f}'),
+ ('\u{1fa7d}', '\u{1fa7f}'),
+ ('\u{1fa89}', '\u{1fa8f}'),
+ ('\u{1fabe}', '\u{1fabe}'),
+ ('\u{1fac6}', '\u{1facd}'),
+ ('\u{1fadc}', '\u{1fadf}'),
+ ('\u{1fae9}', '\u{1faef}'),
+ ('\u{1faf9}', '\u{1faff}'),
+ ('\u{1fb93}', '\u{1fb93}'),
+ ('\u{1fbcb}', '\u{1fbef}'),
+ ('\u{1fbfa}', '\u{1ffff}'),
+ ('\u{2a6e0}', '\u{2a6ff}'),
+ ('\u{2b73a}', '\u{2b73f}'),
+ ('\u{2b81e}', '\u{2b81f}'),
+ ('\u{2cea2}', '\u{2ceaf}'),
+ ('\u{2ebe1}', '\u{2f7ff}'),
+ ('\u{2fa1e}', '\u{2ffff}'),
+ ('\u{3134b}', '\u{3134f}'),
+ ('\u{323b0}', '\u{e00ff}'),
+ ('\u{e01f0}', '\u{10ffff}'),
+];
+
+pub const OTHER_LETTER: &'static [(char, char)] = &[
+ ('ª', 'ª'),
+ ('º', 'º'),
+ ('ƻ', 'ƻ'),
+ ('ǀ', 'ǃ'),
+ ('ʔ', 'ʔ'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('ؠ', 'ؿ'),
+ ('ف', 'ي'),
+ ('ٮ', 'ٯ'),
+ ('ٱ', 'ۓ'),
+ ('ە', 'ە'),
+ ('ۮ', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ࠀ', 'ࠕ'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣈ'),
+ ('ऄ', 'ह'),
+ ('ऽ', 'ऽ'),
+ ('ॐ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('ॲ', 'ঀ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('ੲ', 'ੴ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ઽ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('ૹ', 'ૹ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('ୱ', 'ୱ'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ௐ', 'ௐ'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('ಀ', 'ಀ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಽ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('ೱ', 'ೲ'),
+ ('ഄ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', 'ൖ'),
+ ('ൟ', 'ൡ'),
+ ('ൺ', 'ൿ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ก', 'ะ'),
+ ('า', 'ำ'),
+ ('เ', 'ๅ'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ະ'),
+ ('າ', 'ຳ'),
+ ('ຽ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ྈ', 'ྌ'),
+ ('က', 'ဪ'),
+ ('ဿ', 'ဿ'),
+ ('ၐ', 'ၕ'),
+ ('ၚ', 'ၝ'),
+ ('ၡ', 'ၡ'),
+ ('ၥ', 'ၦ'),
+ ('ၮ', 'ၰ'),
+ ('ၵ', 'ႁ'),
+ ('ႎ', 'ႎ'),
+ ('ᄀ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛱ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('ᜟ', 'ᜱ'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ក', 'ឳ'),
+ ('ៜ', 'ៜ'),
+ ('ᠠ', 'ᡂ'),
+ ('ᡄ', 'ᡸ'),
+ ('ᢀ', 'ᢄ'),
+ ('ᢇ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᥐ', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('ᨀ', 'ᨖ'),
+ ('ᨠ', 'ᩔ'),
+ ('ᬅ', 'ᬳ'),
+ ('ᭅ', 'ᭌ'),
+ ('ᮃ', 'ᮠ'),
+ ('ᮮ', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᰀ', 'ᰣ'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱷ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ℵ', 'ℸ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('〆', '〆'),
+ ('〼', '〼'),
+ ('ぁ', 'ゖ'),
+ ('ゟ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ヿ', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꀔ'),
+ ('ꀖ', 'ꒌ'),
+ ('ꓐ', 'ꓷ'),
+ ('ꔀ', 'ꘋ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('ꙮ', 'ꙮ'),
+ ('ꚠ', 'ꛥ'),
+ ('ꞏ', 'ꞏ'),
+ ('ꟷ', 'ꟷ'),
+ ('ꟻ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠢ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢂ', 'ꢳ'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', 'ꣾ'),
+ ('ꤊ', 'ꤥ'),
+ ('ꤰ', 'ꥆ'),
+ ('ꥠ', 'ꥼ'),
+ ('ꦄ', 'ꦲ'),
+ ('ꧠ', 'ꧤ'),
+ ('ꧧ', 'ꧯ'),
+ ('ꧺ', 'ꧾ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꩠ', 'ꩯ'),
+ ('ꩱ', 'ꩶ'),
+ ('ꩺ', 'ꩺ'),
+ ('ꩾ', 'ꪯ'),
+ ('ꪱ', 'ꪱ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪽ'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫜ'),
+ ('ꫠ', 'ꫪ'),
+ ('ꫲ', 'ꫲ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꯀ', 'ꯢ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('ヲ', 'ッ'),
+ ('ア', 'ン'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍀'),
+ ('𐍂', '𐍉'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐑐', '𐒝'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐴀', '𐴣'),
+ ('𐺀', '𐺩'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀃', '𑀷'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂃', '𑂯'),
+ ('𑃐', '𑃨'),
+ ('𑄃', '𑄦'),
+ ('𑅄', '𑅄'),
+ ('𑅇', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('𑆃', '𑆲'),
+ ('𑇁', '𑇄'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈫'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '𑋞'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍡'),
+ ('𑐀', '𑐴'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '𑖮'),
+ ('𑗘', '𑗛'),
+ ('𑘀', '𑘯'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '𑚪'),
+ ('𑚸', '𑚸'),
+ ('𑜀', '𑜚'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '𑠫'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧐'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧣'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨺', '𑨺'),
+ ('𑩐', '𑩐'),
+ ('𑩜', '𑪉'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰮'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶉'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻲'),
+ ('𑼂', '𑼂'),
+ ('𑼄', '𑼐'),
+ ('𑼒', '𑼳'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖽐'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𝼊', '𝼊'),
+ ('𞄀', '𞄬'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓪'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const OTHER_NUMBER: &'static [(char, char)] = &[
+ ('²', '³'),
+ ('¹', '¹'),
+ ('¼', '¾'),
+ ('৴', '৹'),
+ ('୲', '୷'),
+ ('௰', '௲'),
+ ('౸', '౾'),
+ ('൘', '൞'),
+ ('൰', '൸'),
+ ('༪', '༳'),
+ ('፩', '፼'),
+ ('៰', '៹'),
+ ('᧚', '᧚'),
+ ('⁰', '⁰'),
+ ('⁴', '⁹'),
+ ('₀', '₉'),
+ ('⅐', '⅟'),
+ ('↉', '↉'),
+ ('①', '⒛'),
+ ('⓪', '⓿'),
+ ('❶', '➓'),
+ ('⳽', '⳽'),
+ ('㆒', '㆕'),
+ ('㈠', '㈩'),
+ ('㉈', '㉏'),
+ ('㉑', '㉟'),
+ ('㊀', '㊉'),
+ ('㊱', '㊿'),
+ ('꠰', '꠵'),
+ ('𐄇', '𐄳'),
+ ('𐅵', '𐅸'),
+ ('𐆊', '𐆋'),
+ ('𐋡', '𐋻'),
+ ('𐌠', '𐌣'),
+ ('𐡘', '𐡟'),
+ ('𐡹', '𐡿'),
+ ('𐢧', '𐢯'),
+ ('𐣻', '𐣿'),
+ ('𐤖', '𐤛'),
+ ('𐦼', '𐦽'),
+ ('𐧀', '𐧏'),
+ ('𐧒', '𐧿'),
+ ('𐩀', '𐩈'),
+ ('𐩽', '𐩾'),
+ ('𐪝', '𐪟'),
+ ('𐫫', '𐫯'),
+ ('𐭘', '𐭟'),
+ ('𐭸', '𐭿'),
+ ('𐮩', '𐮯'),
+ ('𐳺', '𐳿'),
+ ('𐹠', '𐹾'),
+ ('𐼝', '𐼦'),
+ ('𐽑', '𐽔'),
+ ('𐿅', '𐿋'),
+ ('𑁒', '𑁥'),
+ ('𑇡', '𑇴'),
+ ('𑜺', '𑜻'),
+ ('𑣪', '𑣲'),
+ ('𑱚', '𑱬'),
+ ('𑿀', '𑿔'),
+ ('𖭛', '𖭡'),
+ ('𖺀', '𖺖'),
+ ('𝋀', '𝋓'),
+ ('𝋠', '𝋳'),
+ ('𝍠', '𝍸'),
+ ('𞣇', '𞣏'),
+ ('𞱱', '𞲫'),
+ ('𞲭', '𞲯'),
+ ('𞲱', '𞲴'),
+ ('𞴁', '𞴭'),
+ ('𞴯', '𞴽'),
+ ('🄀', '🄌'),
+];
+
+pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[
+ ('!', '#'),
+ ('%', '\''),
+ ('*', '*'),
+ (',', ','),
+ ('.', '/'),
+ (':', ';'),
+ ('?', '@'),
+ ('\\', '\\'),
+ ('¡', '¡'),
+ ('§', '§'),
+ ('¶', '·'),
+ ('¿', '¿'),
+ (';', ';'),
+ ('·', '·'),
+ ('՚', '՟'),
+ ('։', '։'),
+ ('׀', '׀'),
+ ('׃', '׃'),
+ ('׆', '׆'),
+ ('׳', '״'),
+ ('؉', '؊'),
+ ('،', '؍'),
+ ('؛', '؛'),
+ ('؝', '؟'),
+ ('٪', '٭'),
+ ('۔', '۔'),
+ ('܀', '܍'),
+ ('߷', '߹'),
+ ('࠰', '࠾'),
+ ('࡞', '࡞'),
+ ('।', '॥'),
+ ('॰', '॰'),
+ ('৽', '৽'),
+ ('੶', '੶'),
+ ('૰', '૰'),
+ ('౷', '౷'),
+ ('಄', '಄'),
+ ('෴', '෴'),
+ ('๏', '๏'),
+ ('๚', '๛'),
+ ('༄', '༒'),
+ ('༔', '༔'),
+ ('྅', '྅'),
+ ('࿐', '࿔'),
+ ('࿙', '࿚'),
+ ('၊', '၏'),
+ ('჻', '჻'),
+ ('፠', '፨'),
+ ('᙮', '᙮'),
+ ('᛫', '᛭'),
+ ('᜵', '᜶'),
+ ('។', '៖'),
+ ('៘', '៚'),
+ ('᠀', '᠅'),
+ ('᠇', '᠊'),
+ ('᥄', '᥅'),
+ ('᨞', '᨟'),
+ ('᪠', '᪦'),
+ ('᪨', '᪭'),
+ ('᭚', '᭠'),
+ ('᭽', '᭾'),
+ ('᯼', '᯿'),
+ ('᰻', '᰿'),
+ ('᱾', '᱿'),
+ ('᳀', '᳇'),
+ ('᳓', '᳓'),
+ ('‖', '‗'),
+ ('†', '‧'),
+ ('‰', '‸'),
+ ('※', '‾'),
+ ('⁁', '⁃'),
+ ('⁇', '⁑'),
+ ('⁓', '⁓'),
+ ('⁕', '⁞'),
+ ('⳹', '⳼'),
+ ('⳾', '⳿'),
+ ('⵰', '⵰'),
+ ('⸀', '⸁'),
+ ('⸆', '⸈'),
+ ('⸋', '⸋'),
+ ('⸎', '⸖'),
+ ('⸘', '⸙'),
+ ('⸛', '⸛'),
+ ('⸞', '⸟'),
+ ('⸪', '⸮'),
+ ('⸰', '⸹'),
+ ('⸼', '⸿'),
+ ('⹁', '⹁'),
+ ('⹃', '⹏'),
+ ('⹒', '⹔'),
+ ('、', '〃'),
+ ('〽', '〽'),
+ ('・', '・'),
+ ('꓾', '꓿'),
+ ('꘍', '꘏'),
+ ('꙳', '꙳'),
+ ('꙾', '꙾'),
+ ('꛲', '꛷'),
+ ('꡴', '꡷'),
+ ('꣎', '꣏'),
+ ('꣸', '꣺'),
+ ('꣼', '꣼'),
+ ('꤮', '꤯'),
+ ('꥟', '꥟'),
+ ('꧁', '꧍'),
+ ('꧞', '꧟'),
+ ('꩜', '꩟'),
+ ('꫞', '꫟'),
+ ('꫰', '꫱'),
+ ('꯫', '꯫'),
+ ('︐', '︖'),
+ ('︙', '︙'),
+ ('︰', '︰'),
+ ('﹅', '﹆'),
+ ('﹉', '﹌'),
+ ('﹐', '﹒'),
+ ('﹔', '﹗'),
+ ('﹟', '﹡'),
+ ('﹨', '﹨'),
+ ('﹪', '﹫'),
+ ('!', '#'),
+ ('%', '''),
+ ('*', '*'),
+ (',', ','),
+ ('.', '/'),
+ (':', ';'),
+ ('?', '@'),
+ ('\', '\'),
+ ('。', '。'),
+ ('、', '・'),
+ ('𐄀', '𐄂'),
+ ('𐎟', '𐎟'),
+ ('𐏐', '𐏐'),
+ ('𐕯', '𐕯'),
+ ('𐡗', '𐡗'),
+ ('𐤟', '𐤟'),
+ ('𐤿', '𐤿'),
+ ('𐩐', '𐩘'),
+ ('𐩿', '𐩿'),
+ ('𐫰', '𐫶'),
+ ('𐬹', '𐬿'),
+ ('𐮙', '𐮜'),
+ ('𐽕', '𐽙'),
+ ('𐾆', '𐾉'),
+ ('𑁇', '𑁍'),
+ ('𑂻', '𑂼'),
+ ('𑂾', '𑃁'),
+ ('𑅀', '𑅃'),
+ ('𑅴', '𑅵'),
+ ('𑇅', '𑇈'),
+ ('𑇍', '𑇍'),
+ ('𑇛', '𑇛'),
+ ('𑇝', '𑇟'),
+ ('𑈸', '𑈽'),
+ ('𑊩', '𑊩'),
+ ('𑑋', '𑑏'),
+ ('𑑚', '𑑛'),
+ ('𑑝', '𑑝'),
+ ('𑓆', '𑓆'),
+ ('𑗁', '𑗗'),
+ ('𑙁', '𑙃'),
+ ('𑙠', '𑙬'),
+ ('𑚹', '𑚹'),
+ ('𑜼', '𑜾'),
+ ('𑠻', '𑠻'),
+ ('𑥄', '𑥆'),
+ ('𑧢', '𑧢'),
+ ('𑨿', '𑩆'),
+ ('𑪚', '𑪜'),
+ ('𑪞', '𑪢'),
+ ('𑬀', '𑬉'),
+ ('𑱁', '𑱅'),
+ ('𑱰', '𑱱'),
+ ('𑻷', '𑻸'),
+ ('𑽃', '𑽏'),
+ ('𑿿', '𑿿'),
+ ('𒑰', '𒑴'),
+ ('𒿱', '𒿲'),
+ ('𖩮', '𖩯'),
+ ('𖫵', '𖫵'),
+ ('𖬷', '𖬻'),
+ ('𖭄', '𖭄'),
+ ('𖺗', '𖺚'),
+ ('𖿢', '𖿢'),
+ ('𛲟', '𛲟'),
+ ('𝪇', '𝪋'),
+ ('𞥞', '𞥟'),
+];
+
+pub const OTHER_SYMBOL: &'static [(char, char)] = &[
+ ('¦', '¦'),
+ ('©', '©'),
+ ('®', '®'),
+ ('°', '°'),
+ ('҂', '҂'),
+ ('֍', '֎'),
+ ('؎', '؏'),
+ ('۞', '۞'),
+ ('۩', '۩'),
+ ('۽', '۾'),
+ ('߶', '߶'),
+ ('৺', '৺'),
+ ('୰', '୰'),
+ ('௳', '௸'),
+ ('௺', '௺'),
+ ('౿', '౿'),
+ ('൏', '൏'),
+ ('൹', '൹'),
+ ('༁', '༃'),
+ ('༓', '༓'),
+ ('༕', '༗'),
+ ('༚', '༟'),
+ ('༴', '༴'),
+ ('༶', '༶'),
+ ('༸', '༸'),
+ ('྾', '࿅'),
+ ('࿇', '࿌'),
+ ('࿎', '࿏'),
+ ('࿕', '࿘'),
+ ('႞', '႟'),
+ ('᎐', '᎙'),
+ ('᙭', '᙭'),
+ ('᥀', '᥀'),
+ ('᧞', '᧿'),
+ ('᭡', '᭪'),
+ ('᭴', '᭼'),
+ ('℀', '℁'),
+ ('℃', '℆'),
+ ('℈', '℉'),
+ ('℔', '℔'),
+ ('№', '℗'),
+ ('℞', '℣'),
+ ('℥', '℥'),
+ ('℧', '℧'),
+ ('℩', '℩'),
+ ('℮', '℮'),
+ ('℺', '℻'),
+ ('⅊', '⅊'),
+ ('⅌', '⅍'),
+ ('⅏', '⅏'),
+ ('↊', '↋'),
+ ('↕', '↙'),
+ ('↜', '↟'),
+ ('↡', '↢'),
+ ('↤', '↥'),
+ ('↧', '↭'),
+ ('↯', '⇍'),
+ ('⇐', '⇑'),
+ ('⇓', '⇓'),
+ ('⇕', '⇳'),
+ ('⌀', '⌇'),
+ ('⌌', '⌟'),
+ ('⌢', '⌨'),
+ ('⌫', '⍻'),
+ ('⍽', '⎚'),
+ ('⎴', '⏛'),
+ ('⏢', '␦'),
+ ('⑀', '⑊'),
+ ('⒜', 'ⓩ'),
+ ('─', '▶'),
+ ('▸', '◀'),
+ ('◂', '◷'),
+ ('☀', '♮'),
+ ('♰', '❧'),
+ ('➔', '➿'),
+ ('⠀', '⣿'),
+ ('⬀', '⬯'),
+ ('⭅', '⭆'),
+ ('⭍', '⭳'),
+ ('⭶', '⮕'),
+ ('⮗', '⯿'),
+ ('⳥', '⳪'),
+ ('⹐', '⹑'),
+ ('⺀', '⺙'),
+ ('⺛', '⻳'),
+ ('⼀', '⿕'),
+ ('⿰', '⿻'),
+ ('〄', '〄'),
+ ('〒', '〓'),
+ ('〠', '〠'),
+ ('〶', '〷'),
+ ('〾', '〿'),
+ ('㆐', '㆑'),
+ ('㆖', '㆟'),
+ ('㇀', '㇣'),
+ ('㈀', '㈞'),
+ ('㈪', '㉇'),
+ ('㉐', '㉐'),
+ ('㉠', '㉿'),
+ ('㊊', '㊰'),
+ ('㋀', '㏿'),
+ ('䷀', '䷿'),
+ ('꒐', '꓆'),
+ ('꠨', '꠫'),
+ ('꠶', '꠷'),
+ ('꠹', '꠹'),
+ ('꩷', '꩹'),
+ ('﵀', '﵏'),
+ ('﷏', '﷏'),
+ ('﷽', '﷿'),
+ ('¦', '¦'),
+ ('│', '│'),
+ ('■', '○'),
+ ('', '�'),
+ ('𐄷', '𐄿'),
+ ('𐅹', '𐆉'),
+ ('𐆌', '𐆎'),
+ ('𐆐', '𐆜'),
+ ('𐆠', '𐆠'),
+ ('𐇐', '𐇼'),
+ ('𐡷', '𐡸'),
+ ('𐫈', '𐫈'),
+ ('𑜿', '𑜿'),
+ ('𑿕', '𑿜'),
+ ('𑿡', '𑿱'),
+ ('𖬼', '𖬿'),
+ ('𖭅', '𖭅'),
+ ('𛲜', '𛲜'),
+ ('𜽐', '𜿃'),
+ ('𝀀', '𝃵'),
+ ('𝄀', '𝄦'),
+ ('𝄩', '𝅘𝅥𝅲'),
+ ('𝅪', '𝅬'),
+ ('𝆃', '𝆄'),
+ ('𝆌', '𝆩'),
+ ('𝆮', '𝇪'),
+ ('𝈀', '𝉁'),
+ ('𝉅', '𝉅'),
+ ('𝌀', '𝍖'),
+ ('𝠀', '𝧿'),
+ ('𝨷', '𝨺'),
+ ('𝩭', '𝩴'),
+ ('𝩶', '𝪃'),
+ ('𝪅', '𝪆'),
+ ('𞅏', '𞅏'),
+ ('𞲬', '𞲬'),
+ ('𞴮', '𞴮'),
+ ('🀀', '🀫'),
+ ('🀰', '🂓'),
+ ('🂠', '🂮'),
+ ('🂱', '🂿'),
+ ('🃁', '🃏'),
+ ('🃑', '🃵'),
+ ('🄍', '🆭'),
+ ('🇦', '🈂'),
+ ('🈐', '🈻'),
+ ('🉀', '🉈'),
+ ('🉐', '🉑'),
+ ('🉠', '🉥'),
+ ('🌀', '🏺'),
+ ('🐀', '🛗'),
+ ('🛜', '🛬'),
+ ('🛰', '🛼'),
+ ('🜀', '🝶'),
+ ('🝻', '🟙'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🠀', '🠋'),
+ ('🠐', '🡇'),
+ ('🡐', '🡙'),
+ ('🡠', '🢇'),
+ ('🢐', '🢭'),
+ ('🢰', '🢱'),
+ ('🤀', '🩓'),
+ ('🩠', '🩭'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+ ('🬀', '🮒'),
+ ('🮔', '🯊'),
+];
+
+pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] =
+ &[('\u{2029}', '\u{2029}')];
+
+pub const PRIVATE_USE: &'static [(char, char)] = &[
+ ('\u{e000}', '\u{f8ff}'),
+ ('\u{f0000}', '\u{ffffd}'),
+ ('\u{100000}', '\u{10fffd}'),
+];
+
+pub const PUNCTUATION: &'static [(char, char)] = &[
+ ('!', '#'),
+ ('%', '*'),
+ (',', '/'),
+ (':', ';'),
+ ('?', '@'),
+ ('[', ']'),
+ ('_', '_'),
+ ('{', '{'),
+ ('}', '}'),
+ ('¡', '¡'),
+ ('§', '§'),
+ ('«', '«'),
+ ('¶', '·'),
+ ('»', '»'),
+ ('¿', '¿'),
+ (';', ';'),
+ ('·', '·'),
+ ('՚', '՟'),
+ ('։', '֊'),
+ ('־', '־'),
+ ('׀', '׀'),
+ ('׃', '׃'),
+ ('׆', '׆'),
+ ('׳', '״'),
+ ('؉', '؊'),
+ ('،', '؍'),
+ ('؛', '؛'),
+ ('؝', '؟'),
+ ('٪', '٭'),
+ ('۔', '۔'),
+ ('܀', '܍'),
+ ('߷', '߹'),
+ ('࠰', '࠾'),
+ ('࡞', '࡞'),
+ ('।', '॥'),
+ ('॰', '॰'),
+ ('৽', '৽'),
+ ('੶', '੶'),
+ ('૰', '૰'),
+ ('౷', '౷'),
+ ('಄', '಄'),
+ ('෴', '෴'),
+ ('๏', '๏'),
+ ('๚', '๛'),
+ ('༄', '༒'),
+ ('༔', '༔'),
+ ('༺', '༽'),
+ ('྅', '྅'),
+ ('࿐', '࿔'),
+ ('࿙', '࿚'),
+ ('၊', '၏'),
+ ('჻', '჻'),
+ ('፠', '፨'),
+ ('᐀', '᐀'),
+ ('᙮', '᙮'),
+ ('᚛', '᚜'),
+ ('᛫', '᛭'),
+ ('᜵', '᜶'),
+ ('។', '៖'),
+ ('៘', '៚'),
+ ('᠀', '᠊'),
+ ('᥄', '᥅'),
+ ('᨞', '᨟'),
+ ('᪠', '᪦'),
+ ('᪨', '᪭'),
+ ('᭚', '᭠'),
+ ('᭽', '᭾'),
+ ('᯼', '᯿'),
+ ('᰻', '᰿'),
+ ('᱾', '᱿'),
+ ('᳀', '᳇'),
+ ('᳓', '᳓'),
+ ('‐', '‧'),
+ ('‰', '⁃'),
+ ('⁅', '⁑'),
+ ('⁓', '⁞'),
+ ('⁽', '⁾'),
+ ('₍', '₎'),
+ ('⌈', '⌋'),
+ ('〈', '〉'),
+ ('❨', '❵'),
+ ('⟅', '⟆'),
+ ('⟦', '⟯'),
+ ('⦃', '⦘'),
+ ('⧘', '⧛'),
+ ('⧼', '⧽'),
+ ('⳹', '⳼'),
+ ('⳾', '⳿'),
+ ('⵰', '⵰'),
+ ('⸀', '⸮'),
+ ('⸰', '⹏'),
+ ('⹒', '⹝'),
+ ('、', '〃'),
+ ('〈', '】'),
+ ('〔', '〟'),
+ ('〰', '〰'),
+ ('〽', '〽'),
+ ('゠', '゠'),
+ ('・', '・'),
+ ('꓾', '꓿'),
+ ('꘍', '꘏'),
+ ('꙳', '꙳'),
+ ('꙾', '꙾'),
+ ('꛲', '꛷'),
+ ('꡴', '꡷'),
+ ('꣎', '꣏'),
+ ('꣸', '꣺'),
+ ('꣼', '꣼'),
+ ('꤮', '꤯'),
+ ('꥟', '꥟'),
+ ('꧁', '꧍'),
+ ('꧞', '꧟'),
+ ('꩜', '꩟'),
+ ('꫞', '꫟'),
+ ('꫰', '꫱'),
+ ('꯫', '꯫'),
+ ('﴾', '﴿'),
+ ('︐', '︙'),
+ ('︰', '﹒'),
+ ('﹔', '﹡'),
+ ('﹣', '﹣'),
+ ('﹨', '﹨'),
+ ('﹪', '﹫'),
+ ('!', '#'),
+ ('%', '*'),
+ (',', '/'),
+ (':', ';'),
+ ('?', '@'),
+ ('[', ']'),
+ ('_', '_'),
+ ('{', '{'),
+ ('}', '}'),
+ ('⦅', '・'),
+ ('𐄀', '𐄂'),
+ ('𐎟', '𐎟'),
+ ('𐏐', '𐏐'),
+ ('𐕯', '𐕯'),
+ ('𐡗', '𐡗'),
+ ('𐤟', '𐤟'),
+ ('𐤿', '𐤿'),
+ ('𐩐', '𐩘'),
+ ('𐩿', '𐩿'),
+ ('𐫰', '𐫶'),
+ ('𐬹', '𐬿'),
+ ('𐮙', '𐮜'),
+ ('𐺭', '𐺭'),
+ ('𐽕', '𐽙'),
+ ('𐾆', '𐾉'),
+ ('𑁇', '𑁍'),
+ ('𑂻', '𑂼'),
+ ('𑂾', '𑃁'),
+ ('𑅀', '𑅃'),
+ ('𑅴', '𑅵'),
+ ('𑇅', '𑇈'),
+ ('𑇍', '𑇍'),
+ ('𑇛', '𑇛'),
+ ('𑇝', '𑇟'),
+ ('𑈸', '𑈽'),
+ ('𑊩', '𑊩'),
+ ('𑑋', '𑑏'),
+ ('𑑚', '𑑛'),
+ ('𑑝', '𑑝'),
+ ('𑓆', '𑓆'),
+ ('𑗁', '𑗗'),
+ ('𑙁', '𑙃'),
+ ('𑙠', '𑙬'),
+ ('𑚹', '𑚹'),
+ ('𑜼', '𑜾'),
+ ('𑠻', '𑠻'),
+ ('𑥄', '𑥆'),
+ ('𑧢', '𑧢'),
+ ('𑨿', '𑩆'),
+ ('𑪚', '𑪜'),
+ ('𑪞', '𑪢'),
+ ('𑬀', '𑬉'),
+ ('𑱁', '𑱅'),
+ ('𑱰', '𑱱'),
+ ('𑻷', '𑻸'),
+ ('𑽃', '𑽏'),
+ ('𑿿', '𑿿'),
+ ('𒑰', '𒑴'),
+ ('𒿱', '𒿲'),
+ ('𖩮', '𖩯'),
+ ('𖫵', '𖫵'),
+ ('𖬷', '𖬻'),
+ ('𖭄', '𖭄'),
+ ('𖺗', '𖺚'),
+ ('𖿢', '𖿢'),
+ ('𛲟', '𛲟'),
+ ('𝪇', '𝪋'),
+ ('𞥞', '𞥟'),
+];
+
+pub const SEPARATOR: &'static [(char, char)] = &[
+ (' ', ' '),
+ ('\u{a0}', '\u{a0}'),
+ ('\u{1680}', '\u{1680}'),
+ ('\u{2000}', '\u{200a}'),
+ ('\u{2028}', '\u{2029}'),
+ ('\u{202f}', '\u{202f}'),
+ ('\u{205f}', '\u{205f}'),
+ ('\u{3000}', '\u{3000}'),
+];
+
+pub const SPACE_SEPARATOR: &'static [(char, char)] = &[
+ (' ', ' '),
+ ('\u{a0}', '\u{a0}'),
+ ('\u{1680}', '\u{1680}'),
+ ('\u{2000}', '\u{200a}'),
+ ('\u{202f}', '\u{202f}'),
+ ('\u{205f}', '\u{205f}'),
+ ('\u{3000}', '\u{3000}'),
+];
+
+pub const SPACING_MARK: &'static [(char, char)] = &[
+ ('ः', 'ः'),
+ ('ऻ', 'ऻ'),
+ ('ा', 'ी'),
+ ('ॉ', 'ौ'),
+ ('ॎ', 'ॏ'),
+ ('ং', 'ঃ'),
+ ('\u{9be}', 'ী'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৌ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ਃ', 'ਃ'),
+ ('ਾ', 'ੀ'),
+ ('ઃ', 'ઃ'),
+ ('ા', 'ી'),
+ ('ૉ', 'ૉ'),
+ ('ો', 'ૌ'),
+ ('ଂ', 'ଃ'),
+ ('\u{b3e}', '\u{b3e}'),
+ ('ୀ', 'ୀ'),
+ ('େ', 'ୈ'),
+ ('ୋ', 'ୌ'),
+ ('\u{b57}', '\u{b57}'),
+ ('\u{bbe}', 'ி'),
+ ('ு', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', 'ௌ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('ఁ', 'ః'),
+ ('ు', 'ౄ'),
+ ('ಂ', 'ಃ'),
+ ('ಾ', 'ಾ'),
+ ('ೀ', 'ೄ'),
+ ('ೇ', 'ೈ'),
+ ('ೊ', 'ೋ'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೳ', 'ೳ'),
+ ('ം', 'ഃ'),
+ ('\u{d3e}', 'ീ'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൌ'),
+ ('\u{d57}', '\u{d57}'),
+ ('ං', 'ඃ'),
+ ('\u{dcf}', 'ෑ'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', 'ෳ'),
+ ('༾', '༿'),
+ ('ཿ', 'ཿ'),
+ ('ါ', 'ာ'),
+ ('ေ', 'ေ'),
+ ('း', 'း'),
+ ('ျ', 'ြ'),
+ ('ၖ', 'ၗ'),
+ ('ၢ', 'ၤ'),
+ ('ၧ', 'ၭ'),
+ ('ႃ', 'ႄ'),
+ ('ႇ', 'ႌ'),
+ ('ႏ', 'ႏ'),
+ ('ႚ', 'ႜ'),
+ ('᜕', '᜕'),
+ ('᜴', '᜴'),
+ ('ា', 'ា'),
+ ('ើ', 'ៅ'),
+ ('ះ', 'ៈ'),
+ ('ᤣ', 'ᤦ'),
+ ('ᤩ', 'ᤫ'),
+ ('ᤰ', 'ᤱ'),
+ ('ᤳ', 'ᤸ'),
+ ('ᨙ', 'ᨚ'),
+ ('ᩕ', 'ᩕ'),
+ ('ᩗ', 'ᩗ'),
+ ('ᩡ', 'ᩡ'),
+ ('ᩣ', 'ᩤ'),
+ ('ᩭ', 'ᩲ'),
+ ('ᬄ', 'ᬄ'),
+ ('\u{1b35}', '\u{1b35}'),
+ ('ᬻ', 'ᬻ'),
+ ('ᬽ', 'ᭁ'),
+ ('ᭃ', '᭄'),
+ ('ᮂ', 'ᮂ'),
+ ('ᮡ', 'ᮡ'),
+ ('ᮦ', 'ᮧ'),
+ ('᮪', '᮪'),
+ ('ᯧ', 'ᯧ'),
+ ('ᯪ', 'ᯬ'),
+ ('ᯮ', 'ᯮ'),
+ ('᯲', '᯳'),
+ ('ᰤ', 'ᰫ'),
+ ('ᰴ', 'ᰵ'),
+ ('᳡', '᳡'),
+ ('᳷', '᳷'),
+ ('\u{302e}', '\u{302f}'),
+ ('ꠣ', 'ꠤ'),
+ ('ꠧ', 'ꠧ'),
+ ('ꢀ', 'ꢁ'),
+ ('ꢴ', 'ꣃ'),
+ ('ꥒ', '꥓'),
+ ('ꦃ', 'ꦃ'),
+ ('ꦴ', 'ꦵ'),
+ ('ꦺ', 'ꦻ'),
+ ('ꦾ', '꧀'),
+ ('ꨯ', 'ꨰ'),
+ ('ꨳ', 'ꨴ'),
+ ('ꩍ', 'ꩍ'),
+ ('ꩻ', 'ꩻ'),
+ ('ꩽ', 'ꩽ'),
+ ('ꫫ', 'ꫫ'),
+ ('ꫮ', 'ꫯ'),
+ ('ꫵ', 'ꫵ'),
+ ('ꯣ', 'ꯤ'),
+ ('ꯦ', 'ꯧ'),
+ ('ꯩ', 'ꯪ'),
+ ('꯬', '꯬'),
+ ('𑀀', '𑀀'),
+ ('𑀂', '𑀂'),
+ ('𑂂', '𑂂'),
+ ('𑂰', '𑂲'),
+ ('𑂷', '𑂸'),
+ ('𑄬', '𑄬'),
+ ('𑅅', '𑅆'),
+ ('𑆂', '𑆂'),
+ ('𑆳', '𑆵'),
+ ('𑆿', '𑇀'),
+ ('𑇎', '𑇎'),
+ ('𑈬', '𑈮'),
+ ('𑈲', '𑈳'),
+ ('𑈵', '𑈵'),
+ ('𑋠', '𑋢'),
+ ('𑌂', '𑌃'),
+ ('\u{1133e}', '𑌿'),
+ ('𑍁', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍢', '𑍣'),
+ ('𑐵', '𑐷'),
+ ('𑑀', '𑑁'),
+ ('𑑅', '𑑅'),
+ ('\u{114b0}', '𑒲'),
+ ('𑒹', '𑒹'),
+ ('𑒻', '𑒾'),
+ ('𑓁', '𑓁'),
+ ('\u{115af}', '𑖱'),
+ ('𑖸', '𑖻'),
+ ('𑖾', '𑖾'),
+ ('𑘰', '𑘲'),
+ ('𑘻', '𑘼'),
+ ('𑘾', '𑘾'),
+ ('𑚬', '𑚬'),
+ ('𑚮', '𑚯'),
+ ('𑚶', '𑚶'),
+ ('𑜠', '𑜡'),
+ ('𑜦', '𑜦'),
+ ('𑠬', '𑠮'),
+ ('𑠸', '𑠸'),
+ ('\u{11930}', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('𑤽', '𑤽'),
+ ('𑥀', '𑥀'),
+ ('𑥂', '𑥂'),
+ ('𑧑', '𑧓'),
+ ('𑧜', '𑧟'),
+ ('𑧤', '𑧤'),
+ ('𑨹', '𑨹'),
+ ('𑩗', '𑩘'),
+ ('𑪗', '𑪗'),
+ ('𑰯', '𑰯'),
+ ('𑰾', '𑰾'),
+ ('𑲩', '𑲩'),
+ ('𑲱', '𑲱'),
+ ('𑲴', '𑲴'),
+ ('𑶊', '𑶎'),
+ ('𑶓', '𑶔'),
+ ('𑶖', '𑶖'),
+ ('𑻵', '𑻶'),
+ ('𑼃', '𑼃'),
+ ('𑼴', '𑼵'),
+ ('𑼾', '𑼿'),
+ ('𑽁', '𑽁'),
+ ('𖽑', '𖾇'),
+ ('𖿰', '𖿱'),
+ ('\u{1d165}', '𝅦'),
+ ('𝅭', '\u{1d172}'),
+];
+
+pub const SYMBOL: &'static [(char, char)] = &[
+ ('$', '$'),
+ ('+', '+'),
+ ('<', '>'),
+ ('^', '^'),
+ ('`', '`'),
+ ('|', '|'),
+ ('~', '~'),
+ ('¢', '¦'),
+ ('¨', '©'),
+ ('¬', '¬'),
+ ('®', '±'),
+ ('´', '´'),
+ ('¸', '¸'),
+ ('×', '×'),
+ ('÷', '÷'),
+ ('˂', '˅'),
+ ('˒', '˟'),
+ ('˥', '˫'),
+ ('˭', '˭'),
+ ('˯', '˿'),
+ ('͵', '͵'),
+ ('΄', '΅'),
+ ('϶', '϶'),
+ ('҂', '҂'),
+ ('֍', '֏'),
+ ('؆', '؈'),
+ ('؋', '؋'),
+ ('؎', '؏'),
+ ('۞', '۞'),
+ ('۩', '۩'),
+ ('۽', '۾'),
+ ('߶', '߶'),
+ ('߾', '߿'),
+ ('࢈', '࢈'),
+ ('৲', '৳'),
+ ('৺', '৻'),
+ ('૱', '૱'),
+ ('୰', '୰'),
+ ('௳', '௺'),
+ ('౿', '౿'),
+ ('൏', '൏'),
+ ('൹', '൹'),
+ ('฿', '฿'),
+ ('༁', '༃'),
+ ('༓', '༓'),
+ ('༕', '༗'),
+ ('༚', '༟'),
+ ('༴', '༴'),
+ ('༶', '༶'),
+ ('༸', '༸'),
+ ('྾', '࿅'),
+ ('࿇', '࿌'),
+ ('࿎', '࿏'),
+ ('࿕', '࿘'),
+ ('႞', '႟'),
+ ('᎐', '᎙'),
+ ('᙭', '᙭'),
+ ('៛', '៛'),
+ ('᥀', '᥀'),
+ ('᧞', '᧿'),
+ ('᭡', '᭪'),
+ ('᭴', '᭼'),
+ ('᾽', '᾽'),
+ ('᾿', '῁'),
+ ('῍', '῏'),
+ ('῝', '῟'),
+ ('῭', '`'),
+ ('´', '῾'),
+ ('⁄', '⁄'),
+ ('⁒', '⁒'),
+ ('⁺', '⁼'),
+ ('₊', '₌'),
+ ('₠', '⃀'),
+ ('℀', '℁'),
+ ('℃', '℆'),
+ ('℈', '℉'),
+ ('℔', '℔'),
+ ('№', '℘'),
+ ('℞', '℣'),
+ ('℥', '℥'),
+ ('℧', '℧'),
+ ('℩', '℩'),
+ ('℮', '℮'),
+ ('℺', '℻'),
+ ('⅀', '⅄'),
+ ('⅊', '⅍'),
+ ('⅏', '⅏'),
+ ('↊', '↋'),
+ ('←', '⌇'),
+ ('⌌', '⌨'),
+ ('⌫', '␦'),
+ ('⑀', '⑊'),
+ ('⒜', 'ⓩ'),
+ ('─', '❧'),
+ ('➔', '⟄'),
+ ('⟇', '⟥'),
+ ('⟰', '⦂'),
+ ('⦙', '⧗'),
+ ('⧜', '⧻'),
+ ('⧾', '⭳'),
+ ('⭶', '⮕'),
+ ('⮗', '⯿'),
+ ('⳥', '⳪'),
+ ('⹐', '⹑'),
+ ('⺀', '⺙'),
+ ('⺛', '⻳'),
+ ('⼀', '⿕'),
+ ('⿰', '⿻'),
+ ('〄', '〄'),
+ ('〒', '〓'),
+ ('〠', '〠'),
+ ('〶', '〷'),
+ ('〾', '〿'),
+ ('゛', '゜'),
+ ('㆐', '㆑'),
+ ('㆖', '㆟'),
+ ('㇀', '㇣'),
+ ('㈀', '㈞'),
+ ('㈪', '㉇'),
+ ('㉐', '㉐'),
+ ('㉠', '㉿'),
+ ('㊊', '㊰'),
+ ('㋀', '㏿'),
+ ('䷀', '䷿'),
+ ('꒐', '꓆'),
+ ('꜀', '꜖'),
+ ('꜠', '꜡'),
+ ('꞉', '꞊'),
+ ('꠨', '꠫'),
+ ('꠶', '꠹'),
+ ('꩷', '꩹'),
+ ('꭛', '꭛'),
+ ('꭪', '꭫'),
+ ('﬩', '﬩'),
+ ('﮲', '﯂'),
+ ('﵀', '﵏'),
+ ('﷏', '﷏'),
+ ('﷼', '﷿'),
+ ('﹢', '﹢'),
+ ('﹤', '﹦'),
+ ('﹩', '﹩'),
+ ('$', '$'),
+ ('+', '+'),
+ ('<', '>'),
+ ('^', '^'),
+ ('`', '`'),
+ ('|', '|'),
+ ('~', '~'),
+ ('¢', '₩'),
+ ('│', '○'),
+ ('', '�'),
+ ('𐄷', '𐄿'),
+ ('𐅹', '𐆉'),
+ ('𐆌', '𐆎'),
+ ('𐆐', '𐆜'),
+ ('𐆠', '𐆠'),
+ ('𐇐', '𐇼'),
+ ('𐡷', '𐡸'),
+ ('𐫈', '𐫈'),
+ ('𑜿', '𑜿'),
+ ('𑿕', '𑿱'),
+ ('𖬼', '𖬿'),
+ ('𖭅', '𖭅'),
+ ('𛲜', '𛲜'),
+ ('𜽐', '𜿃'),
+ ('𝀀', '𝃵'),
+ ('𝄀', '𝄦'),
+ ('𝄩', '𝅘𝅥𝅲'),
+ ('𝅪', '𝅬'),
+ ('𝆃', '𝆄'),
+ ('𝆌', '𝆩'),
+ ('𝆮', '𝇪'),
+ ('𝈀', '𝉁'),
+ ('𝉅', '𝉅'),
+ ('𝌀', '𝍖'),
+ ('𝛁', '𝛁'),
+ ('𝛛', '𝛛'),
+ ('𝛻', '𝛻'),
+ ('𝜕', '𝜕'),
+ ('𝜵', '𝜵'),
+ ('𝝏', '𝝏'),
+ ('𝝯', '𝝯'),
+ ('𝞉', '𝞉'),
+ ('𝞩', '𝞩'),
+ ('𝟃', '𝟃'),
+ ('𝠀', '𝧿'),
+ ('𝨷', '𝨺'),
+ ('𝩭', '𝩴'),
+ ('𝩶', '𝪃'),
+ ('𝪅', '𝪆'),
+ ('𞅏', '𞅏'),
+ ('𞋿', '𞋿'),
+ ('𞲬', '𞲬'),
+ ('𞲰', '𞲰'),
+ ('𞴮', '𞴮'),
+ ('𞻰', '𞻱'),
+ ('🀀', '🀫'),
+ ('🀰', '🂓'),
+ ('🂠', '🂮'),
+ ('🂱', '🂿'),
+ ('🃁', '🃏'),
+ ('🃑', '🃵'),
+ ('🄍', '🆭'),
+ ('🇦', '🈂'),
+ ('🈐', '🈻'),
+ ('🉀', '🉈'),
+ ('🉐', '🉑'),
+ ('🉠', '🉥'),
+ ('🌀', '🛗'),
+ ('🛜', '🛬'),
+ ('🛰', '🛼'),
+ ('🜀', '🝶'),
+ ('🝻', '🟙'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🠀', '🠋'),
+ ('🠐', '🡇'),
+ ('🡐', '🡙'),
+ ('🡠', '🢇'),
+ ('🢐', '🢭'),
+ ('🢰', '🢱'),
+ ('🤀', '🩓'),
+ ('🩠', '🩭'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+ ('🬀', '🮒'),
+ ('🮔', '🯊'),
+];
+
+pub const TITLECASE_LETTER: &'static [(char, char)] = &[
+ ('Dž', 'Dž'),
+ ('Lj', 'Lj'),
+ ('Nj', 'Nj'),
+ ('Dz', 'Dz'),
+ ('ᾈ', 'ᾏ'),
+ ('ᾘ', 'ᾟ'),
+ ('ᾨ', 'ᾯ'),
+ ('ᾼ', 'ᾼ'),
+ ('ῌ', 'ῌ'),
+ ('ῼ', 'ῼ'),
+];
+
+pub const UNASSIGNED: &'static [(char, char)] = &[
+ ('\u{378}', '\u{379}'),
+ ('\u{380}', '\u{383}'),
+ ('\u{38b}', '\u{38b}'),
+ ('\u{38d}', '\u{38d}'),
+ ('\u{3a2}', '\u{3a2}'),
+ ('\u{530}', '\u{530}'),
+ ('\u{557}', '\u{558}'),
+ ('\u{58b}', '\u{58c}'),
+ ('\u{590}', '\u{590}'),
+ ('\u{5c8}', '\u{5cf}'),
+ ('\u{5eb}', '\u{5ee}'),
+ ('\u{5f5}', '\u{5ff}'),
+ ('\u{70e}', '\u{70e}'),
+ ('\u{74b}', '\u{74c}'),
+ ('\u{7b2}', '\u{7bf}'),
+ ('\u{7fb}', '\u{7fc}'),
+ ('\u{82e}', '\u{82f}'),
+ ('\u{83f}', '\u{83f}'),
+ ('\u{85c}', '\u{85d}'),
+ ('\u{85f}', '\u{85f}'),
+ ('\u{86b}', '\u{86f}'),
+ ('\u{88f}', '\u{88f}'),
+ ('\u{892}', '\u{897}'),
+ ('\u{984}', '\u{984}'),
+ ('\u{98d}', '\u{98e}'),
+ ('\u{991}', '\u{992}'),
+ ('\u{9a9}', '\u{9a9}'),
+ ('\u{9b1}', '\u{9b1}'),
+ ('\u{9b3}', '\u{9b5}'),
+ ('\u{9ba}', '\u{9bb}'),
+ ('\u{9c5}', '\u{9c6}'),
+ ('\u{9c9}', '\u{9ca}'),
+ ('\u{9cf}', '\u{9d6}'),
+ ('\u{9d8}', '\u{9db}'),
+ ('\u{9de}', '\u{9de}'),
+ ('\u{9e4}', '\u{9e5}'),
+ ('\u{9ff}', '\u{a00}'),
+ ('\u{a04}', '\u{a04}'),
+ ('\u{a0b}', '\u{a0e}'),
+ ('\u{a11}', '\u{a12}'),
+ ('\u{a29}', '\u{a29}'),
+ ('\u{a31}', '\u{a31}'),
+ ('\u{a34}', '\u{a34}'),
+ ('\u{a37}', '\u{a37}'),
+ ('\u{a3a}', '\u{a3b}'),
+ ('\u{a3d}', '\u{a3d}'),
+ ('\u{a43}', '\u{a46}'),
+ ('\u{a49}', '\u{a4a}'),
+ ('\u{a4e}', '\u{a50}'),
+ ('\u{a52}', '\u{a58}'),
+ ('\u{a5d}', '\u{a5d}'),
+ ('\u{a5f}', '\u{a65}'),
+ ('\u{a77}', '\u{a80}'),
+ ('\u{a84}', '\u{a84}'),
+ ('\u{a8e}', '\u{a8e}'),
+ ('\u{a92}', '\u{a92}'),
+ ('\u{aa9}', '\u{aa9}'),
+ ('\u{ab1}', '\u{ab1}'),
+ ('\u{ab4}', '\u{ab4}'),
+ ('\u{aba}', '\u{abb}'),
+ ('\u{ac6}', '\u{ac6}'),
+ ('\u{aca}', '\u{aca}'),
+ ('\u{ace}', '\u{acf}'),
+ ('\u{ad1}', '\u{adf}'),
+ ('\u{ae4}', '\u{ae5}'),
+ ('\u{af2}', '\u{af8}'),
+ ('\u{b00}', '\u{b00}'),
+ ('\u{b04}', '\u{b04}'),
+ ('\u{b0d}', '\u{b0e}'),
+ ('\u{b11}', '\u{b12}'),
+ ('\u{b29}', '\u{b29}'),
+ ('\u{b31}', '\u{b31}'),
+ ('\u{b34}', '\u{b34}'),
+ ('\u{b3a}', '\u{b3b}'),
+ ('\u{b45}', '\u{b46}'),
+ ('\u{b49}', '\u{b4a}'),
+ ('\u{b4e}', '\u{b54}'),
+ ('\u{b58}', '\u{b5b}'),
+ ('\u{b5e}', '\u{b5e}'),
+ ('\u{b64}', '\u{b65}'),
+ ('\u{b78}', '\u{b81}'),
+ ('\u{b84}', '\u{b84}'),
+ ('\u{b8b}', '\u{b8d}'),
+ ('\u{b91}', '\u{b91}'),
+ ('\u{b96}', '\u{b98}'),
+ ('\u{b9b}', '\u{b9b}'),
+ ('\u{b9d}', '\u{b9d}'),
+ ('\u{ba0}', '\u{ba2}'),
+ ('\u{ba5}', '\u{ba7}'),
+ ('\u{bab}', '\u{bad}'),
+ ('\u{bba}', '\u{bbd}'),
+ ('\u{bc3}', '\u{bc5}'),
+ ('\u{bc9}', '\u{bc9}'),
+ ('\u{bce}', '\u{bcf}'),
+ ('\u{bd1}', '\u{bd6}'),
+ ('\u{bd8}', '\u{be5}'),
+ ('\u{bfb}', '\u{bff}'),
+ ('\u{c0d}', '\u{c0d}'),
+ ('\u{c11}', '\u{c11}'),
+ ('\u{c29}', '\u{c29}'),
+ ('\u{c3a}', '\u{c3b}'),
+ ('\u{c45}', '\u{c45}'),
+ ('\u{c49}', '\u{c49}'),
+ ('\u{c4e}', '\u{c54}'),
+ ('\u{c57}', '\u{c57}'),
+ ('\u{c5b}', '\u{c5c}'),
+ ('\u{c5e}', '\u{c5f}'),
+ ('\u{c64}', '\u{c65}'),
+ ('\u{c70}', '\u{c76}'),
+ ('\u{c8d}', '\u{c8d}'),
+ ('\u{c91}', '\u{c91}'),
+ ('\u{ca9}', '\u{ca9}'),
+ ('\u{cb4}', '\u{cb4}'),
+ ('\u{cba}', '\u{cbb}'),
+ ('\u{cc5}', '\u{cc5}'),
+ ('\u{cc9}', '\u{cc9}'),
+ ('\u{cce}', '\u{cd4}'),
+ ('\u{cd7}', '\u{cdc}'),
+ ('\u{cdf}', '\u{cdf}'),
+ ('\u{ce4}', '\u{ce5}'),
+ ('\u{cf0}', '\u{cf0}'),
+ ('\u{cf4}', '\u{cff}'),
+ ('\u{d0d}', '\u{d0d}'),
+ ('\u{d11}', '\u{d11}'),
+ ('\u{d45}', '\u{d45}'),
+ ('\u{d49}', '\u{d49}'),
+ ('\u{d50}', '\u{d53}'),
+ ('\u{d64}', '\u{d65}'),
+ ('\u{d80}', '\u{d80}'),
+ ('\u{d84}', '\u{d84}'),
+ ('\u{d97}', '\u{d99}'),
+ ('\u{db2}', '\u{db2}'),
+ ('\u{dbc}', '\u{dbc}'),
+ ('\u{dbe}', '\u{dbf}'),
+ ('\u{dc7}', '\u{dc9}'),
+ ('\u{dcb}', '\u{dce}'),
+ ('\u{dd5}', '\u{dd5}'),
+ ('\u{dd7}', '\u{dd7}'),
+ ('\u{de0}', '\u{de5}'),
+ ('\u{df0}', '\u{df1}'),
+ ('\u{df5}', '\u{e00}'),
+ ('\u{e3b}', '\u{e3e}'),
+ ('\u{e5c}', '\u{e80}'),
+ ('\u{e83}', '\u{e83}'),
+ ('\u{e85}', '\u{e85}'),
+ ('\u{e8b}', '\u{e8b}'),
+ ('\u{ea4}', '\u{ea4}'),
+ ('\u{ea6}', '\u{ea6}'),
+ ('\u{ebe}', '\u{ebf}'),
+ ('\u{ec5}', '\u{ec5}'),
+ ('\u{ec7}', '\u{ec7}'),
+ ('\u{ecf}', '\u{ecf}'),
+ ('\u{eda}', '\u{edb}'),
+ ('\u{ee0}', '\u{eff}'),
+ ('\u{f48}', '\u{f48}'),
+ ('\u{f6d}', '\u{f70}'),
+ ('\u{f98}', '\u{f98}'),
+ ('\u{fbd}', '\u{fbd}'),
+ ('\u{fcd}', '\u{fcd}'),
+ ('\u{fdb}', '\u{fff}'),
+ ('\u{10c6}', '\u{10c6}'),
+ ('\u{10c8}', '\u{10cc}'),
+ ('\u{10ce}', '\u{10cf}'),
+ ('\u{1249}', '\u{1249}'),
+ ('\u{124e}', '\u{124f}'),
+ ('\u{1257}', '\u{1257}'),
+ ('\u{1259}', '\u{1259}'),
+ ('\u{125e}', '\u{125f}'),
+ ('\u{1289}', '\u{1289}'),
+ ('\u{128e}', '\u{128f}'),
+ ('\u{12b1}', '\u{12b1}'),
+ ('\u{12b6}', '\u{12b7}'),
+ ('\u{12bf}', '\u{12bf}'),
+ ('\u{12c1}', '\u{12c1}'),
+ ('\u{12c6}', '\u{12c7}'),
+ ('\u{12d7}', '\u{12d7}'),
+ ('\u{1311}', '\u{1311}'),
+ ('\u{1316}', '\u{1317}'),
+ ('\u{135b}', '\u{135c}'),
+ ('\u{137d}', '\u{137f}'),
+ ('\u{139a}', '\u{139f}'),
+ ('\u{13f6}', '\u{13f7}'),
+ ('\u{13fe}', '\u{13ff}'),
+ ('\u{169d}', '\u{169f}'),
+ ('\u{16f9}', '\u{16ff}'),
+ ('\u{1716}', '\u{171e}'),
+ ('\u{1737}', '\u{173f}'),
+ ('\u{1754}', '\u{175f}'),
+ ('\u{176d}', '\u{176d}'),
+ ('\u{1771}', '\u{1771}'),
+ ('\u{1774}', '\u{177f}'),
+ ('\u{17de}', '\u{17df}'),
+ ('\u{17ea}', '\u{17ef}'),
+ ('\u{17fa}', '\u{17ff}'),
+ ('\u{181a}', '\u{181f}'),
+ ('\u{1879}', '\u{187f}'),
+ ('\u{18ab}', '\u{18af}'),
+ ('\u{18f6}', '\u{18ff}'),
+ ('\u{191f}', '\u{191f}'),
+ ('\u{192c}', '\u{192f}'),
+ ('\u{193c}', '\u{193f}'),
+ ('\u{1941}', '\u{1943}'),
+ ('\u{196e}', '\u{196f}'),
+ ('\u{1975}', '\u{197f}'),
+ ('\u{19ac}', '\u{19af}'),
+ ('\u{19ca}', '\u{19cf}'),
+ ('\u{19db}', '\u{19dd}'),
+ ('\u{1a1c}', '\u{1a1d}'),
+ ('\u{1a5f}', '\u{1a5f}'),
+ ('\u{1a7d}', '\u{1a7e}'),
+ ('\u{1a8a}', '\u{1a8f}'),
+ ('\u{1a9a}', '\u{1a9f}'),
+ ('\u{1aae}', '\u{1aaf}'),
+ ('\u{1acf}', '\u{1aff}'),
+ ('\u{1b4d}', '\u{1b4f}'),
+ ('\u{1b7f}', '\u{1b7f}'),
+ ('\u{1bf4}', '\u{1bfb}'),
+ ('\u{1c38}', '\u{1c3a}'),
+ ('\u{1c4a}', '\u{1c4c}'),
+ ('\u{1c89}', '\u{1c8f}'),
+ ('\u{1cbb}', '\u{1cbc}'),
+ ('\u{1cc8}', '\u{1ccf}'),
+ ('\u{1cfb}', '\u{1cff}'),
+ ('\u{1f16}', '\u{1f17}'),
+ ('\u{1f1e}', '\u{1f1f}'),
+ ('\u{1f46}', '\u{1f47}'),
+ ('\u{1f4e}', '\u{1f4f}'),
+ ('\u{1f58}', '\u{1f58}'),
+ ('\u{1f5a}', '\u{1f5a}'),
+ ('\u{1f5c}', '\u{1f5c}'),
+ ('\u{1f5e}', '\u{1f5e}'),
+ ('\u{1f7e}', '\u{1f7f}'),
+ ('\u{1fb5}', '\u{1fb5}'),
+ ('\u{1fc5}', '\u{1fc5}'),
+ ('\u{1fd4}', '\u{1fd5}'),
+ ('\u{1fdc}', '\u{1fdc}'),
+ ('\u{1ff0}', '\u{1ff1}'),
+ ('\u{1ff5}', '\u{1ff5}'),
+ ('\u{1fff}', '\u{1fff}'),
+ ('\u{2065}', '\u{2065}'),
+ ('\u{2072}', '\u{2073}'),
+ ('\u{208f}', '\u{208f}'),
+ ('\u{209d}', '\u{209f}'),
+ ('\u{20c1}', '\u{20cf}'),
+ ('\u{20f1}', '\u{20ff}'),
+ ('\u{218c}', '\u{218f}'),
+ ('\u{2427}', '\u{243f}'),
+ ('\u{244b}', '\u{245f}'),
+ ('\u{2b74}', '\u{2b75}'),
+ ('\u{2b96}', '\u{2b96}'),
+ ('\u{2cf4}', '\u{2cf8}'),
+ ('\u{2d26}', '\u{2d26}'),
+ ('\u{2d28}', '\u{2d2c}'),
+ ('\u{2d2e}', '\u{2d2f}'),
+ ('\u{2d68}', '\u{2d6e}'),
+ ('\u{2d71}', '\u{2d7e}'),
+ ('\u{2d97}', '\u{2d9f}'),
+ ('\u{2da7}', '\u{2da7}'),
+ ('\u{2daf}', '\u{2daf}'),
+ ('\u{2db7}', '\u{2db7}'),
+ ('\u{2dbf}', '\u{2dbf}'),
+ ('\u{2dc7}', '\u{2dc7}'),
+ ('\u{2dcf}', '\u{2dcf}'),
+ ('\u{2dd7}', '\u{2dd7}'),
+ ('\u{2ddf}', '\u{2ddf}'),
+ ('\u{2e5e}', '\u{2e7f}'),
+ ('\u{2e9a}', '\u{2e9a}'),
+ ('\u{2ef4}', '\u{2eff}'),
+ ('\u{2fd6}', '\u{2fef}'),
+ ('\u{2ffc}', '\u{2fff}'),
+ ('\u{3040}', '\u{3040}'),
+ ('\u{3097}', '\u{3098}'),
+ ('\u{3100}', '\u{3104}'),
+ ('\u{3130}', '\u{3130}'),
+ ('\u{318f}', '\u{318f}'),
+ ('\u{31e4}', '\u{31ef}'),
+ ('\u{321f}', '\u{321f}'),
+ ('\u{a48d}', '\u{a48f}'),
+ ('\u{a4c7}', '\u{a4cf}'),
+ ('\u{a62c}', '\u{a63f}'),
+ ('\u{a6f8}', '\u{a6ff}'),
+ ('\u{a7cb}', '\u{a7cf}'),
+ ('\u{a7d2}', '\u{a7d2}'),
+ ('\u{a7d4}', '\u{a7d4}'),
+ ('\u{a7da}', '\u{a7f1}'),
+ ('\u{a82d}', '\u{a82f}'),
+ ('\u{a83a}', '\u{a83f}'),
+ ('\u{a878}', '\u{a87f}'),
+ ('\u{a8c6}', '\u{a8cd}'),
+ ('\u{a8da}', '\u{a8df}'),
+ ('\u{a954}', '\u{a95e}'),
+ ('\u{a97d}', '\u{a97f}'),
+ ('\u{a9ce}', '\u{a9ce}'),
+ ('\u{a9da}', '\u{a9dd}'),
+ ('\u{a9ff}', '\u{a9ff}'),
+ ('\u{aa37}', '\u{aa3f}'),
+ ('\u{aa4e}', '\u{aa4f}'),
+ ('\u{aa5a}', '\u{aa5b}'),
+ ('\u{aac3}', '\u{aada}'),
+ ('\u{aaf7}', '\u{ab00}'),
+ ('\u{ab07}', '\u{ab08}'),
+ ('\u{ab0f}', '\u{ab10}'),
+ ('\u{ab17}', '\u{ab1f}'),
+ ('\u{ab27}', '\u{ab27}'),
+ ('\u{ab2f}', '\u{ab2f}'),
+ ('\u{ab6c}', '\u{ab6f}'),
+ ('\u{abee}', '\u{abef}'),
+ ('\u{abfa}', '\u{abff}'),
+ ('\u{d7a4}', '\u{d7af}'),
+ ('\u{d7c7}', '\u{d7ca}'),
+ ('\u{d7fc}', '\u{d7ff}'),
+ ('\u{fa6e}', '\u{fa6f}'),
+ ('\u{fada}', '\u{faff}'),
+ ('\u{fb07}', '\u{fb12}'),
+ ('\u{fb18}', '\u{fb1c}'),
+ ('\u{fb37}', '\u{fb37}'),
+ ('\u{fb3d}', '\u{fb3d}'),
+ ('\u{fb3f}', '\u{fb3f}'),
+ ('\u{fb42}', '\u{fb42}'),
+ ('\u{fb45}', '\u{fb45}'),
+ ('\u{fbc3}', '\u{fbd2}'),
+ ('\u{fd90}', '\u{fd91}'),
+ ('\u{fdc8}', '\u{fdce}'),
+ ('\u{fdd0}', '\u{fdef}'),
+ ('\u{fe1a}', '\u{fe1f}'),
+ ('\u{fe53}', '\u{fe53}'),
+ ('\u{fe67}', '\u{fe67}'),
+ ('\u{fe6c}', '\u{fe6f}'),
+ ('\u{fe75}', '\u{fe75}'),
+ ('\u{fefd}', '\u{fefe}'),
+ ('\u{ff00}', '\u{ff00}'),
+ ('\u{ffbf}', '\u{ffc1}'),
+ ('\u{ffc8}', '\u{ffc9}'),
+ ('\u{ffd0}', '\u{ffd1}'),
+ ('\u{ffd8}', '\u{ffd9}'),
+ ('\u{ffdd}', '\u{ffdf}'),
+ ('\u{ffe7}', '\u{ffe7}'),
+ ('\u{ffef}', '\u{fff8}'),
+ ('\u{fffe}', '\u{ffff}'),
+ ('\u{1000c}', '\u{1000c}'),
+ ('\u{10027}', '\u{10027}'),
+ ('\u{1003b}', '\u{1003b}'),
+ ('\u{1003e}', '\u{1003e}'),
+ ('\u{1004e}', '\u{1004f}'),
+ ('\u{1005e}', '\u{1007f}'),
+ ('\u{100fb}', '\u{100ff}'),
+ ('\u{10103}', '\u{10106}'),
+ ('\u{10134}', '\u{10136}'),
+ ('\u{1018f}', '\u{1018f}'),
+ ('\u{1019d}', '\u{1019f}'),
+ ('\u{101a1}', '\u{101cf}'),
+ ('\u{101fe}', '\u{1027f}'),
+ ('\u{1029d}', '\u{1029f}'),
+ ('\u{102d1}', '\u{102df}'),
+ ('\u{102fc}', '\u{102ff}'),
+ ('\u{10324}', '\u{1032c}'),
+ ('\u{1034b}', '\u{1034f}'),
+ ('\u{1037b}', '\u{1037f}'),
+ ('\u{1039e}', '\u{1039e}'),
+ ('\u{103c4}', '\u{103c7}'),
+ ('\u{103d6}', '\u{103ff}'),
+ ('\u{1049e}', '\u{1049f}'),
+ ('\u{104aa}', '\u{104af}'),
+ ('\u{104d4}', '\u{104d7}'),
+ ('\u{104fc}', '\u{104ff}'),
+ ('\u{10528}', '\u{1052f}'),
+ ('\u{10564}', '\u{1056e}'),
+ ('\u{1057b}', '\u{1057b}'),
+ ('\u{1058b}', '\u{1058b}'),
+ ('\u{10593}', '\u{10593}'),
+ ('\u{10596}', '\u{10596}'),
+ ('\u{105a2}', '\u{105a2}'),
+ ('\u{105b2}', '\u{105b2}'),
+ ('\u{105ba}', '\u{105ba}'),
+ ('\u{105bd}', '\u{105ff}'),
+ ('\u{10737}', '\u{1073f}'),
+ ('\u{10756}', '\u{1075f}'),
+ ('\u{10768}', '\u{1077f}'),
+ ('\u{10786}', '\u{10786}'),
+ ('\u{107b1}', '\u{107b1}'),
+ ('\u{107bb}', '\u{107ff}'),
+ ('\u{10806}', '\u{10807}'),
+ ('\u{10809}', '\u{10809}'),
+ ('\u{10836}', '\u{10836}'),
+ ('\u{10839}', '\u{1083b}'),
+ ('\u{1083d}', '\u{1083e}'),
+ ('\u{10856}', '\u{10856}'),
+ ('\u{1089f}', '\u{108a6}'),
+ ('\u{108b0}', '\u{108df}'),
+ ('\u{108f3}', '\u{108f3}'),
+ ('\u{108f6}', '\u{108fa}'),
+ ('\u{1091c}', '\u{1091e}'),
+ ('\u{1093a}', '\u{1093e}'),
+ ('\u{10940}', '\u{1097f}'),
+ ('\u{109b8}', '\u{109bb}'),
+ ('\u{109d0}', '\u{109d1}'),
+ ('\u{10a04}', '\u{10a04}'),
+ ('\u{10a07}', '\u{10a0b}'),
+ ('\u{10a14}', '\u{10a14}'),
+ ('\u{10a18}', '\u{10a18}'),
+ ('\u{10a36}', '\u{10a37}'),
+ ('\u{10a3b}', '\u{10a3e}'),
+ ('\u{10a49}', '\u{10a4f}'),
+ ('\u{10a59}', '\u{10a5f}'),
+ ('\u{10aa0}', '\u{10abf}'),
+ ('\u{10ae7}', '\u{10aea}'),
+ ('\u{10af7}', '\u{10aff}'),
+ ('\u{10b36}', '\u{10b38}'),
+ ('\u{10b56}', '\u{10b57}'),
+ ('\u{10b73}', '\u{10b77}'),
+ ('\u{10b92}', '\u{10b98}'),
+ ('\u{10b9d}', '\u{10ba8}'),
+ ('\u{10bb0}', '\u{10bff}'),
+ ('\u{10c49}', '\u{10c7f}'),
+ ('\u{10cb3}', '\u{10cbf}'),
+ ('\u{10cf3}', '\u{10cf9}'),
+ ('\u{10d28}', '\u{10d2f}'),
+ ('\u{10d3a}', '\u{10e5f}'),
+ ('\u{10e7f}', '\u{10e7f}'),
+ ('\u{10eaa}', '\u{10eaa}'),
+ ('\u{10eae}', '\u{10eaf}'),
+ ('\u{10eb2}', '\u{10efc}'),
+ ('\u{10f28}', '\u{10f2f}'),
+ ('\u{10f5a}', '\u{10f6f}'),
+ ('\u{10f8a}', '\u{10faf}'),
+ ('\u{10fcc}', '\u{10fdf}'),
+ ('\u{10ff7}', '\u{10fff}'),
+ ('\u{1104e}', '\u{11051}'),
+ ('\u{11076}', '\u{1107e}'),
+ ('\u{110c3}', '\u{110cc}'),
+ ('\u{110ce}', '\u{110cf}'),
+ ('\u{110e9}', '\u{110ef}'),
+ ('\u{110fa}', '\u{110ff}'),
+ ('\u{11135}', '\u{11135}'),
+ ('\u{11148}', '\u{1114f}'),
+ ('\u{11177}', '\u{1117f}'),
+ ('\u{111e0}', '\u{111e0}'),
+ ('\u{111f5}', '\u{111ff}'),
+ ('\u{11212}', '\u{11212}'),
+ ('\u{11242}', '\u{1127f}'),
+ ('\u{11287}', '\u{11287}'),
+ ('\u{11289}', '\u{11289}'),
+ ('\u{1128e}', '\u{1128e}'),
+ ('\u{1129e}', '\u{1129e}'),
+ ('\u{112aa}', '\u{112af}'),
+ ('\u{112eb}', '\u{112ef}'),
+ ('\u{112fa}', '\u{112ff}'),
+ ('\u{11304}', '\u{11304}'),
+ ('\u{1130d}', '\u{1130e}'),
+ ('\u{11311}', '\u{11312}'),
+ ('\u{11329}', '\u{11329}'),
+ ('\u{11331}', '\u{11331}'),
+ ('\u{11334}', '\u{11334}'),
+ ('\u{1133a}', '\u{1133a}'),
+ ('\u{11345}', '\u{11346}'),
+ ('\u{11349}', '\u{1134a}'),
+ ('\u{1134e}', '\u{1134f}'),
+ ('\u{11351}', '\u{11356}'),
+ ('\u{11358}', '\u{1135c}'),
+ ('\u{11364}', '\u{11365}'),
+ ('\u{1136d}', '\u{1136f}'),
+ ('\u{11375}', '\u{113ff}'),
+ ('\u{1145c}', '\u{1145c}'),
+ ('\u{11462}', '\u{1147f}'),
+ ('\u{114c8}', '\u{114cf}'),
+ ('\u{114da}', '\u{1157f}'),
+ ('\u{115b6}', '\u{115b7}'),
+ ('\u{115de}', '\u{115ff}'),
+ ('\u{11645}', '\u{1164f}'),
+ ('\u{1165a}', '\u{1165f}'),
+ ('\u{1166d}', '\u{1167f}'),
+ ('\u{116ba}', '\u{116bf}'),
+ ('\u{116ca}', '\u{116ff}'),
+ ('\u{1171b}', '\u{1171c}'),
+ ('\u{1172c}', '\u{1172f}'),
+ ('\u{11747}', '\u{117ff}'),
+ ('\u{1183c}', '\u{1189f}'),
+ ('\u{118f3}', '\u{118fe}'),
+ ('\u{11907}', '\u{11908}'),
+ ('\u{1190a}', '\u{1190b}'),
+ ('\u{11914}', '\u{11914}'),
+ ('\u{11917}', '\u{11917}'),
+ ('\u{11936}', '\u{11936}'),
+ ('\u{11939}', '\u{1193a}'),
+ ('\u{11947}', '\u{1194f}'),
+ ('\u{1195a}', '\u{1199f}'),
+ ('\u{119a8}', '\u{119a9}'),
+ ('\u{119d8}', '\u{119d9}'),
+ ('\u{119e5}', '\u{119ff}'),
+ ('\u{11a48}', '\u{11a4f}'),
+ ('\u{11aa3}', '\u{11aaf}'),
+ ('\u{11af9}', '\u{11aff}'),
+ ('\u{11b0a}', '\u{11bff}'),
+ ('\u{11c09}', '\u{11c09}'),
+ ('\u{11c37}', '\u{11c37}'),
+ ('\u{11c46}', '\u{11c4f}'),
+ ('\u{11c6d}', '\u{11c6f}'),
+ ('\u{11c90}', '\u{11c91}'),
+ ('\u{11ca8}', '\u{11ca8}'),
+ ('\u{11cb7}', '\u{11cff}'),
+ ('\u{11d07}', '\u{11d07}'),
+ ('\u{11d0a}', '\u{11d0a}'),
+ ('\u{11d37}', '\u{11d39}'),
+ ('\u{11d3b}', '\u{11d3b}'),
+ ('\u{11d3e}', '\u{11d3e}'),
+ ('\u{11d48}', '\u{11d4f}'),
+ ('\u{11d5a}', '\u{11d5f}'),
+ ('\u{11d66}', '\u{11d66}'),
+ ('\u{11d69}', '\u{11d69}'),
+ ('\u{11d8f}', '\u{11d8f}'),
+ ('\u{11d92}', '\u{11d92}'),
+ ('\u{11d99}', '\u{11d9f}'),
+ ('\u{11daa}', '\u{11edf}'),
+ ('\u{11ef9}', '\u{11eff}'),
+ ('\u{11f11}', '\u{11f11}'),
+ ('\u{11f3b}', '\u{11f3d}'),
+ ('\u{11f5a}', '\u{11faf}'),
+ ('\u{11fb1}', '\u{11fbf}'),
+ ('\u{11ff2}', '\u{11ffe}'),
+ ('\u{1239a}', '\u{123ff}'),
+ ('\u{1246f}', '\u{1246f}'),
+ ('\u{12475}', '\u{1247f}'),
+ ('\u{12544}', '\u{12f8f}'),
+ ('\u{12ff3}', '\u{12fff}'),
+ ('\u{13456}', '\u{143ff}'),
+ ('\u{14647}', '\u{167ff}'),
+ ('\u{16a39}', '\u{16a3f}'),
+ ('\u{16a5f}', '\u{16a5f}'),
+ ('\u{16a6a}', '\u{16a6d}'),
+ ('\u{16abf}', '\u{16abf}'),
+ ('\u{16aca}', '\u{16acf}'),
+ ('\u{16aee}', '\u{16aef}'),
+ ('\u{16af6}', '\u{16aff}'),
+ ('\u{16b46}', '\u{16b4f}'),
+ ('\u{16b5a}', '\u{16b5a}'),
+ ('\u{16b62}', '\u{16b62}'),
+ ('\u{16b78}', '\u{16b7c}'),
+ ('\u{16b90}', '\u{16e3f}'),
+ ('\u{16e9b}', '\u{16eff}'),
+ ('\u{16f4b}', '\u{16f4e}'),
+ ('\u{16f88}', '\u{16f8e}'),
+ ('\u{16fa0}', '\u{16fdf}'),
+ ('\u{16fe5}', '\u{16fef}'),
+ ('\u{16ff2}', '\u{16fff}'),
+ ('\u{187f8}', '\u{187ff}'),
+ ('\u{18cd6}', '\u{18cff}'),
+ ('\u{18d09}', '\u{1afef}'),
+ ('\u{1aff4}', '\u{1aff4}'),
+ ('\u{1affc}', '\u{1affc}'),
+ ('\u{1afff}', '\u{1afff}'),
+ ('\u{1b123}', '\u{1b131}'),
+ ('\u{1b133}', '\u{1b14f}'),
+ ('\u{1b153}', '\u{1b154}'),
+ ('\u{1b156}', '\u{1b163}'),
+ ('\u{1b168}', '\u{1b16f}'),
+ ('\u{1b2fc}', '\u{1bbff}'),
+ ('\u{1bc6b}', '\u{1bc6f}'),
+ ('\u{1bc7d}', '\u{1bc7f}'),
+ ('\u{1bc89}', '\u{1bc8f}'),
+ ('\u{1bc9a}', '\u{1bc9b}'),
+ ('\u{1bca4}', '\u{1ceff}'),
+ ('\u{1cf2e}', '\u{1cf2f}'),
+ ('\u{1cf47}', '\u{1cf4f}'),
+ ('\u{1cfc4}', '\u{1cfff}'),
+ ('\u{1d0f6}', '\u{1d0ff}'),
+ ('\u{1d127}', '\u{1d128}'),
+ ('\u{1d1eb}', '\u{1d1ff}'),
+ ('\u{1d246}', '\u{1d2bf}'),
+ ('\u{1d2d4}', '\u{1d2df}'),
+ ('\u{1d2f4}', '\u{1d2ff}'),
+ ('\u{1d357}', '\u{1d35f}'),
+ ('\u{1d379}', '\u{1d3ff}'),
+ ('\u{1d455}', '\u{1d455}'),
+ ('\u{1d49d}', '\u{1d49d}'),
+ ('\u{1d4a0}', '\u{1d4a1}'),
+ ('\u{1d4a3}', '\u{1d4a4}'),
+ ('\u{1d4a7}', '\u{1d4a8}'),
+ ('\u{1d4ad}', '\u{1d4ad}'),
+ ('\u{1d4ba}', '\u{1d4ba}'),
+ ('\u{1d4bc}', '\u{1d4bc}'),
+ ('\u{1d4c4}', '\u{1d4c4}'),
+ ('\u{1d506}', '\u{1d506}'),
+ ('\u{1d50b}', '\u{1d50c}'),
+ ('\u{1d515}', '\u{1d515}'),
+ ('\u{1d51d}', '\u{1d51d}'),
+ ('\u{1d53a}', '\u{1d53a}'),
+ ('\u{1d53f}', '\u{1d53f}'),
+ ('\u{1d545}', '\u{1d545}'),
+ ('\u{1d547}', '\u{1d549}'),
+ ('\u{1d551}', '\u{1d551}'),
+ ('\u{1d6a6}', '\u{1d6a7}'),
+ ('\u{1d7cc}', '\u{1d7cd}'),
+ ('\u{1da8c}', '\u{1da9a}'),
+ ('\u{1daa0}', '\u{1daa0}'),
+ ('\u{1dab0}', '\u{1deff}'),
+ ('\u{1df1f}', '\u{1df24}'),
+ ('\u{1df2b}', '\u{1dfff}'),
+ ('\u{1e007}', '\u{1e007}'),
+ ('\u{1e019}', '\u{1e01a}'),
+ ('\u{1e022}', '\u{1e022}'),
+ ('\u{1e025}', '\u{1e025}'),
+ ('\u{1e02b}', '\u{1e02f}'),
+ ('\u{1e06e}', '\u{1e08e}'),
+ ('\u{1e090}', '\u{1e0ff}'),
+ ('\u{1e12d}', '\u{1e12f}'),
+ ('\u{1e13e}', '\u{1e13f}'),
+ ('\u{1e14a}', '\u{1e14d}'),
+ ('\u{1e150}', '\u{1e28f}'),
+ ('\u{1e2af}', '\u{1e2bf}'),
+ ('\u{1e2fa}', '\u{1e2fe}'),
+ ('\u{1e300}', '\u{1e4cf}'),
+ ('\u{1e4fa}', '\u{1e7df}'),
+ ('\u{1e7e7}', '\u{1e7e7}'),
+ ('\u{1e7ec}', '\u{1e7ec}'),
+ ('\u{1e7ef}', '\u{1e7ef}'),
+ ('\u{1e7ff}', '\u{1e7ff}'),
+ ('\u{1e8c5}', '\u{1e8c6}'),
+ ('\u{1e8d7}', '\u{1e8ff}'),
+ ('\u{1e94c}', '\u{1e94f}'),
+ ('\u{1e95a}', '\u{1e95d}'),
+ ('\u{1e960}', '\u{1ec70}'),
+ ('\u{1ecb5}', '\u{1ed00}'),
+ ('\u{1ed3e}', '\u{1edff}'),
+ ('\u{1ee04}', '\u{1ee04}'),
+ ('\u{1ee20}', '\u{1ee20}'),
+ ('\u{1ee23}', '\u{1ee23}'),
+ ('\u{1ee25}', '\u{1ee26}'),
+ ('\u{1ee28}', '\u{1ee28}'),
+ ('\u{1ee33}', '\u{1ee33}'),
+ ('\u{1ee38}', '\u{1ee38}'),
+ ('\u{1ee3a}', '\u{1ee3a}'),
+ ('\u{1ee3c}', '\u{1ee41}'),
+ ('\u{1ee43}', '\u{1ee46}'),
+ ('\u{1ee48}', '\u{1ee48}'),
+ ('\u{1ee4a}', '\u{1ee4a}'),
+ ('\u{1ee4c}', '\u{1ee4c}'),
+ ('\u{1ee50}', '\u{1ee50}'),
+ ('\u{1ee53}', '\u{1ee53}'),
+ ('\u{1ee55}', '\u{1ee56}'),
+ ('\u{1ee58}', '\u{1ee58}'),
+ ('\u{1ee5a}', '\u{1ee5a}'),
+ ('\u{1ee5c}', '\u{1ee5c}'),
+ ('\u{1ee5e}', '\u{1ee5e}'),
+ ('\u{1ee60}', '\u{1ee60}'),
+ ('\u{1ee63}', '\u{1ee63}'),
+ ('\u{1ee65}', '\u{1ee66}'),
+ ('\u{1ee6b}', '\u{1ee6b}'),
+ ('\u{1ee73}', '\u{1ee73}'),
+ ('\u{1ee78}', '\u{1ee78}'),
+ ('\u{1ee7d}', '\u{1ee7d}'),
+ ('\u{1ee7f}', '\u{1ee7f}'),
+ ('\u{1ee8a}', '\u{1ee8a}'),
+ ('\u{1ee9c}', '\u{1eea0}'),
+ ('\u{1eea4}', '\u{1eea4}'),
+ ('\u{1eeaa}', '\u{1eeaa}'),
+ ('\u{1eebc}', '\u{1eeef}'),
+ ('\u{1eef2}', '\u{1efff}'),
+ ('\u{1f02c}', '\u{1f02f}'),
+ ('\u{1f094}', '\u{1f09f}'),
+ ('\u{1f0af}', '\u{1f0b0}'),
+ ('\u{1f0c0}', '\u{1f0c0}'),
+ ('\u{1f0d0}', '\u{1f0d0}'),
+ ('\u{1f0f6}', '\u{1f0ff}'),
+ ('\u{1f1ae}', '\u{1f1e5}'),
+ ('\u{1f203}', '\u{1f20f}'),
+ ('\u{1f23c}', '\u{1f23f}'),
+ ('\u{1f249}', '\u{1f24f}'),
+ ('\u{1f252}', '\u{1f25f}'),
+ ('\u{1f266}', '\u{1f2ff}'),
+ ('\u{1f6d8}', '\u{1f6db}'),
+ ('\u{1f6ed}', '\u{1f6ef}'),
+ ('\u{1f6fd}', '\u{1f6ff}'),
+ ('\u{1f777}', '\u{1f77a}'),
+ ('\u{1f7da}', '\u{1f7df}'),
+ ('\u{1f7ec}', '\u{1f7ef}'),
+ ('\u{1f7f1}', '\u{1f7ff}'),
+ ('\u{1f80c}', '\u{1f80f}'),
+ ('\u{1f848}', '\u{1f84f}'),
+ ('\u{1f85a}', '\u{1f85f}'),
+ ('\u{1f888}', '\u{1f88f}'),
+ ('\u{1f8ae}', '\u{1f8af}'),
+ ('\u{1f8b2}', '\u{1f8ff}'),
+ ('\u{1fa54}', '\u{1fa5f}'),
+ ('\u{1fa6e}', '\u{1fa6f}'),
+ ('\u{1fa7d}', '\u{1fa7f}'),
+ ('\u{1fa89}', '\u{1fa8f}'),
+ ('\u{1fabe}', '\u{1fabe}'),
+ ('\u{1fac6}', '\u{1facd}'),
+ ('\u{1fadc}', '\u{1fadf}'),
+ ('\u{1fae9}', '\u{1faef}'),
+ ('\u{1faf9}', '\u{1faff}'),
+ ('\u{1fb93}', '\u{1fb93}'),
+ ('\u{1fbcb}', '\u{1fbef}'),
+ ('\u{1fbfa}', '\u{1ffff}'),
+ ('\u{2a6e0}', '\u{2a6ff}'),
+ ('\u{2b73a}', '\u{2b73f}'),
+ ('\u{2b81e}', '\u{2b81f}'),
+ ('\u{2cea2}', '\u{2ceaf}'),
+ ('\u{2ebe1}', '\u{2f7ff}'),
+ ('\u{2fa1e}', '\u{2ffff}'),
+ ('\u{3134b}', '\u{3134f}'),
+ ('\u{323b0}', '\u{e0000}'),
+ ('\u{e0002}', '\u{e001f}'),
+ ('\u{e0080}', '\u{e00ff}'),
+ ('\u{e01f0}', '\u{effff}'),
+ ('\u{ffffe}', '\u{fffff}'),
+ ('\u{10fffe}', '\u{10ffff}'),
+];
+
+pub const UPPERCASE_LETTER: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('À', 'Ö'),
+ ('Ø', 'Þ'),
+ ('Ā', 'Ā'),
+ ('Ă', 'Ă'),
+ ('Ą', 'Ą'),
+ ('Ć', 'Ć'),
+ ('Ĉ', 'Ĉ'),
+ ('Ċ', 'Ċ'),
+ ('Č', 'Č'),
+ ('Ď', 'Ď'),
+ ('Đ', 'Đ'),
+ ('Ē', 'Ē'),
+ ('Ĕ', 'Ĕ'),
+ ('Ė', 'Ė'),
+ ('Ę', 'Ę'),
+ ('Ě', 'Ě'),
+ ('Ĝ', 'Ĝ'),
+ ('Ğ', 'Ğ'),
+ ('Ġ', 'Ġ'),
+ ('Ģ', 'Ģ'),
+ ('Ĥ', 'Ĥ'),
+ ('Ħ', 'Ħ'),
+ ('Ĩ', 'Ĩ'),
+ ('Ī', 'Ī'),
+ ('Ĭ', 'Ĭ'),
+ ('Į', 'Į'),
+ ('İ', 'İ'),
+ ('IJ', 'IJ'),
+ ('Ĵ', 'Ĵ'),
+ ('Ķ', 'Ķ'),
+ ('Ĺ', 'Ĺ'),
+ ('Ļ', 'Ļ'),
+ ('Ľ', 'Ľ'),
+ ('Ŀ', 'Ŀ'),
+ ('Ł', 'Ł'),
+ ('Ń', 'Ń'),
+ ('Ņ', 'Ņ'),
+ ('Ň', 'Ň'),
+ ('Ŋ', 'Ŋ'),
+ ('Ō', 'Ō'),
+ ('Ŏ', 'Ŏ'),
+ ('Ő', 'Ő'),
+ ('Œ', 'Œ'),
+ ('Ŕ', 'Ŕ'),
+ ('Ŗ', 'Ŗ'),
+ ('Ř', 'Ř'),
+ ('Ś', 'Ś'),
+ ('Ŝ', 'Ŝ'),
+ ('Ş', 'Ş'),
+ ('Š', 'Š'),
+ ('Ţ', 'Ţ'),
+ ('Ť', 'Ť'),
+ ('Ŧ', 'Ŧ'),
+ ('Ũ', 'Ũ'),
+ ('Ū', 'Ū'),
+ ('Ŭ', 'Ŭ'),
+ ('Ů', 'Ů'),
+ ('Ű', 'Ű'),
+ ('Ų', 'Ų'),
+ ('Ŵ', 'Ŵ'),
+ ('Ŷ', 'Ŷ'),
+ ('Ÿ', 'Ź'),
+ ('Ż', 'Ż'),
+ ('Ž', 'Ž'),
+ ('Ɓ', 'Ƃ'),
+ ('Ƅ', 'Ƅ'),
+ ('Ɔ', 'Ƈ'),
+ ('Ɖ', 'Ƌ'),
+ ('Ǝ', 'Ƒ'),
+ ('Ɠ', 'Ɣ'),
+ ('Ɩ', 'Ƙ'),
+ ('Ɯ', 'Ɲ'),
+ ('Ɵ', 'Ơ'),
+ ('Ƣ', 'Ƣ'),
+ ('Ƥ', 'Ƥ'),
+ ('Ʀ', 'Ƨ'),
+ ('Ʃ', 'Ʃ'),
+ ('Ƭ', 'Ƭ'),
+ ('Ʈ', 'Ư'),
+ ('Ʊ', 'Ƴ'),
+ ('Ƶ', 'Ƶ'),
+ ('Ʒ', 'Ƹ'),
+ ('Ƽ', 'Ƽ'),
+ ('DŽ', 'DŽ'),
+ ('LJ', 'LJ'),
+ ('NJ', 'NJ'),
+ ('Ǎ', 'Ǎ'),
+ ('Ǐ', 'Ǐ'),
+ ('Ǒ', 'Ǒ'),
+ ('Ǔ', 'Ǔ'),
+ ('Ǖ', 'Ǖ'),
+ ('Ǘ', 'Ǘ'),
+ ('Ǚ', 'Ǚ'),
+ ('Ǜ', 'Ǜ'),
+ ('Ǟ', 'Ǟ'),
+ ('Ǡ', 'Ǡ'),
+ ('Ǣ', 'Ǣ'),
+ ('Ǥ', 'Ǥ'),
+ ('Ǧ', 'Ǧ'),
+ ('Ǩ', 'Ǩ'),
+ ('Ǫ', 'Ǫ'),
+ ('Ǭ', 'Ǭ'),
+ ('Ǯ', 'Ǯ'),
+ ('DZ', 'DZ'),
+ ('Ǵ', 'Ǵ'),
+ ('Ƕ', 'Ǹ'),
+ ('Ǻ', 'Ǻ'),
+ ('Ǽ', 'Ǽ'),
+ ('Ǿ', 'Ǿ'),
+ ('Ȁ', 'Ȁ'),
+ ('Ȃ', 'Ȃ'),
+ ('Ȅ', 'Ȅ'),
+ ('Ȇ', 'Ȇ'),
+ ('Ȉ', 'Ȉ'),
+ ('Ȋ', 'Ȋ'),
+ ('Ȍ', 'Ȍ'),
+ ('Ȏ', 'Ȏ'),
+ ('Ȑ', 'Ȑ'),
+ ('Ȓ', 'Ȓ'),
+ ('Ȕ', 'Ȕ'),
+ ('Ȗ', 'Ȗ'),
+ ('Ș', 'Ș'),
+ ('Ț', 'Ț'),
+ ('Ȝ', 'Ȝ'),
+ ('Ȟ', 'Ȟ'),
+ ('Ƞ', 'Ƞ'),
+ ('Ȣ', 'Ȣ'),
+ ('Ȥ', 'Ȥ'),
+ ('Ȧ', 'Ȧ'),
+ ('Ȩ', 'Ȩ'),
+ ('Ȫ', 'Ȫ'),
+ ('Ȭ', 'Ȭ'),
+ ('Ȯ', 'Ȯ'),
+ ('Ȱ', 'Ȱ'),
+ ('Ȳ', 'Ȳ'),
+ ('Ⱥ', 'Ȼ'),
+ ('Ƚ', 'Ⱦ'),
+ ('Ɂ', 'Ɂ'),
+ ('Ƀ', 'Ɇ'),
+ ('Ɉ', 'Ɉ'),
+ ('Ɋ', 'Ɋ'),
+ ('Ɍ', 'Ɍ'),
+ ('Ɏ', 'Ɏ'),
+ ('Ͱ', 'Ͱ'),
+ ('Ͳ', 'Ͳ'),
+ ('Ͷ', 'Ͷ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ώ'),
+ ('Α', 'Ρ'),
+ ('Σ', 'Ϋ'),
+ ('Ϗ', 'Ϗ'),
+ ('ϒ', 'ϔ'),
+ ('Ϙ', 'Ϙ'),
+ ('Ϛ', 'Ϛ'),
+ ('Ϝ', 'Ϝ'),
+ ('Ϟ', 'Ϟ'),
+ ('Ϡ', 'Ϡ'),
+ ('Ϣ', 'Ϣ'),
+ ('Ϥ', 'Ϥ'),
+ ('Ϧ', 'Ϧ'),
+ ('Ϩ', 'Ϩ'),
+ ('Ϫ', 'Ϫ'),
+ ('Ϭ', 'Ϭ'),
+ ('Ϯ', 'Ϯ'),
+ ('ϴ', 'ϴ'),
+ ('Ϸ', 'Ϸ'),
+ ('Ϲ', 'Ϻ'),
+ ('Ͻ', 'Я'),
+ ('Ѡ', 'Ѡ'),
+ ('Ѣ', 'Ѣ'),
+ ('Ѥ', 'Ѥ'),
+ ('Ѧ', 'Ѧ'),
+ ('Ѩ', 'Ѩ'),
+ ('Ѫ', 'Ѫ'),
+ ('Ѭ', 'Ѭ'),
+ ('Ѯ', 'Ѯ'),
+ ('Ѱ', 'Ѱ'),
+ ('Ѳ', 'Ѳ'),
+ ('Ѵ', 'Ѵ'),
+ ('Ѷ', 'Ѷ'),
+ ('Ѹ', 'Ѹ'),
+ ('Ѻ', 'Ѻ'),
+ ('Ѽ', 'Ѽ'),
+ ('Ѿ', 'Ѿ'),
+ ('Ҁ', 'Ҁ'),
+ ('Ҋ', 'Ҋ'),
+ ('Ҍ', 'Ҍ'),
+ ('Ҏ', 'Ҏ'),
+ ('Ґ', 'Ґ'),
+ ('Ғ', 'Ғ'),
+ ('Ҕ', 'Ҕ'),
+ ('Җ', 'Җ'),
+ ('Ҙ', 'Ҙ'),
+ ('Қ', 'Қ'),
+ ('Ҝ', 'Ҝ'),
+ ('Ҟ', 'Ҟ'),
+ ('Ҡ', 'Ҡ'),
+ ('Ң', 'Ң'),
+ ('Ҥ', 'Ҥ'),
+ ('Ҧ', 'Ҧ'),
+ ('Ҩ', 'Ҩ'),
+ ('Ҫ', 'Ҫ'),
+ ('Ҭ', 'Ҭ'),
+ ('Ү', 'Ү'),
+ ('Ұ', 'Ұ'),
+ ('Ҳ', 'Ҳ'),
+ ('Ҵ', 'Ҵ'),
+ ('Ҷ', 'Ҷ'),
+ ('Ҹ', 'Ҹ'),
+ ('Һ', 'Һ'),
+ ('Ҽ', 'Ҽ'),
+ ('Ҿ', 'Ҿ'),
+ ('Ӏ', 'Ӂ'),
+ ('Ӄ', 'Ӄ'),
+ ('Ӆ', 'Ӆ'),
+ ('Ӈ', 'Ӈ'),
+ ('Ӊ', 'Ӊ'),
+ ('Ӌ', 'Ӌ'),
+ ('Ӎ', 'Ӎ'),
+ ('Ӑ', 'Ӑ'),
+ ('Ӓ', 'Ӓ'),
+ ('Ӕ', 'Ӕ'),
+ ('Ӗ', 'Ӗ'),
+ ('Ә', 'Ә'),
+ ('Ӛ', 'Ӛ'),
+ ('Ӝ', 'Ӝ'),
+ ('Ӟ', 'Ӟ'),
+ ('Ӡ', 'Ӡ'),
+ ('Ӣ', 'Ӣ'),
+ ('Ӥ', 'Ӥ'),
+ ('Ӧ', 'Ӧ'),
+ ('Ө', 'Ө'),
+ ('Ӫ', 'Ӫ'),
+ ('Ӭ', 'Ӭ'),
+ ('Ӯ', 'Ӯ'),
+ ('Ӱ', 'Ӱ'),
+ ('Ӳ', 'Ӳ'),
+ ('Ӵ', 'Ӵ'),
+ ('Ӷ', 'Ӷ'),
+ ('Ӹ', 'Ӹ'),
+ ('Ӻ', 'Ӻ'),
+ ('Ӽ', 'Ӽ'),
+ ('Ӿ', 'Ӿ'),
+ ('Ԁ', 'Ԁ'),
+ ('Ԃ', 'Ԃ'),
+ ('Ԅ', 'Ԅ'),
+ ('Ԇ', 'Ԇ'),
+ ('Ԉ', 'Ԉ'),
+ ('Ԋ', 'Ԋ'),
+ ('Ԍ', 'Ԍ'),
+ ('Ԏ', 'Ԏ'),
+ ('Ԑ', 'Ԑ'),
+ ('Ԓ', 'Ԓ'),
+ ('Ԕ', 'Ԕ'),
+ ('Ԗ', 'Ԗ'),
+ ('Ԙ', 'Ԙ'),
+ ('Ԛ', 'Ԛ'),
+ ('Ԝ', 'Ԝ'),
+ ('Ԟ', 'Ԟ'),
+ ('Ԡ', 'Ԡ'),
+ ('Ԣ', 'Ԣ'),
+ ('Ԥ', 'Ԥ'),
+ ('Ԧ', 'Ԧ'),
+ ('Ԩ', 'Ԩ'),
+ ('Ԫ', 'Ԫ'),
+ ('Ԭ', 'Ԭ'),
+ ('Ԯ', 'Ԯ'),
+ ('Ա', 'Ֆ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('Ḁ', 'Ḁ'),
+ ('Ḃ', 'Ḃ'),
+ ('Ḅ', 'Ḅ'),
+ ('Ḇ', 'Ḇ'),
+ ('Ḉ', 'Ḉ'),
+ ('Ḋ', 'Ḋ'),
+ ('Ḍ', 'Ḍ'),
+ ('Ḏ', 'Ḏ'),
+ ('Ḑ', 'Ḑ'),
+ ('Ḓ', 'Ḓ'),
+ ('Ḕ', 'Ḕ'),
+ ('Ḗ', 'Ḗ'),
+ ('Ḙ', 'Ḙ'),
+ ('Ḛ', 'Ḛ'),
+ ('Ḝ', 'Ḝ'),
+ ('Ḟ', 'Ḟ'),
+ ('Ḡ', 'Ḡ'),
+ ('Ḣ', 'Ḣ'),
+ ('Ḥ', 'Ḥ'),
+ ('Ḧ', 'Ḧ'),
+ ('Ḩ', 'Ḩ'),
+ ('Ḫ', 'Ḫ'),
+ ('Ḭ', 'Ḭ'),
+ ('Ḯ', 'Ḯ'),
+ ('Ḱ', 'Ḱ'),
+ ('Ḳ', 'Ḳ'),
+ ('Ḵ', 'Ḵ'),
+ ('Ḷ', 'Ḷ'),
+ ('Ḹ', 'Ḹ'),
+ ('Ḻ', 'Ḻ'),
+ ('Ḽ', 'Ḽ'),
+ ('Ḿ', 'Ḿ'),
+ ('Ṁ', 'Ṁ'),
+ ('Ṃ', 'Ṃ'),
+ ('Ṅ', 'Ṅ'),
+ ('Ṇ', 'Ṇ'),
+ ('Ṉ', 'Ṉ'),
+ ('Ṋ', 'Ṋ'),
+ ('Ṍ', 'Ṍ'),
+ ('Ṏ', 'Ṏ'),
+ ('Ṑ', 'Ṑ'),
+ ('Ṓ', 'Ṓ'),
+ ('Ṕ', 'Ṕ'),
+ ('Ṗ', 'Ṗ'),
+ ('Ṙ', 'Ṙ'),
+ ('Ṛ', 'Ṛ'),
+ ('Ṝ', 'Ṝ'),
+ ('Ṟ', 'Ṟ'),
+ ('Ṡ', 'Ṡ'),
+ ('Ṣ', 'Ṣ'),
+ ('Ṥ', 'Ṥ'),
+ ('Ṧ', 'Ṧ'),
+ ('Ṩ', 'Ṩ'),
+ ('Ṫ', 'Ṫ'),
+ ('Ṭ', 'Ṭ'),
+ ('Ṯ', 'Ṯ'),
+ ('Ṱ', 'Ṱ'),
+ ('Ṳ', 'Ṳ'),
+ ('Ṵ', 'Ṵ'),
+ ('Ṷ', 'Ṷ'),
+ ('Ṹ', 'Ṹ'),
+ ('Ṻ', 'Ṻ'),
+ ('Ṽ', 'Ṽ'),
+ ('Ṿ', 'Ṿ'),
+ ('Ẁ', 'Ẁ'),
+ ('Ẃ', 'Ẃ'),
+ ('Ẅ', 'Ẅ'),
+ ('Ẇ', 'Ẇ'),
+ ('Ẉ', 'Ẉ'),
+ ('Ẋ', 'Ẋ'),
+ ('Ẍ', 'Ẍ'),
+ ('Ẏ', 'Ẏ'),
+ ('Ẑ', 'Ẑ'),
+ ('Ẓ', 'Ẓ'),
+ ('Ẕ', 'Ẕ'),
+ ('ẞ', 'ẞ'),
+ ('Ạ', 'Ạ'),
+ ('Ả', 'Ả'),
+ ('Ấ', 'Ấ'),
+ ('Ầ', 'Ầ'),
+ ('Ẩ', 'Ẩ'),
+ ('Ẫ', 'Ẫ'),
+ ('Ậ', 'Ậ'),
+ ('Ắ', 'Ắ'),
+ ('Ằ', 'Ằ'),
+ ('Ẳ', 'Ẳ'),
+ ('Ẵ', 'Ẵ'),
+ ('Ặ', 'Ặ'),
+ ('Ẹ', 'Ẹ'),
+ ('Ẻ', 'Ẻ'),
+ ('Ẽ', 'Ẽ'),
+ ('Ế', 'Ế'),
+ ('Ề', 'Ề'),
+ ('Ể', 'Ể'),
+ ('Ễ', 'Ễ'),
+ ('Ệ', 'Ệ'),
+ ('Ỉ', 'Ỉ'),
+ ('Ị', 'Ị'),
+ ('Ọ', 'Ọ'),
+ ('Ỏ', 'Ỏ'),
+ ('Ố', 'Ố'),
+ ('Ồ', 'Ồ'),
+ ('Ổ', 'Ổ'),
+ ('Ỗ', 'Ỗ'),
+ ('Ộ', 'Ộ'),
+ ('Ớ', 'Ớ'),
+ ('Ờ', 'Ờ'),
+ ('Ở', 'Ở'),
+ ('Ỡ', 'Ỡ'),
+ ('Ợ', 'Ợ'),
+ ('Ụ', 'Ụ'),
+ ('Ủ', 'Ủ'),
+ ('Ứ', 'Ứ'),
+ ('Ừ', 'Ừ'),
+ ('Ử', 'Ử'),
+ ('Ữ', 'Ữ'),
+ ('Ự', 'Ự'),
+ ('Ỳ', 'Ỳ'),
+ ('Ỵ', 'Ỵ'),
+ ('Ỷ', 'Ỷ'),
+ ('Ỹ', 'Ỹ'),
+ ('Ỻ', 'Ỻ'),
+ ('Ỽ', 'Ỽ'),
+ ('Ỿ', 'Ỿ'),
+ ('Ἀ', 'Ἇ'),
+ ('Ἐ', 'Ἕ'),
+ ('Ἠ', 'Ἧ'),
+ ('Ἰ', 'Ἷ'),
+ ('Ὀ', 'Ὅ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'Ὗ'),
+ ('Ὠ', 'Ὧ'),
+ ('Ᾰ', 'Ά'),
+ ('Ὲ', 'Ή'),
+ ('Ῐ', 'Ί'),
+ ('Ῠ', 'Ῥ'),
+ ('Ὸ', 'Ώ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℋ', 'ℍ'),
+ ('ℐ', 'ℒ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℰ', 'ℳ'),
+ ('ℾ', 'ℿ'),
+ ('ⅅ', 'ⅅ'),
+ ('Ↄ', 'Ↄ'),
+ ('Ⰰ', 'Ⱟ'),
+ ('Ⱡ', 'Ⱡ'),
+ ('Ɫ', 'Ɽ'),
+ ('Ⱨ', 'Ⱨ'),
+ ('Ⱪ', 'Ⱪ'),
+ ('Ⱬ', 'Ⱬ'),
+ ('Ɑ', 'Ɒ'),
+ ('Ⱳ', 'Ⱳ'),
+ ('Ⱶ', 'Ⱶ'),
+ ('Ȿ', 'Ⲁ'),
+ ('Ⲃ', 'Ⲃ'),
+ ('Ⲅ', 'Ⲅ'),
+ ('Ⲇ', 'Ⲇ'),
+ ('Ⲉ', 'Ⲉ'),
+ ('Ⲋ', 'Ⲋ'),
+ ('Ⲍ', 'Ⲍ'),
+ ('Ⲏ', 'Ⲏ'),
+ ('Ⲑ', 'Ⲑ'),
+ ('Ⲓ', 'Ⲓ'),
+ ('Ⲕ', 'Ⲕ'),
+ ('Ⲗ', 'Ⲗ'),
+ ('Ⲙ', 'Ⲙ'),
+ ('Ⲛ', 'Ⲛ'),
+ ('Ⲝ', 'Ⲝ'),
+ ('Ⲟ', 'Ⲟ'),
+ ('Ⲡ', 'Ⲡ'),
+ ('Ⲣ', 'Ⲣ'),
+ ('Ⲥ', 'Ⲥ'),
+ ('Ⲧ', 'Ⲧ'),
+ ('Ⲩ', 'Ⲩ'),
+ ('Ⲫ', 'Ⲫ'),
+ ('Ⲭ', 'Ⲭ'),
+ ('Ⲯ', 'Ⲯ'),
+ ('Ⲱ', 'Ⲱ'),
+ ('Ⲳ', 'Ⲳ'),
+ ('Ⲵ', 'Ⲵ'),
+ ('Ⲷ', 'Ⲷ'),
+ ('Ⲹ', 'Ⲹ'),
+ ('Ⲻ', 'Ⲻ'),
+ ('Ⲽ', 'Ⲽ'),
+ ('Ⲿ', 'Ⲿ'),
+ ('Ⳁ', 'Ⳁ'),
+ ('Ⳃ', 'Ⳃ'),
+ ('Ⳅ', 'Ⳅ'),
+ ('Ⳇ', 'Ⳇ'),
+ ('Ⳉ', 'Ⳉ'),
+ ('Ⳋ', 'Ⳋ'),
+ ('Ⳍ', 'Ⳍ'),
+ ('Ⳏ', 'Ⳏ'),
+ ('Ⳑ', 'Ⳑ'),
+ ('Ⳓ', 'Ⳓ'),
+ ('Ⳕ', 'Ⳕ'),
+ ('Ⳗ', 'Ⳗ'),
+ ('Ⳙ', 'Ⳙ'),
+ ('Ⳛ', 'Ⳛ'),
+ ('Ⳝ', 'Ⳝ'),
+ ('Ⳟ', 'Ⳟ'),
+ ('Ⳡ', 'Ⳡ'),
+ ('Ⳣ', 'Ⳣ'),
+ ('Ⳬ', 'Ⳬ'),
+ ('Ⳮ', 'Ⳮ'),
+ ('Ⳳ', 'Ⳳ'),
+ ('Ꙁ', 'Ꙁ'),
+ ('Ꙃ', 'Ꙃ'),
+ ('Ꙅ', 'Ꙅ'),
+ ('Ꙇ', 'Ꙇ'),
+ ('Ꙉ', 'Ꙉ'),
+ ('Ꙋ', 'Ꙋ'),
+ ('Ꙍ', 'Ꙍ'),
+ ('Ꙏ', 'Ꙏ'),
+ ('Ꙑ', 'Ꙑ'),
+ ('Ꙓ', 'Ꙓ'),
+ ('Ꙕ', 'Ꙕ'),
+ ('Ꙗ', 'Ꙗ'),
+ ('Ꙙ', 'Ꙙ'),
+ ('Ꙛ', 'Ꙛ'),
+ ('Ꙝ', 'Ꙝ'),
+ ('Ꙟ', 'Ꙟ'),
+ ('Ꙡ', 'Ꙡ'),
+ ('Ꙣ', 'Ꙣ'),
+ ('Ꙥ', 'Ꙥ'),
+ ('Ꙧ', 'Ꙧ'),
+ ('Ꙩ', 'Ꙩ'),
+ ('Ꙫ', 'Ꙫ'),
+ ('Ꙭ', 'Ꙭ'),
+ ('Ꚁ', 'Ꚁ'),
+ ('Ꚃ', 'Ꚃ'),
+ ('Ꚅ', 'Ꚅ'),
+ ('Ꚇ', 'Ꚇ'),
+ ('Ꚉ', 'Ꚉ'),
+ ('Ꚋ', 'Ꚋ'),
+ ('Ꚍ', 'Ꚍ'),
+ ('Ꚏ', 'Ꚏ'),
+ ('Ꚑ', 'Ꚑ'),
+ ('Ꚓ', 'Ꚓ'),
+ ('Ꚕ', 'Ꚕ'),
+ ('Ꚗ', 'Ꚗ'),
+ ('Ꚙ', 'Ꚙ'),
+ ('Ꚛ', 'Ꚛ'),
+ ('Ꜣ', 'Ꜣ'),
+ ('Ꜥ', 'Ꜥ'),
+ ('Ꜧ', 'Ꜧ'),
+ ('Ꜩ', 'Ꜩ'),
+ ('Ꜫ', 'Ꜫ'),
+ ('Ꜭ', 'Ꜭ'),
+ ('Ꜯ', 'Ꜯ'),
+ ('Ꜳ', 'Ꜳ'),
+ ('Ꜵ', 'Ꜵ'),
+ ('Ꜷ', 'Ꜷ'),
+ ('Ꜹ', 'Ꜹ'),
+ ('Ꜻ', 'Ꜻ'),
+ ('Ꜽ', 'Ꜽ'),
+ ('Ꜿ', 'Ꜿ'),
+ ('Ꝁ', 'Ꝁ'),
+ ('Ꝃ', 'Ꝃ'),
+ ('Ꝅ', 'Ꝅ'),
+ ('Ꝇ', 'Ꝇ'),
+ ('Ꝉ', 'Ꝉ'),
+ ('Ꝋ', 'Ꝋ'),
+ ('Ꝍ', 'Ꝍ'),
+ ('Ꝏ', 'Ꝏ'),
+ ('Ꝑ', 'Ꝑ'),
+ ('Ꝓ', 'Ꝓ'),
+ ('Ꝕ', 'Ꝕ'),
+ ('Ꝗ', 'Ꝗ'),
+ ('Ꝙ', 'Ꝙ'),
+ ('Ꝛ', 'Ꝛ'),
+ ('Ꝝ', 'Ꝝ'),
+ ('Ꝟ', 'Ꝟ'),
+ ('Ꝡ', 'Ꝡ'),
+ ('Ꝣ', 'Ꝣ'),
+ ('Ꝥ', 'Ꝥ'),
+ ('Ꝧ', 'Ꝧ'),
+ ('Ꝩ', 'Ꝩ'),
+ ('Ꝫ', 'Ꝫ'),
+ ('Ꝭ', 'Ꝭ'),
+ ('Ꝯ', 'Ꝯ'),
+ ('Ꝺ', 'Ꝺ'),
+ ('Ꝼ', 'Ꝼ'),
+ ('Ᵹ', 'Ꝿ'),
+ ('Ꞁ', 'Ꞁ'),
+ ('Ꞃ', 'Ꞃ'),
+ ('Ꞅ', 'Ꞅ'),
+ ('Ꞇ', 'Ꞇ'),
+ ('Ꞌ', 'Ꞌ'),
+ ('Ɥ', 'Ɥ'),
+ ('Ꞑ', 'Ꞑ'),
+ ('Ꞓ', 'Ꞓ'),
+ ('Ꞗ', 'Ꞗ'),
+ ('Ꞙ', 'Ꞙ'),
+ ('Ꞛ', 'Ꞛ'),
+ ('Ꞝ', 'Ꞝ'),
+ ('Ꞟ', 'Ꞟ'),
+ ('Ꞡ', 'Ꞡ'),
+ ('Ꞣ', 'Ꞣ'),
+ ('Ꞥ', 'Ꞥ'),
+ ('Ꞧ', 'Ꞧ'),
+ ('Ꞩ', 'Ꞩ'),
+ ('Ɦ', 'Ɪ'),
+ ('Ʞ', 'Ꞵ'),
+ ('Ꞷ', 'Ꞷ'),
+ ('Ꞹ', 'Ꞹ'),
+ ('Ꞻ', 'Ꞻ'),
+ ('Ꞽ', 'Ꞽ'),
+ ('Ꞿ', 'Ꞿ'),
+ ('Ꟁ', 'Ꟁ'),
+ ('Ꟃ', 'Ꟃ'),
+ ('Ꞔ', 'Ꟈ'),
+ ('Ꟊ', 'Ꟊ'),
+ ('Ꟑ', 'Ꟑ'),
+ ('Ꟗ', 'Ꟗ'),
+ ('Ꟙ', 'Ꟙ'),
+ ('Ꟶ', 'Ꟶ'),
+ ('A', 'Z'),
+ ('𐐀', '𐐧'),
+ ('𐒰', '𐓓'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐲀', '𐲲'),
+ ('𑢠', '𑢿'),
+ ('𖹀', '𖹟'),
+ ('𝐀', '𝐙'),
+ ('𝐴', '𝑍'),
+ ('𝑨', '𝒁'),
+ ('𝒜', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒵'),
+ ('𝓐', '𝓩'),
+ ('𝔄', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔸', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕬', '𝖅'),
+ ('𝖠', '𝖹'),
+ ('𝗔', '𝗭'),
+ ('𝘈', '𝘡'),
+ ('𝘼', '𝙕'),
+ ('𝙰', '𝚉'),
+ ('𝚨', '𝛀'),
+ ('𝛢', '𝛺'),
+ ('𝜜', '𝜴'),
+ ('𝝖', '𝝮'),
+ ('𝞐', '𝞨'),
+ ('𝟊', '𝟊'),
+ ('𞤀', '𞤡'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs b/third_party/rust/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs
new file mode 100644
index 0000000000..294dfbdcc0
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs
@@ -0,0 +1,1416 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate grapheme-cluster-break ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("CR", CR),
+ ("Control", CONTROL),
+ ("Extend", EXTEND),
+ ("L", L),
+ ("LF", LF),
+ ("LV", LV),
+ ("LVT", LVT),
+ ("Prepend", PREPEND),
+ ("Regional_Indicator", REGIONAL_INDICATOR),
+ ("SpacingMark", SPACINGMARK),
+ ("T", T),
+ ("V", V),
+ ("ZWJ", ZWJ),
+];
+
+pub const CR: &'static [(char, char)] = &[('\r', '\r')];
+
+pub const CONTROL: &'static [(char, char)] = &[
+ ('\0', '\t'),
+ ('\u{b}', '\u{c}'),
+ ('\u{e}', '\u{1f}'),
+ ('\u{7f}', '\u{9f}'),
+ ('\u{ad}', '\u{ad}'),
+ ('\u{61c}', '\u{61c}'),
+ ('\u{180e}', '\u{180e}'),
+ ('\u{200b}', '\u{200b}'),
+ ('\u{200e}', '\u{200f}'),
+ ('\u{2028}', '\u{202e}'),
+ ('\u{2060}', '\u{206f}'),
+ ('\u{feff}', '\u{feff}'),
+ ('\u{fff0}', '\u{fffb}'),
+ ('\u{13430}', '\u{1343f}'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('\u{1d173}', '\u{1d17a}'),
+ ('\u{e0000}', '\u{e001f}'),
+ ('\u{e0080}', '\u{e00ff}'),
+ ('\u{e01f0}', '\u{e0fff}'),
+];
+
+pub const EXTEND: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{483}', '\u{489}'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6df}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', '\u{7f3}'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{819}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('\u{898}', '\u{89f}'),
+ ('\u{8ca}', '\u{8e1}'),
+ ('\u{8e3}', '\u{902}'),
+ ('\u{93a}', '\u{93a}'),
+ ('\u{93c}', '\u{93c}'),
+ ('\u{941}', '\u{948}'),
+ ('\u{94d}', '\u{94d}'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', '\u{981}'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9be}', '\u{9be}'),
+ ('\u{9c1}', '\u{9c4}'),
+ ('\u{9cd}', '\u{9cd}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', '\u{a02}'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('\u{a41}', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', '\u{a82}'),
+ ('\u{abc}', '\u{abc}'),
+ ('\u{ac1}', '\u{ac5}'),
+ ('\u{ac7}', '\u{ac8}'),
+ ('\u{acd}', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', '\u{b01}'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3e}', '\u{b3f}'),
+ ('\u{b41}', '\u{b44}'),
+ ('\u{b4d}', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bbe}', '\u{bbe}'),
+ ('\u{bc0}', '\u{bc0}'),
+ ('\u{bcd}', '\u{bcd}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', '\u{c00}'),
+ ('\u{c04}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', '\u{c40}'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', '\u{c81}'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('\u{cbf}', '\u{cbf}'),
+ ('\u{cc2}', '\u{cc2}'),
+ ('\u{cc6}', '\u{cc6}'),
+ ('\u{ccc}', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('\u{d00}', '\u{d01}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d3e}', '\u{d3e}'),
+ ('\u{d41}', '\u{d44}'),
+ ('\u{d4d}', '\u{d4d}'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', '\u{d81}'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dcf}'),
+ ('\u{dd2}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('\u{ddf}', '\u{ddf}'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e47}', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('\u{f71}', '\u{f7e}'),
+ ('\u{f80}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('\u{102d}', '\u{1030}'),
+ ('\u{1032}', '\u{1037}'),
+ ('\u{1039}', '\u{103a}'),
+ ('\u{103d}', '\u{103e}'),
+ ('\u{1058}', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{1082}'),
+ ('\u{1085}', '\u{1086}'),
+ ('\u{108d}', '\u{108d}'),
+ ('\u{109d}', '\u{109d}'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '\u{1714}'),
+ ('\u{1732}', '\u{1733}'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17b5}'),
+ ('\u{17b7}', '\u{17bd}'),
+ ('\u{17c6}', '\u{17c6}'),
+ ('\u{17c9}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', '\u{1922}'),
+ ('\u{1927}', '\u{1928}'),
+ ('\u{1932}', '\u{1932}'),
+ ('\u{1939}', '\u{193b}'),
+ ('\u{1a17}', '\u{1a18}'),
+ ('\u{1a1b}', '\u{1a1b}'),
+ ('\u{1a56}', '\u{1a56}'),
+ ('\u{1a58}', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a60}'),
+ ('\u{1a62}', '\u{1a62}'),
+ ('\u{1a65}', '\u{1a6c}'),
+ ('\u{1a73}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', '\u{1b03}'),
+ ('\u{1b34}', '\u{1b3a}'),
+ ('\u{1b3c}', '\u{1b3c}'),
+ ('\u{1b42}', '\u{1b42}'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '\u{1b81}'),
+ ('\u{1ba2}', '\u{1ba5}'),
+ ('\u{1ba8}', '\u{1ba9}'),
+ ('\u{1bab}', '\u{1bad}'),
+ ('\u{1be6}', '\u{1be6}'),
+ ('\u{1be8}', '\u{1be9}'),
+ ('\u{1bed}', '\u{1bed}'),
+ ('\u{1bef}', '\u{1bf1}'),
+ ('\u{1c2c}', '\u{1c33}'),
+ ('\u{1c36}', '\u{1c37}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce0}'),
+ ('\u{1ce2}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{200c}', '\u{200c}'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{302a}', '\u{302f}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{a66f}', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('\u{a825}', '\u{a826}'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('\u{a8c4}', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '\u{a951}'),
+ ('\u{a980}', '\u{a982}'),
+ ('\u{a9b3}', '\u{a9b3}'),
+ ('\u{a9b6}', '\u{a9b9}'),
+ ('\u{a9bc}', '\u{a9bd}'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa2e}'),
+ ('\u{aa31}', '\u{aa32}'),
+ ('\u{aa35}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', '\u{aa4c}'),
+ ('\u{aa7c}', '\u{aa7c}'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('\u{aaec}', '\u{aaed}'),
+ ('\u{aaf6}', '\u{aaf6}'),
+ ('\u{abe5}', '\u{abe5}'),
+ ('\u{abe8}', '\u{abe8}'),
+ ('\u{abed}', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('\u{11001}', '\u{11001}'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '\u{11081}'),
+ ('\u{110b3}', '\u{110b6}'),
+ ('\u{110b9}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{1112b}'),
+ ('\u{1112d}', '\u{11134}'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '\u{11181}'),
+ ('\u{111b6}', '\u{111be}'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('\u{111cf}', '\u{111cf}'),
+ ('\u{1122f}', '\u{11231}'),
+ ('\u{11234}', '\u{11234}'),
+ ('\u{11236}', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112df}'),
+ ('\u{112e3}', '\u{112ea}'),
+ ('\u{11300}', '\u{11301}'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{1133e}', '\u{1133e}'),
+ ('\u{11340}', '\u{11340}'),
+ ('\u{11357}', '\u{11357}'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('\u{11438}', '\u{1143f}'),
+ ('\u{11442}', '\u{11444}'),
+ ('\u{11446}', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b0}', '\u{114b0}'),
+ ('\u{114b3}', '\u{114b8}'),
+ ('\u{114ba}', '\u{114ba}'),
+ ('\u{114bd}', '\u{114bd}'),
+ ('\u{114bf}', '\u{114c0}'),
+ ('\u{114c2}', '\u{114c3}'),
+ ('\u{115af}', '\u{115af}'),
+ ('\u{115b2}', '\u{115b5}'),
+ ('\u{115bc}', '\u{115bd}'),
+ ('\u{115bf}', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('\u{11633}', '\u{1163a}'),
+ ('\u{1163d}', '\u{1163d}'),
+ ('\u{1163f}', '\u{11640}'),
+ ('\u{116ab}', '\u{116ab}'),
+ ('\u{116ad}', '\u{116ad}'),
+ ('\u{116b0}', '\u{116b5}'),
+ ('\u{116b7}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1171f}'),
+ ('\u{11722}', '\u{11725}'),
+ ('\u{11727}', '\u{1172b}'),
+ ('\u{1182f}', '\u{11837}'),
+ ('\u{11839}', '\u{1183a}'),
+ ('\u{11930}', '\u{11930}'),
+ ('\u{1193b}', '\u{1193c}'),
+ ('\u{1193e}', '\u{1193e}'),
+ ('\u{11943}', '\u{11943}'),
+ ('\u{119d4}', '\u{119d7}'),
+ ('\u{119da}', '\u{119db}'),
+ ('\u{119e0}', '\u{119e0}'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '\u{11a38}'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a56}'),
+ ('\u{11a59}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a96}'),
+ ('\u{11a98}', '\u{11a99}'),
+ ('\u{11c30}', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3d}'),
+ ('\u{11c3f}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('\u{11caa}', '\u{11cb0}'),
+ ('\u{11cb2}', '\u{11cb3}'),
+ ('\u{11cb5}', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('\u{11d95}', '\u{11d95}'),
+ ('\u{11d97}', '\u{11d97}'),
+ ('\u{11ef3}', '\u{11ef4}'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('\u{11f36}', '\u{11f3a}'),
+ ('\u{11f40}', '\u{11f40}'),
+ ('\u{11f42}', '\u{11f42}'),
+ ('\u{13440}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d165}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('\u{1d16e}', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e4ec}', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e94a}'),
+ ('🏻', '🏿'),
+ ('\u{e0020}', '\u{e007f}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const L: &'static [(char, char)] = &[('ᄀ', 'ᅟ'), ('ꥠ', 'ꥼ')];
+
+pub const LF: &'static [(char, char)] = &[('\n', '\n')];
+
+pub const LV: &'static [(char, char)] = &[
+ ('가', '가'),
+ ('개', '개'),
+ ('갸', '갸'),
+ ('걔', '걔'),
+ ('거', '거'),
+ ('게', '게'),
+ ('겨', '겨'),
+ ('계', '계'),
+ ('고', '고'),
+ ('과', '과'),
+ ('괘', '괘'),
+ ('괴', '괴'),
+ ('교', '교'),
+ ('구', '구'),
+ ('궈', '궈'),
+ ('궤', '궤'),
+ ('귀', '귀'),
+ ('규', '규'),
+ ('그', '그'),
+ ('긔', '긔'),
+ ('기', '기'),
+ ('까', '까'),
+ ('깨', '깨'),
+ ('꺄', '꺄'),
+ ('꺠', '꺠'),
+ ('꺼', '꺼'),
+ ('께', '께'),
+ ('껴', '껴'),
+ ('꼐', '꼐'),
+ ('꼬', '꼬'),
+ ('꽈', '꽈'),
+ ('꽤', '꽤'),
+ ('꾀', '꾀'),
+ ('꾜', '꾜'),
+ ('꾸', '꾸'),
+ ('꿔', '꿔'),
+ ('꿰', '꿰'),
+ ('뀌', '뀌'),
+ ('뀨', '뀨'),
+ ('끄', '끄'),
+ ('끠', '끠'),
+ ('끼', '끼'),
+ ('나', '나'),
+ ('내', '내'),
+ ('냐', '냐'),
+ ('냬', '냬'),
+ ('너', '너'),
+ ('네', '네'),
+ ('녀', '녀'),
+ ('녜', '녜'),
+ ('노', '노'),
+ ('놔', '놔'),
+ ('놰', '놰'),
+ ('뇌', '뇌'),
+ ('뇨', '뇨'),
+ ('누', '누'),
+ ('눠', '눠'),
+ ('눼', '눼'),
+ ('뉘', '뉘'),
+ ('뉴', '뉴'),
+ ('느', '느'),
+ ('늬', '늬'),
+ ('니', '니'),
+ ('다', '다'),
+ ('대', '대'),
+ ('댜', '댜'),
+ ('댸', '댸'),
+ ('더', '더'),
+ ('데', '데'),
+ ('뎌', '뎌'),
+ ('뎨', '뎨'),
+ ('도', '도'),
+ ('돠', '돠'),
+ ('돼', '돼'),
+ ('되', '되'),
+ ('됴', '됴'),
+ ('두', '두'),
+ ('둬', '둬'),
+ ('뒈', '뒈'),
+ ('뒤', '뒤'),
+ ('듀', '듀'),
+ ('드', '드'),
+ ('듸', '듸'),
+ ('디', '디'),
+ ('따', '따'),
+ ('때', '때'),
+ ('땨', '땨'),
+ ('떄', '떄'),
+ ('떠', '떠'),
+ ('떼', '떼'),
+ ('뗘', '뗘'),
+ ('뗴', '뗴'),
+ ('또', '또'),
+ ('똬', '똬'),
+ ('뙈', '뙈'),
+ ('뙤', '뙤'),
+ ('뚀', '뚀'),
+ ('뚜', '뚜'),
+ ('뚸', '뚸'),
+ ('뛔', '뛔'),
+ ('뛰', '뛰'),
+ ('뜌', '뜌'),
+ ('뜨', '뜨'),
+ ('띄', '띄'),
+ ('띠', '띠'),
+ ('라', '라'),
+ ('래', '래'),
+ ('랴', '랴'),
+ ('럐', '럐'),
+ ('러', '러'),
+ ('레', '레'),
+ ('려', '려'),
+ ('례', '례'),
+ ('로', '로'),
+ ('롸', '롸'),
+ ('뢔', '뢔'),
+ ('뢰', '뢰'),
+ ('료', '료'),
+ ('루', '루'),
+ ('뤄', '뤄'),
+ ('뤠', '뤠'),
+ ('뤼', '뤼'),
+ ('류', '류'),
+ ('르', '르'),
+ ('릐', '릐'),
+ ('리', '리'),
+ ('마', '마'),
+ ('매', '매'),
+ ('먀', '먀'),
+ ('먜', '먜'),
+ ('머', '머'),
+ ('메', '메'),
+ ('며', '며'),
+ ('몌', '몌'),
+ ('모', '모'),
+ ('뫄', '뫄'),
+ ('뫠', '뫠'),
+ ('뫼', '뫼'),
+ ('묘', '묘'),
+ ('무', '무'),
+ ('뭐', '뭐'),
+ ('뭬', '뭬'),
+ ('뮈', '뮈'),
+ ('뮤', '뮤'),
+ ('므', '므'),
+ ('믜', '믜'),
+ ('미', '미'),
+ ('바', '바'),
+ ('배', '배'),
+ ('뱌', '뱌'),
+ ('뱨', '뱨'),
+ ('버', '버'),
+ ('베', '베'),
+ ('벼', '벼'),
+ ('볘', '볘'),
+ ('보', '보'),
+ ('봐', '봐'),
+ ('봬', '봬'),
+ ('뵈', '뵈'),
+ ('뵤', '뵤'),
+ ('부', '부'),
+ ('붜', '붜'),
+ ('붸', '붸'),
+ ('뷔', '뷔'),
+ ('뷰', '뷰'),
+ ('브', '브'),
+ ('븨', '븨'),
+ ('비', '비'),
+ ('빠', '빠'),
+ ('빼', '빼'),
+ ('뺘', '뺘'),
+ ('뺴', '뺴'),
+ ('뻐', '뻐'),
+ ('뻬', '뻬'),
+ ('뼈', '뼈'),
+ ('뼤', '뼤'),
+ ('뽀', '뽀'),
+ ('뽜', '뽜'),
+ ('뽸', '뽸'),
+ ('뾔', '뾔'),
+ ('뾰', '뾰'),
+ ('뿌', '뿌'),
+ ('뿨', '뿨'),
+ ('쀄', '쀄'),
+ ('쀠', '쀠'),
+ ('쀼', '쀼'),
+ ('쁘', '쁘'),
+ ('쁴', '쁴'),
+ ('삐', '삐'),
+ ('사', '사'),
+ ('새', '새'),
+ ('샤', '샤'),
+ ('섀', '섀'),
+ ('서', '서'),
+ ('세', '세'),
+ ('셔', '셔'),
+ ('셰', '셰'),
+ ('소', '소'),
+ ('솨', '솨'),
+ ('쇄', '쇄'),
+ ('쇠', '쇠'),
+ ('쇼', '쇼'),
+ ('수', '수'),
+ ('숴', '숴'),
+ ('쉐', '쉐'),
+ ('쉬', '쉬'),
+ ('슈', '슈'),
+ ('스', '스'),
+ ('싀', '싀'),
+ ('시', '시'),
+ ('싸', '싸'),
+ ('쌔', '쌔'),
+ ('쌰', '쌰'),
+ ('썌', '썌'),
+ ('써', '써'),
+ ('쎄', '쎄'),
+ ('쎠', '쎠'),
+ ('쎼', '쎼'),
+ ('쏘', '쏘'),
+ ('쏴', '쏴'),
+ ('쐐', '쐐'),
+ ('쐬', '쐬'),
+ ('쑈', '쑈'),
+ ('쑤', '쑤'),
+ ('쒀', '쒀'),
+ ('쒜', '쒜'),
+ ('쒸', '쒸'),
+ ('쓔', '쓔'),
+ ('쓰', '쓰'),
+ ('씌', '씌'),
+ ('씨', '씨'),
+ ('아', '아'),
+ ('애', '애'),
+ ('야', '야'),
+ ('얘', '얘'),
+ ('어', '어'),
+ ('에', '에'),
+ ('여', '여'),
+ ('예', '예'),
+ ('오', '오'),
+ ('와', '와'),
+ ('왜', '왜'),
+ ('외', '외'),
+ ('요', '요'),
+ ('우', '우'),
+ ('워', '워'),
+ ('웨', '웨'),
+ ('위', '위'),
+ ('유', '유'),
+ ('으', '으'),
+ ('의', '의'),
+ ('이', '이'),
+ ('자', '자'),
+ ('재', '재'),
+ ('쟈', '쟈'),
+ ('쟤', '쟤'),
+ ('저', '저'),
+ ('제', '제'),
+ ('져', '져'),
+ ('졔', '졔'),
+ ('조', '조'),
+ ('좌', '좌'),
+ ('좨', '좨'),
+ ('죄', '죄'),
+ ('죠', '죠'),
+ ('주', '주'),
+ ('줘', '줘'),
+ ('줴', '줴'),
+ ('쥐', '쥐'),
+ ('쥬', '쥬'),
+ ('즈', '즈'),
+ ('즤', '즤'),
+ ('지', '지'),
+ ('짜', '짜'),
+ ('째', '째'),
+ ('쨔', '쨔'),
+ ('쨰', '쨰'),
+ ('쩌', '쩌'),
+ ('쩨', '쩨'),
+ ('쪄', '쪄'),
+ ('쪠', '쪠'),
+ ('쪼', '쪼'),
+ ('쫘', '쫘'),
+ ('쫴', '쫴'),
+ ('쬐', '쬐'),
+ ('쬬', '쬬'),
+ ('쭈', '쭈'),
+ ('쭤', '쭤'),
+ ('쮀', '쮀'),
+ ('쮜', '쮜'),
+ ('쮸', '쮸'),
+ ('쯔', '쯔'),
+ ('쯰', '쯰'),
+ ('찌', '찌'),
+ ('차', '차'),
+ ('채', '채'),
+ ('챠', '챠'),
+ ('챼', '챼'),
+ ('처', '처'),
+ ('체', '체'),
+ ('쳐', '쳐'),
+ ('쳬', '쳬'),
+ ('초', '초'),
+ ('촤', '촤'),
+ ('쵀', '쵀'),
+ ('최', '최'),
+ ('쵸', '쵸'),
+ ('추', '추'),
+ ('춰', '춰'),
+ ('췌', '췌'),
+ ('취', '취'),
+ ('츄', '츄'),
+ ('츠', '츠'),
+ ('츼', '츼'),
+ ('치', '치'),
+ ('카', '카'),
+ ('캐', '캐'),
+ ('캬', '캬'),
+ ('컈', '컈'),
+ ('커', '커'),
+ ('케', '케'),
+ ('켜', '켜'),
+ ('켸', '켸'),
+ ('코', '코'),
+ ('콰', '콰'),
+ ('쾌', '쾌'),
+ ('쾨', '쾨'),
+ ('쿄', '쿄'),
+ ('쿠', '쿠'),
+ ('쿼', '쿼'),
+ ('퀘', '퀘'),
+ ('퀴', '퀴'),
+ ('큐', '큐'),
+ ('크', '크'),
+ ('킈', '킈'),
+ ('키', '키'),
+ ('타', '타'),
+ ('태', '태'),
+ ('탸', '탸'),
+ ('턔', '턔'),
+ ('터', '터'),
+ ('테', '테'),
+ ('텨', '텨'),
+ ('톄', '톄'),
+ ('토', '토'),
+ ('톼', '톼'),
+ ('퇘', '퇘'),
+ ('퇴', '퇴'),
+ ('툐', '툐'),
+ ('투', '투'),
+ ('퉈', '퉈'),
+ ('퉤', '퉤'),
+ ('튀', '튀'),
+ ('튜', '튜'),
+ ('트', '트'),
+ ('틔', '틔'),
+ ('티', '티'),
+ ('파', '파'),
+ ('패', '패'),
+ ('퍄', '퍄'),
+ ('퍠', '퍠'),
+ ('퍼', '퍼'),
+ ('페', '페'),
+ ('펴', '펴'),
+ ('폐', '폐'),
+ ('포', '포'),
+ ('퐈', '퐈'),
+ ('퐤', '퐤'),
+ ('푀', '푀'),
+ ('표', '표'),
+ ('푸', '푸'),
+ ('풔', '풔'),
+ ('풰', '풰'),
+ ('퓌', '퓌'),
+ ('퓨', '퓨'),
+ ('프', '프'),
+ ('픠', '픠'),
+ ('피', '피'),
+ ('하', '하'),
+ ('해', '해'),
+ ('햐', '햐'),
+ ('햬', '햬'),
+ ('허', '허'),
+ ('헤', '헤'),
+ ('혀', '혀'),
+ ('혜', '혜'),
+ ('호', '호'),
+ ('화', '화'),
+ ('홰', '홰'),
+ ('회', '회'),
+ ('효', '효'),
+ ('후', '후'),
+ ('훠', '훠'),
+ ('훼', '훼'),
+ ('휘', '휘'),
+ ('휴', '휴'),
+ ('흐', '흐'),
+ ('희', '희'),
+ ('히', '히'),
+];
+
+pub const LVT: &'static [(char, char)] = &[
+ ('각', '갛'),
+ ('객', '갷'),
+ ('갹', '걓'),
+ ('걕', '걯'),
+ ('걱', '겋'),
+ ('겍', '겧'),
+ ('격', '곃'),
+ ('곅', '곟'),
+ ('곡', '곻'),
+ ('곽', '괗'),
+ ('괙', '괳'),
+ ('괵', '굏'),
+ ('굑', '굫'),
+ ('국', '궇'),
+ ('궉', '궣'),
+ ('궥', '궿'),
+ ('귁', '귛'),
+ ('귝', '귷'),
+ ('극', '긓'),
+ ('긕', '긯'),
+ ('긱', '깋'),
+ ('깍', '깧'),
+ ('깩', '꺃'),
+ ('꺅', '꺟'),
+ ('꺡', '꺻'),
+ ('꺽', '껗'),
+ ('껙', '껳'),
+ ('껵', '꼏'),
+ ('꼑', '꼫'),
+ ('꼭', '꽇'),
+ ('꽉', '꽣'),
+ ('꽥', '꽿'),
+ ('꾁', '꾛'),
+ ('꾝', '꾷'),
+ ('꾹', '꿓'),
+ ('꿕', '꿯'),
+ ('꿱', '뀋'),
+ ('뀍', '뀧'),
+ ('뀩', '끃'),
+ ('끅', '끟'),
+ ('끡', '끻'),
+ ('끽', '낗'),
+ ('낙', '낳'),
+ ('낵', '냏'),
+ ('냑', '냫'),
+ ('냭', '넇'),
+ ('넉', '넣'),
+ ('넥', '넿'),
+ ('녁', '녛'),
+ ('녝', '녷'),
+ ('녹', '놓'),
+ ('놕', '놯'),
+ ('놱', '뇋'),
+ ('뇍', '뇧'),
+ ('뇩', '눃'),
+ ('눅', '눟'),
+ ('눡', '눻'),
+ ('눽', '뉗'),
+ ('뉙', '뉳'),
+ ('뉵', '늏'),
+ ('늑', '늫'),
+ ('늭', '닇'),
+ ('닉', '닣'),
+ ('닥', '닿'),
+ ('댁', '댛'),
+ ('댝', '댷'),
+ ('댹', '덓'),
+ ('덕', '덯'),
+ ('덱', '뎋'),
+ ('뎍', '뎧'),
+ ('뎩', '돃'),
+ ('독', '돟'),
+ ('돡', '돻'),
+ ('돽', '됗'),
+ ('됙', '됳'),
+ ('됵', '둏'),
+ ('둑', '둫'),
+ ('둭', '뒇'),
+ ('뒉', '뒣'),
+ ('뒥', '뒿'),
+ ('듁', '듛'),
+ ('득', '듷'),
+ ('듹', '딓'),
+ ('딕', '딯'),
+ ('딱', '땋'),
+ ('땍', '땧'),
+ ('땩', '떃'),
+ ('떅', '떟'),
+ ('떡', '떻'),
+ ('떽', '뗗'),
+ ('뗙', '뗳'),
+ ('뗵', '똏'),
+ ('똑', '똫'),
+ ('똭', '뙇'),
+ ('뙉', '뙣'),
+ ('뙥', '뙿'),
+ ('뚁', '뚛'),
+ ('뚝', '뚷'),
+ ('뚹', '뛓'),
+ ('뛕', '뛯'),
+ ('뛱', '뜋'),
+ ('뜍', '뜧'),
+ ('뜩', '띃'),
+ ('띅', '띟'),
+ ('띡', '띻'),
+ ('락', '랗'),
+ ('랙', '랳'),
+ ('략', '럏'),
+ ('럑', '럫'),
+ ('럭', '렇'),
+ ('렉', '렣'),
+ ('력', '렿'),
+ ('롁', '롛'),
+ ('록', '롷'),
+ ('롹', '뢓'),
+ ('뢕', '뢯'),
+ ('뢱', '룋'),
+ ('룍', '룧'),
+ ('룩', '뤃'),
+ ('뤅', '뤟'),
+ ('뤡', '뤻'),
+ ('뤽', '륗'),
+ ('륙', '륳'),
+ ('륵', '릏'),
+ ('릑', '릫'),
+ ('릭', '맇'),
+ ('막', '맣'),
+ ('맥', '맿'),
+ ('먁', '먛'),
+ ('먝', '먷'),
+ ('먹', '멓'),
+ ('멕', '멯'),
+ ('멱', '몋'),
+ ('몍', '몧'),
+ ('목', '뫃'),
+ ('뫅', '뫟'),
+ ('뫡', '뫻'),
+ ('뫽', '묗'),
+ ('묙', '묳'),
+ ('묵', '뭏'),
+ ('뭑', '뭫'),
+ ('뭭', '뮇'),
+ ('뮉', '뮣'),
+ ('뮥', '뮿'),
+ ('믁', '믛'),
+ ('믝', '믷'),
+ ('믹', '밓'),
+ ('박', '밯'),
+ ('백', '뱋'),
+ ('뱍', '뱧'),
+ ('뱩', '벃'),
+ ('벅', '벟'),
+ ('벡', '벻'),
+ ('벽', '볗'),
+ ('볙', '볳'),
+ ('복', '봏'),
+ ('봑', '봫'),
+ ('봭', '뵇'),
+ ('뵉', '뵣'),
+ ('뵥', '뵿'),
+ ('북', '붛'),
+ ('붝', '붷'),
+ ('붹', '뷓'),
+ ('뷕', '뷯'),
+ ('뷱', '븋'),
+ ('븍', '븧'),
+ ('븩', '빃'),
+ ('빅', '빟'),
+ ('빡', '빻'),
+ ('빽', '뺗'),
+ ('뺙', '뺳'),
+ ('뺵', '뻏'),
+ ('뻑', '뻫'),
+ ('뻭', '뼇'),
+ ('뼉', '뼣'),
+ ('뼥', '뼿'),
+ ('뽁', '뽛'),
+ ('뽝', '뽷'),
+ ('뽹', '뾓'),
+ ('뾕', '뾯'),
+ ('뾱', '뿋'),
+ ('뿍', '뿧'),
+ ('뿩', '쀃'),
+ ('쀅', '쀟'),
+ ('쀡', '쀻'),
+ ('쀽', '쁗'),
+ ('쁙', '쁳'),
+ ('쁵', '삏'),
+ ('삑', '삫'),
+ ('삭', '샇'),
+ ('색', '샣'),
+ ('샥', '샿'),
+ ('섁', '섛'),
+ ('석', '섷'),
+ ('섹', '셓'),
+ ('셕', '셯'),
+ ('셱', '솋'),
+ ('속', '솧'),
+ ('솩', '쇃'),
+ ('쇅', '쇟'),
+ ('쇡', '쇻'),
+ ('쇽', '숗'),
+ ('숙', '숳'),
+ ('숵', '쉏'),
+ ('쉑', '쉫'),
+ ('쉭', '슇'),
+ ('슉', '슣'),
+ ('슥', '슿'),
+ ('싁', '싛'),
+ ('식', '싷'),
+ ('싹', '쌓'),
+ ('쌕', '쌯'),
+ ('쌱', '썋'),
+ ('썍', '썧'),
+ ('썩', '쎃'),
+ ('쎅', '쎟'),
+ ('쎡', '쎻'),
+ ('쎽', '쏗'),
+ ('쏙', '쏳'),
+ ('쏵', '쐏'),
+ ('쐑', '쐫'),
+ ('쐭', '쑇'),
+ ('쑉', '쑣'),
+ ('쑥', '쑿'),
+ ('쒁', '쒛'),
+ ('쒝', '쒷'),
+ ('쒹', '쓓'),
+ ('쓕', '쓯'),
+ ('쓱', '씋'),
+ ('씍', '씧'),
+ ('씩', '앃'),
+ ('악', '앟'),
+ ('액', '앻'),
+ ('약', '얗'),
+ ('얙', '얳'),
+ ('억', '엏'),
+ ('엑', '엫'),
+ ('역', '옇'),
+ ('옉', '옣'),
+ ('옥', '옿'),
+ ('왁', '왛'),
+ ('왝', '왷'),
+ ('왹', '욓'),
+ ('욕', '욯'),
+ ('욱', '웋'),
+ ('웍', '웧'),
+ ('웩', '윃'),
+ ('윅', '윟'),
+ ('육', '윻'),
+ ('윽', '읗'),
+ ('읙', '읳'),
+ ('익', '잏'),
+ ('작', '잫'),
+ ('잭', '쟇'),
+ ('쟉', '쟣'),
+ ('쟥', '쟿'),
+ ('적', '젛'),
+ ('젝', '젷'),
+ ('젹', '졓'),
+ ('졕', '졯'),
+ ('족', '좋'),
+ ('좍', '좧'),
+ ('좩', '죃'),
+ ('죅', '죟'),
+ ('죡', '죻'),
+ ('죽', '줗'),
+ ('줙', '줳'),
+ ('줵', '쥏'),
+ ('쥑', '쥫'),
+ ('쥭', '즇'),
+ ('즉', '즣'),
+ ('즥', '즿'),
+ ('직', '짛'),
+ ('짝', '짷'),
+ ('짹', '쨓'),
+ ('쨕', '쨯'),
+ ('쨱', '쩋'),
+ ('쩍', '쩧'),
+ ('쩩', '쪃'),
+ ('쪅', '쪟'),
+ ('쪡', '쪻'),
+ ('쪽', '쫗'),
+ ('쫙', '쫳'),
+ ('쫵', '쬏'),
+ ('쬑', '쬫'),
+ ('쬭', '쭇'),
+ ('쭉', '쭣'),
+ ('쭥', '쭿'),
+ ('쮁', '쮛'),
+ ('쮝', '쮷'),
+ ('쮹', '쯓'),
+ ('쯕', '쯯'),
+ ('쯱', '찋'),
+ ('찍', '찧'),
+ ('착', '챃'),
+ ('책', '챟'),
+ ('챡', '챻'),
+ ('챽', '첗'),
+ ('척', '첳'),
+ ('첵', '쳏'),
+ ('쳑', '쳫'),
+ ('쳭', '촇'),
+ ('촉', '촣'),
+ ('촥', '촿'),
+ ('쵁', '쵛'),
+ ('쵝', '쵷'),
+ ('쵹', '춓'),
+ ('축', '춯'),
+ ('춱', '췋'),
+ ('췍', '췧'),
+ ('췩', '츃'),
+ ('츅', '츟'),
+ ('측', '츻'),
+ ('츽', '칗'),
+ ('칙', '칳'),
+ ('칵', '캏'),
+ ('캑', '캫'),
+ ('캭', '컇'),
+ ('컉', '컣'),
+ ('컥', '컿'),
+ ('켁', '켛'),
+ ('켝', '켷'),
+ ('켹', '콓'),
+ ('콕', '콯'),
+ ('콱', '쾋'),
+ ('쾍', '쾧'),
+ ('쾩', '쿃'),
+ ('쿅', '쿟'),
+ ('쿡', '쿻'),
+ ('쿽', '퀗'),
+ ('퀙', '퀳'),
+ ('퀵', '큏'),
+ ('큑', '큫'),
+ ('큭', '킇'),
+ ('킉', '킣'),
+ ('킥', '킿'),
+ ('탁', '탛'),
+ ('택', '탷'),
+ ('탹', '턓'),
+ ('턕', '턯'),
+ ('턱', '텋'),
+ ('텍', '텧'),
+ ('텩', '톃'),
+ ('톅', '톟'),
+ ('톡', '톻'),
+ ('톽', '퇗'),
+ ('퇙', '퇳'),
+ ('퇵', '툏'),
+ ('툑', '툫'),
+ ('툭', '퉇'),
+ ('퉉', '퉣'),
+ ('퉥', '퉿'),
+ ('튁', '튛'),
+ ('튝', '튷'),
+ ('특', '틓'),
+ ('틕', '틯'),
+ ('틱', '팋'),
+ ('팍', '팧'),
+ ('팩', '퍃'),
+ ('퍅', '퍟'),
+ ('퍡', '퍻'),
+ ('퍽', '펗'),
+ ('펙', '펳'),
+ ('펵', '폏'),
+ ('폑', '폫'),
+ ('폭', '퐇'),
+ ('퐉', '퐣'),
+ ('퐥', '퐿'),
+ ('푁', '푛'),
+ ('푝', '푷'),
+ ('푹', '풓'),
+ ('풕', '풯'),
+ ('풱', '퓋'),
+ ('퓍', '퓧'),
+ ('퓩', '픃'),
+ ('픅', '픟'),
+ ('픡', '픻'),
+ ('픽', '핗'),
+ ('학', '핳'),
+ ('핵', '햏'),
+ ('햑', '햫'),
+ ('햭', '헇'),
+ ('헉', '헣'),
+ ('헥', '헿'),
+ ('혁', '혛'),
+ ('혝', '혷'),
+ ('혹', '홓'),
+ ('확', '홯'),
+ ('홱', '횋'),
+ ('획', '횧'),
+ ('횩', '훃'),
+ ('훅', '훟'),
+ ('훡', '훻'),
+ ('훽', '휗'),
+ ('휙', '휳'),
+ ('휵', '흏'),
+ ('흑', '흫'),
+ ('흭', '힇'),
+ ('힉', '힣'),
+];
+
+pub const PREPEND: &'static [(char, char)] = &[
+ ('\u{600}', '\u{605}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{70f}', '\u{70f}'),
+ ('\u{890}', '\u{891}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('ൎ', 'ൎ'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110cd}', '\u{110cd}'),
+ ('𑇂', '𑇃'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑨺', '𑨺'),
+ ('𑪄', '𑪉'),
+ ('𑵆', '𑵆'),
+ ('𑼂', '𑼂'),
+];
+
+pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')];
+
+pub const SPACINGMARK: &'static [(char, char)] = &[
+ ('ः', 'ः'),
+ ('ऻ', 'ऻ'),
+ ('ा', 'ी'),
+ ('ॉ', 'ौ'),
+ ('ॎ', 'ॏ'),
+ ('ং', 'ঃ'),
+ ('ি', 'ী'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৌ'),
+ ('ਃ', 'ਃ'),
+ ('ਾ', 'ੀ'),
+ ('ઃ', 'ઃ'),
+ ('ા', 'ી'),
+ ('ૉ', 'ૉ'),
+ ('ો', 'ૌ'),
+ ('ଂ', 'ଃ'),
+ ('ୀ', 'ୀ'),
+ ('େ', 'ୈ'),
+ ('ୋ', 'ୌ'),
+ ('ி', 'ி'),
+ ('ு', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', 'ௌ'),
+ ('ఁ', 'ః'),
+ ('ు', 'ౄ'),
+ ('ಂ', 'ಃ'),
+ ('ಾ', 'ಾ'),
+ ('ೀ', 'ು'),
+ ('ೃ', 'ೄ'),
+ ('ೇ', 'ೈ'),
+ ('ೊ', 'ೋ'),
+ ('ೳ', 'ೳ'),
+ ('ം', 'ഃ'),
+ ('ി', 'ീ'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൌ'),
+ ('ං', 'ඃ'),
+ ('ැ', 'ෑ'),
+ ('ෘ', 'ෞ'),
+ ('ෲ', 'ෳ'),
+ ('ำ', 'ำ'),
+ ('ຳ', 'ຳ'),
+ ('༾', '༿'),
+ ('ཿ', 'ཿ'),
+ ('ေ', 'ေ'),
+ ('ျ', 'ြ'),
+ ('ၖ', 'ၗ'),
+ ('ႄ', 'ႄ'),
+ ('᜕', '᜕'),
+ ('᜴', '᜴'),
+ ('ា', 'ា'),
+ ('ើ', 'ៅ'),
+ ('ះ', 'ៈ'),
+ ('ᤣ', 'ᤦ'),
+ ('ᤩ', 'ᤫ'),
+ ('ᤰ', 'ᤱ'),
+ ('ᤳ', 'ᤸ'),
+ ('ᨙ', 'ᨚ'),
+ ('ᩕ', 'ᩕ'),
+ ('ᩗ', 'ᩗ'),
+ ('ᩭ', 'ᩲ'),
+ ('ᬄ', 'ᬄ'),
+ ('ᬻ', 'ᬻ'),
+ ('ᬽ', 'ᭁ'),
+ ('ᭃ', '᭄'),
+ ('ᮂ', 'ᮂ'),
+ ('ᮡ', 'ᮡ'),
+ ('ᮦ', 'ᮧ'),
+ ('᮪', '᮪'),
+ ('ᯧ', 'ᯧ'),
+ ('ᯪ', 'ᯬ'),
+ ('ᯮ', 'ᯮ'),
+ ('᯲', '᯳'),
+ ('ᰤ', 'ᰫ'),
+ ('ᰴ', 'ᰵ'),
+ ('᳡', '᳡'),
+ ('᳷', '᳷'),
+ ('ꠣ', 'ꠤ'),
+ ('ꠧ', 'ꠧ'),
+ ('ꢀ', 'ꢁ'),
+ ('ꢴ', 'ꣃ'),
+ ('ꥒ', '꥓'),
+ ('ꦃ', 'ꦃ'),
+ ('ꦴ', 'ꦵ'),
+ ('ꦺ', 'ꦻ'),
+ ('ꦾ', '꧀'),
+ ('ꨯ', 'ꨰ'),
+ ('ꨳ', 'ꨴ'),
+ ('ꩍ', 'ꩍ'),
+ ('ꫫ', 'ꫫ'),
+ ('ꫮ', 'ꫯ'),
+ ('ꫵ', 'ꫵ'),
+ ('ꯣ', 'ꯤ'),
+ ('ꯦ', 'ꯧ'),
+ ('ꯩ', 'ꯪ'),
+ ('꯬', '꯬'),
+ ('𑀀', '𑀀'),
+ ('𑀂', '𑀂'),
+ ('𑂂', '𑂂'),
+ ('𑂰', '𑂲'),
+ ('𑂷', '𑂸'),
+ ('𑄬', '𑄬'),
+ ('𑅅', '𑅆'),
+ ('𑆂', '𑆂'),
+ ('𑆳', '𑆵'),
+ ('𑆿', '𑇀'),
+ ('𑇎', '𑇎'),
+ ('𑈬', '𑈮'),
+ ('𑈲', '𑈳'),
+ ('𑈵', '𑈵'),
+ ('𑋠', '𑋢'),
+ ('𑌂', '𑌃'),
+ ('𑌿', '𑌿'),
+ ('𑍁', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍢', '𑍣'),
+ ('𑐵', '𑐷'),
+ ('𑑀', '𑑁'),
+ ('𑑅', '𑑅'),
+ ('𑒱', '𑒲'),
+ ('𑒹', '𑒹'),
+ ('𑒻', '𑒼'),
+ ('𑒾', '𑒾'),
+ ('𑓁', '𑓁'),
+ ('𑖰', '𑖱'),
+ ('𑖸', '𑖻'),
+ ('𑖾', '𑖾'),
+ ('𑘰', '𑘲'),
+ ('𑘻', '𑘼'),
+ ('𑘾', '𑘾'),
+ ('𑚬', '𑚬'),
+ ('𑚮', '𑚯'),
+ ('𑚶', '𑚶'),
+ ('𑜦', '𑜦'),
+ ('𑠬', '𑠮'),
+ ('𑠸', '𑠸'),
+ ('𑤱', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('𑤽', '𑤽'),
+ ('𑥀', '𑥀'),
+ ('𑥂', '𑥂'),
+ ('𑧑', '𑧓'),
+ ('𑧜', '𑧟'),
+ ('𑧤', '𑧤'),
+ ('𑨹', '𑨹'),
+ ('𑩗', '𑩘'),
+ ('𑪗', '𑪗'),
+ ('𑰯', '𑰯'),
+ ('𑰾', '𑰾'),
+ ('𑲩', '𑲩'),
+ ('𑲱', '𑲱'),
+ ('𑲴', '𑲴'),
+ ('𑶊', '𑶎'),
+ ('𑶓', '𑶔'),
+ ('𑶖', '𑶖'),
+ ('𑻵', '𑻶'),
+ ('𑼃', '𑼃'),
+ ('𑼴', '𑼵'),
+ ('𑼾', '𑼿'),
+ ('𑽁', '𑽁'),
+ ('𖽑', '𖾇'),
+ ('𖿰', '𖿱'),
+ ('𝅦', '𝅦'),
+ ('𝅭', '𝅭'),
+];
+
+pub const T: &'static [(char, char)] = &[('ᆨ', 'ᇿ'), ('ퟋ', 'ퟻ')];
+
+pub const V: &'static [(char, char)] = &[('ᅠ', 'ᆧ'), ('ힰ', 'ퟆ')];
+
+pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/mod.rs b/third_party/rust/regex-syntax/src/unicode_tables/mod.rs
new file mode 100644
index 0000000000..20736c7ac8
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/mod.rs
@@ -0,0 +1,57 @@
+#[cfg(feature = "unicode-age")]
+pub mod age;
+
+#[cfg(feature = "unicode-case")]
+pub mod case_folding_simple;
+
+#[cfg(feature = "unicode-gencat")]
+pub mod general_category;
+
+#[cfg(feature = "unicode-segment")]
+pub mod grapheme_cluster_break;
+
+#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
+#[allow(dead_code)]
+pub mod perl_decimal;
+
+#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
+#[allow(dead_code)]
+pub mod perl_space;
+
+#[cfg(feature = "unicode-perl")]
+pub mod perl_word;
+
+#[cfg(feature = "unicode-bool")]
+pub mod property_bool;
+
+#[cfg(any(
+ feature = "unicode-age",
+ feature = "unicode-bool",
+ feature = "unicode-gencat",
+ feature = "unicode-perl",
+ feature = "unicode-script",
+ feature = "unicode-segment",
+))]
+pub mod property_names;
+
+#[cfg(any(
+ feature = "unicode-age",
+ feature = "unicode-bool",
+ feature = "unicode-gencat",
+ feature = "unicode-perl",
+ feature = "unicode-script",
+ feature = "unicode-segment",
+))]
+pub mod property_values;
+
+#[cfg(feature = "unicode-script")]
+pub mod script;
+
+#[cfg(feature = "unicode-script")]
+pub mod script_extension;
+
+#[cfg(feature = "unicode-segment")]
+pub mod sentence_break;
+
+#[cfg(feature = "unicode-segment")]
+pub mod word_break;
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/perl_decimal.rs b/third_party/rust/regex-syntax/src/unicode_tables/perl_decimal.rs
new file mode 100644
index 0000000000..4f4c08a128
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/perl_decimal.rs
@@ -0,0 +1,77 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
+ &[("Decimal_Number", DECIMAL_NUMBER)];
+
+pub const DECIMAL_NUMBER: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('٠', '٩'),
+ ('۰', '۹'),
+ ('߀', '߉'),
+ ('०', '९'),
+ ('০', '৯'),
+ ('੦', '੯'),
+ ('૦', '૯'),
+ ('୦', '୯'),
+ ('௦', '௯'),
+ ('౦', '౯'),
+ ('೦', '೯'),
+ ('൦', '൯'),
+ ('෦', '෯'),
+ ('๐', '๙'),
+ ('໐', '໙'),
+ ('༠', '༩'),
+ ('၀', '၉'),
+ ('႐', '႙'),
+ ('០', '៩'),
+ ('᠐', '᠙'),
+ ('᥆', '᥏'),
+ ('᧐', '᧙'),
+ ('᪀', '᪉'),
+ ('᪐', '᪙'),
+ ('᭐', '᭙'),
+ ('᮰', '᮹'),
+ ('᱀', '᱉'),
+ ('᱐', '᱙'),
+ ('꘠', '꘩'),
+ ('꣐', '꣙'),
+ ('꤀', '꤉'),
+ ('꧐', '꧙'),
+ ('꧰', '꧹'),
+ ('꩐', '꩙'),
+ ('꯰', '꯹'),
+ ('0', '9'),
+ ('𐒠', '𐒩'),
+ ('𐴰', '𐴹'),
+ ('𑁦', '𑁯'),
+ ('𑃰', '𑃹'),
+ ('𑄶', '𑄿'),
+ ('𑇐', '𑇙'),
+ ('𑋰', '𑋹'),
+ ('𑑐', '𑑙'),
+ ('𑓐', '𑓙'),
+ ('𑙐', '𑙙'),
+ ('𑛀', '𑛉'),
+ ('𑜰', '𑜹'),
+ ('𑣠', '𑣩'),
+ ('𑥐', '𑥙'),
+ ('𑱐', '𑱙'),
+ ('𑵐', '𑵙'),
+ ('𑶠', '𑶩'),
+ ('𑽐', '𑽙'),
+ ('𖩠', '𖩩'),
+ ('𖫀', '𖫉'),
+ ('𖭐', '𖭙'),
+ ('𝟎', '𝟿'),
+ ('𞅀', '𞅉'),
+ ('𞋰', '𞋹'),
+ ('𞓰', '𞓹'),
+ ('𞥐', '𞥙'),
+ ('🯰', '🯹'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/perl_space.rs b/third_party/rust/regex-syntax/src/unicode_tables/perl_space.rs
new file mode 100644
index 0000000000..1741695795
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/perl_space.rs
@@ -0,0 +1,23 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate property-bool ucd-15.0.0 --chars --include whitespace
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
+ &[("White_Space", WHITE_SPACE)];
+
+pub const WHITE_SPACE: &'static [(char, char)] = &[
+ ('\t', '\r'),
+ (' ', ' '),
+ ('\u{85}', '\u{85}'),
+ ('\u{a0}', '\u{a0}'),
+ ('\u{1680}', '\u{1680}'),
+ ('\u{2000}', '\u{200a}'),
+ ('\u{2028}', '\u{2029}'),
+ ('\u{202f}', '\u{202f}'),
+ ('\u{205f}', '\u{205f}'),
+ ('\u{3000}', '\u{3000}'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/perl_word.rs b/third_party/rust/regex-syntax/src/unicode_tables/perl_word.rs
new file mode 100644
index 0000000000..c1b66bd9ab
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/perl_word.rs
@@ -0,0 +1,781 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate perl-word ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const PERL_WORD: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('\u{300}', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('\u{483}', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('\u{610}', '\u{61a}'),
+ ('ؠ', '٩'),
+ ('ٮ', 'ۓ'),
+ ('ە', '\u{6dc}'),
+ ('\u{6df}', '\u{6e8}'),
+ ('\u{6ea}', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', '\u{74a}'),
+ ('ݍ', 'ޱ'),
+ ('߀', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('ࠀ', '\u{82d}'),
+ ('ࡀ', '\u{85b}'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('\u{898}', '\u{8e1}'),
+ ('\u{8e3}', '\u{963}'),
+ ('०', '९'),
+ ('ॱ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('૦', '૯'),
+ ('ૹ', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('\u{b3c}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('୦', '୯'),
+ ('ୱ', 'ୱ'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௦', '௯'),
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('\u{c3c}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('౦', '౯'),
+ ('ಀ', 'ಃ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('\u{cbc}', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൎ'),
+ ('ൔ', '\u{d57}'),
+ ('ൟ', '\u{d63}'),
+ ('൦', '൯'),
+ ('ൺ', 'ൿ'),
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('෦', '෯'),
+ ('ෲ', 'ෳ'),
+ ('ก', '\u{e3a}'),
+ ('เ', '\u{e4e}'),
+ ('๐', '๙'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('\u{f18}', '\u{f19}'),
+ ('༠', '༩'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('က', '၉'),
+ ('ၐ', '\u{109d}'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('\u{135d}', '\u{135f}'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', '᜕'),
+ ('ᜟ', '᜴'),
+ ('ᝀ', '\u{1753}'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('\u{1772}', '\u{1773}'),
+ ('ក', '\u{17d3}'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', '\u{17dd}'),
+ ('០', '៩'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '᠙'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥆', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('᧐', '᧙'),
+ ('ᨀ', '\u{1a1b}'),
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', 'ᭌ'),
+ ('᭐', '᭙'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '᯳'),
+ ('ᰀ', '\u{1c37}'),
+ ('᱀', '᱉'),
+ ('ᱍ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', 'ᳺ'),
+ ('ᴀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('\u{200c}', '\u{200d}'),
+ ('‿', '⁀'),
+ ('⁔', '⁔'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⓐ', 'ⓩ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('\u{2d7f}', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '〇'),
+ ('〡', '\u{302f}'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('\u{3099}', '\u{309a}'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘫ'),
+ ('Ꙁ', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('ꙿ', '\u{a6f1}'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢀ', '\u{a8c5}'),
+ ('꣐', '꣙'),
+ ('\u{a8e0}', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', '\u{a92d}'),
+ ('ꤰ', '꥓'),
+ ('ꥠ', 'ꥼ'),
+ ('\u{a980}', '꧀'),
+ ('ꧏ', '꧙'),
+ ('ꧠ', 'ꧾ'),
+ ('ꨀ', '\u{aa36}'),
+ ('ꩀ', 'ꩍ'),
+ ('꩐', '꩙'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫯ'),
+ ('ꫲ', '\u{aaf6}'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('꯰', '꯹'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('︳', '︴'),
+ ('﹍', '﹏'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '\u{1037a}'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒠', '𐒩'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '\u{10ae6}'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '\u{10d27}'),
+ ('𐴰', '𐴹'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('𐺰', '𐺱'),
+ ('\u{10efd}', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '\u{10f50}'),
+ ('𐽰', '\u{10f85}'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀀', '\u{11046}'),
+ ('𑁦', '𑁵'),
+ ('\u{1107f}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('𑃐', '𑃨'),
+ ('𑃰', '𑃹'),
+ ('\u{11100}', '\u{11134}'),
+ ('𑄶', '𑄿'),
+ ('𑅄', '𑅇'),
+ ('𑅐', '\u{11173}'),
+ ('𑅶', '𑅶'),
+ ('\u{11180}', '𑇄'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '\u{11237}'),
+ ('\u{1123e}', '\u{11241}'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '\u{112ea}'),
+ ('𑋰', '𑋹'),
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133b}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐀', '𑑊'),
+ ('𑑐', '𑑙'),
+ ('\u{1145e}', '𑑡'),
+ ('𑒀', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑓐', '𑓙'),
+ ('𑖀', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('𑗘', '\u{115dd}'),
+ ('𑘀', '\u{11640}'),
+ ('𑙄', '𑙄'),
+ ('𑙐', '𑙙'),
+ ('𑚀', '𑚸'),
+ ('𑛀', '𑛉'),
+ ('𑜀', '𑜚'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑜰', '𑜹'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '\u{1183a}'),
+ ('𑢠', '𑣩'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{11943}'),
+ ('𑥐', '𑥙'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧡'),
+ ('𑧣', '𑧤'),
+ ('𑨀', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('𑩐', '\u{11a99}'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '\u{11c36}'),
+ ('\u{11c38}', '𑱀'),
+ ('𑱐', '𑱙'),
+ ('𑱲', '𑲏'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+ ('𑻠', '𑻶'),
+ ('\u{11f00}', '𑼐'),
+ ('𑼒', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('𑽐', '𑽙'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('\u{13440}', '\u{13455}'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩠', '𖩩'),
+ ('𖩰', '𖪾'),
+ ('𖫀', '𖫉'),
+ ('𖫐', '𖫭'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('𖬀', '\u{16b36}'),
+ ('𖭀', '𖭃'),
+ ('𖭐', '𖭙'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('\u{16f4f}', '𖾇'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('𞄀', '𞄬'),
+ ('\u{1e130}', '𞄽'),
+ ('𞅀', '𞅉'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '\u{1e2ae}'),
+ ('𞋀', '𞋹'),
+ ('𞓐', '𞓹'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('𞤀', '𞥋'),
+ ('𞥐', '𞥙'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+ ('🯰', '🯹'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/property_bool.rs b/third_party/rust/regex-syntax/src/unicode_tables/property_bool.rs
new file mode 100644
index 0000000000..a3e84b519c
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/property_bool.rs
@@ -0,0 +1,11367 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate property-bool ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("ASCII_Hex_Digit", ASCII_HEX_DIGIT),
+ ("Alphabetic", ALPHABETIC),
+ ("Bidi_Control", BIDI_CONTROL),
+ ("Bidi_Mirrored", BIDI_MIRRORED),
+ ("Case_Ignorable", CASE_IGNORABLE),
+ ("Cased", CASED),
+ ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED),
+ ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED),
+ ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED),
+ ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED),
+ ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED),
+ ("Dash", DASH),
+ ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT),
+ ("Deprecated", DEPRECATED),
+ ("Diacritic", DIACRITIC),
+ ("Emoji", EMOJI),
+ ("Emoji_Component", EMOJI_COMPONENT),
+ ("Emoji_Modifier", EMOJI_MODIFIER),
+ ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE),
+ ("Emoji_Presentation", EMOJI_PRESENTATION),
+ ("Extended_Pictographic", EXTENDED_PICTOGRAPHIC),
+ ("Extender", EXTENDER),
+ ("Grapheme_Base", GRAPHEME_BASE),
+ ("Grapheme_Extend", GRAPHEME_EXTEND),
+ ("Grapheme_Link", GRAPHEME_LINK),
+ ("Hex_Digit", HEX_DIGIT),
+ ("Hyphen", HYPHEN),
+ ("IDS_Binary_Operator", IDS_BINARY_OPERATOR),
+ ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR),
+ ("ID_Continue", ID_CONTINUE),
+ ("ID_Start", ID_START),
+ ("Ideographic", IDEOGRAPHIC),
+ ("Join_Control", JOIN_CONTROL),
+ ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION),
+ ("Lowercase", LOWERCASE),
+ ("Math", MATH),
+ ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT),
+ ("Other_Alphabetic", OTHER_ALPHABETIC),
+ ("Other_Default_Ignorable_Code_Point", OTHER_DEFAULT_IGNORABLE_CODE_POINT),
+ ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND),
+ ("Other_ID_Continue", OTHER_ID_CONTINUE),
+ ("Other_ID_Start", OTHER_ID_START),
+ ("Other_Lowercase", OTHER_LOWERCASE),
+ ("Other_Math", OTHER_MATH),
+ ("Other_Uppercase", OTHER_UPPERCASE),
+ ("Pattern_Syntax", PATTERN_SYNTAX),
+ ("Pattern_White_Space", PATTERN_WHITE_SPACE),
+ ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK),
+ ("Quotation_Mark", QUOTATION_MARK),
+ ("Radical", RADICAL),
+ ("Regional_Indicator", REGIONAL_INDICATOR),
+ ("Sentence_Terminal", SENTENCE_TERMINAL),
+ ("Soft_Dotted", SOFT_DOTTED),
+ ("Terminal_Punctuation", TERMINAL_PUNCTUATION),
+ ("Unified_Ideograph", UNIFIED_IDEOGRAPH),
+ ("Uppercase", UPPERCASE),
+ ("Variation_Selector", VARIATION_SELECTOR),
+ ("White_Space", WHITE_SPACE),
+ ("XID_Continue", XID_CONTINUE),
+ ("XID_Start", XID_START),
+];
+
+pub const ASCII_HEX_DIGIT: &'static [(char, char)] =
+ &[('0', '9'), ('A', 'F'), ('a', 'f')];
+
+pub const ALPHABETIC: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('\u{345}', '\u{345}'),
+ ('Ͱ', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('\u{5b0}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('\u{610}', '\u{61a}'),
+ ('ؠ', '\u{657}'),
+ ('\u{659}', '\u{65f}'),
+ ('ٮ', 'ۓ'),
+ ('ە', '\u{6dc}'),
+ ('\u{6e1}', '\u{6e8}'),
+ ('\u{6ed}', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', '\u{73f}'),
+ ('ݍ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠀ', '\u{817}'),
+ ('ࠚ', '\u{82c}'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('\u{8d4}', '\u{8df}'),
+ ('\u{8e3}', '\u{8e9}'),
+ ('\u{8f0}', 'ऻ'),
+ ('ऽ', 'ौ'),
+ ('ॎ', 'ॐ'),
+ ('\u{955}', '\u{963}'),
+ ('ॱ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৌ'),
+ ('ৎ', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4c}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('\u{a70}', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', 'ૌ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('ૹ', '\u{afc}'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', 'ୌ'),
+ ('\u{b56}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('ୱ', 'ୱ'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', 'ௌ'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4c}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('ಀ', 'ಃ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccc}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('ೱ', 'ೳ'),
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൌ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', '\u{d57}'),
+ ('ൟ', '\u{d63}'),
+ ('ൺ', 'ൿ'),
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', 'ෳ'),
+ ('ก', '\u{e3a}'),
+ ('เ', 'ๆ'),
+ ('\u{e4d}', '\u{e4d}'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', '\u{eb9}'),
+ ('\u{ebb}', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ecd}', '\u{ecd}'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f83}'),
+ ('ྈ', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('က', '\u{1036}'),
+ ('း', 'း'),
+ ('ျ', 'ဿ'),
+ ('ၐ', 'ႏ'),
+ ('ႚ', '\u{109d}'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', '\u{1713}'),
+ ('ᜟ', '\u{1733}'),
+ ('ᝀ', '\u{1753}'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('\u{1772}', '\u{1773}'),
+ ('ក', 'ឳ'),
+ ('ា', 'ៈ'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', 'ៜ'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', 'ᤸ'),
+ ('ᥐ', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('ᨀ', '\u{1a1b}'),
+ ('ᨠ', '\u{1a5e}'),
+ ('ᩡ', '\u{1a74}'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1abf}', '\u{1ac0}'),
+ ('\u{1acc}', '\u{1ace}'),
+ ('\u{1b00}', 'ᬳ'),
+ ('\u{1b35}', 'ᭃ'),
+ ('ᭅ', 'ᭌ'),
+ ('\u{1b80}', '\u{1ba9}'),
+ ('\u{1bac}', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᯧ', '\u{1bf1}'),
+ ('ᰀ', '\u{1c36}'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ᴀ', 'ᶿ'),
+ ('\u{1de7}', '\u{1df4}'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⓐ', 'ⓩ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '〇'),
+ ('〡', '〩'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('Ꙁ', 'ꙮ'),
+ ('\u{a674}', '\u{a67b}'),
+ ('ꙿ', 'ꛯ'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠅ'),
+ ('ꠇ', 'ꠧ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢀ', 'ꣃ'),
+ ('\u{a8c5}', '\u{a8c5}'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', '\u{a8ff}'),
+ ('ꤊ', '\u{a92a}'),
+ ('ꤰ', 'ꥒ'),
+ ('ꥠ', 'ꥼ'),
+ ('\u{a980}', 'ꦲ'),
+ ('ꦴ', 'ꦿ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧠ', 'ꧯ'),
+ ('ꧺ', 'ꧾ'),
+ ('ꨀ', '\u{aa36}'),
+ ('ꩀ', 'ꩍ'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', '\u{aabe}'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫯ'),
+ ('ꫲ', 'ꫵ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯪ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '\u{1037a}'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '\u{10d27}'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀀', '\u{11045}'),
+ ('𑁱', '𑁵'),
+ ('\u{11080}', '𑂸'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('𑃐', '𑃨'),
+ ('\u{11100}', '\u{11132}'),
+ ('𑅄', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('\u{11180}', '𑆿'),
+ ('𑇁', '𑇄'),
+ ('𑇎', '\u{111cf}'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '\u{11234}'),
+ ('\u{11237}', '\u{11237}'),
+ ('\u{1123e}', '\u{11241}'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '\u{112e8}'),
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍌'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('𑐀', '𑑁'),
+ ('\u{11443}', '𑑅'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑓁'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '\u{115b5}'),
+ ('𑖸', '𑖾'),
+ ('𑗘', '\u{115dd}'),
+ ('𑘀', '𑘾'),
+ ('\u{11640}', '\u{11640}'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '\u{116b5}'),
+ ('𑚸', '𑚸'),
+ ('𑜀', '𑜚'),
+ ('\u{1171d}', '\u{1172a}'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '𑠸'),
+ ('𑢠', '𑣟'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{1193c}'),
+ ('𑤿', '𑥂'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧟'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧤'),
+ ('𑨀', '𑨲'),
+ ('\u{11a35}', '\u{11a3e}'),
+ ('𑩐', '𑪗'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '\u{11c36}'),
+ ('\u{11c38}', '𑰾'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d41}'),
+ ('\u{11d43}', '\u{11d43}'),
+ ('𑵆', '\u{11d47}'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶖'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻶'),
+ ('\u{11f00}', '𑼐'),
+ ('𑼒', '\u{11f3a}'),
+ ('𑼾', '\u{11f40}'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭀', '𖭃'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('\u{16f4f}', '𖾇'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𖿰', '𖿱'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('\u{1bc9e}', '\u{1bc9e}'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓫'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞤀', '𞥃'),
+ ('\u{1e947}', '\u{1e947}'),
+ ('𞥋', '𞥋'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const BIDI_CONTROL: &'static [(char, char)] = &[
+ ('\u{61c}', '\u{61c}'),
+ ('\u{200e}', '\u{200f}'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2066}', '\u{2069}'),
+];
+
+pub const BIDI_MIRRORED: &'static [(char, char)] = &[
+ ('(', ')'),
+ ('<', '<'),
+ ('>', '>'),
+ ('[', '['),
+ (']', ']'),
+ ('{', '{'),
+ ('}', '}'),
+ ('«', '«'),
+ ('»', '»'),
+ ('༺', '༽'),
+ ('᚛', '᚜'),
+ ('‹', '›'),
+ ('⁅', '⁆'),
+ ('⁽', '⁾'),
+ ('₍', '₎'),
+ ('⅀', '⅀'),
+ ('∁', '∄'),
+ ('∈', '∍'),
+ ('∑', '∑'),
+ ('∕', '∖'),
+ ('√', '∝'),
+ ('∟', '∢'),
+ ('∤', '∤'),
+ ('∦', '∦'),
+ ('∫', '∳'),
+ ('∹', '∹'),
+ ('∻', '≌'),
+ ('≒', '≕'),
+ ('≟', '≠'),
+ ('≢', '≢'),
+ ('≤', '≫'),
+ ('≮', '⊌'),
+ ('⊏', '⊒'),
+ ('⊘', '⊘'),
+ ('⊢', '⊣'),
+ ('⊦', '⊸'),
+ ('⊾', '⊿'),
+ ('⋉', '⋍'),
+ ('⋐', '⋑'),
+ ('⋖', '⋭'),
+ ('⋰', '⋿'),
+ ('⌈', '⌋'),
+ ('⌠', '⌡'),
+ ('〈', '〉'),
+ ('❨', '❵'),
+ ('⟀', '⟀'),
+ ('⟃', '⟆'),
+ ('⟈', '⟉'),
+ ('⟋', '⟍'),
+ ('⟓', '⟖'),
+ ('⟜', '⟞'),
+ ('⟢', '⟯'),
+ ('⦃', '⦘'),
+ ('⦛', '⦠'),
+ ('⦢', '⦯'),
+ ('⦸', '⦸'),
+ ('⧀', '⧅'),
+ ('⧉', '⧉'),
+ ('⧎', '⧒'),
+ ('⧔', '⧕'),
+ ('⧘', '⧜'),
+ ('⧡', '⧡'),
+ ('⧣', '⧥'),
+ ('⧨', '⧩'),
+ ('⧴', '⧹'),
+ ('⧼', '⧽'),
+ ('⨊', '⨜'),
+ ('⨞', '⨡'),
+ ('⨤', '⨤'),
+ ('⨦', '⨦'),
+ ('⨩', '⨩'),
+ ('⨫', '⨮'),
+ ('⨴', '⨵'),
+ ('⨼', '⨾'),
+ ('⩗', '⩘'),
+ ('⩤', '⩥'),
+ ('⩪', '⩭'),
+ ('⩯', '⩰'),
+ ('⩳', '⩴'),
+ ('⩹', '⪣'),
+ ('⪦', '⪭'),
+ ('⪯', '⫖'),
+ ('⫝̸', '⫝̸'),
+ ('⫞', '⫞'),
+ ('⫢', '⫦'),
+ ('⫬', '⫮'),
+ ('⫳', '⫳'),
+ ('⫷', '⫻'),
+ ('⫽', '⫽'),
+ ('⯾', '⯾'),
+ ('⸂', '⸅'),
+ ('⸉', '⸊'),
+ ('⸌', '⸍'),
+ ('⸜', '⸝'),
+ ('⸠', '⸩'),
+ ('⹕', '⹜'),
+ ('〈', '】'),
+ ('〔', '〛'),
+ ('﹙', '﹞'),
+ ('﹤', '﹥'),
+ ('(', ')'),
+ ('<', '<'),
+ ('>', '>'),
+ ('[', '['),
+ (']', ']'),
+ ('{', '{'),
+ ('}', '}'),
+ ('⦅', '⦆'),
+ ('「', '」'),
+ ('𝛛', '𝛛'),
+ ('𝜕', '𝜕'),
+ ('𝝏', '𝝏'),
+ ('𝞉', '𝞉'),
+ ('𝟃', '𝟃'),
+];
+
+pub const CASE_IGNORABLE: &'static [(char, char)] = &[
+ ('\'', '\''),
+ ('.', '.'),
+ (':', ':'),
+ ('^', '^'),
+ ('`', '`'),
+ ('¨', '¨'),
+ ('\u{ad}', '\u{ad}'),
+ ('¯', '¯'),
+ ('´', '´'),
+ ('·', '¸'),
+ ('ʰ', '\u{36f}'),
+ ('ʹ', '͵'),
+ ('ͺ', 'ͺ'),
+ ('΄', '΅'),
+ ('·', '·'),
+ ('\u{483}', '\u{489}'),
+ ('ՙ', 'ՙ'),
+ ('՟', '՟'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('״', '״'),
+ ('\u{600}', '\u{605}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{61c}', '\u{61c}'),
+ ('ـ', 'ـ'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dd}'),
+ ('\u{6df}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{70f}', '\u{70f}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('࢈', '࢈'),
+ ('\u{890}', '\u{891}'),
+ ('\u{898}', '\u{89f}'),
+ ('ࣉ', '\u{902}'),
+ ('\u{93a}', '\u{93a}'),
+ ('\u{93c}', '\u{93c}'),
+ ('\u{941}', '\u{948}'),
+ ('\u{94d}', '\u{94d}'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('ॱ', 'ॱ'),
+ ('\u{981}', '\u{981}'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9c1}', '\u{9c4}'),
+ ('\u{9cd}', '\u{9cd}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', '\u{a02}'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('\u{a41}', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', '\u{a82}'),
+ ('\u{abc}', '\u{abc}'),
+ ('\u{ac1}', '\u{ac5}'),
+ ('\u{ac7}', '\u{ac8}'),
+ ('\u{acd}', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', '\u{b01}'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3f}', '\u{b3f}'),
+ ('\u{b41}', '\u{b44}'),
+ ('\u{b4d}', '\u{b4d}'),
+ ('\u{b55}', '\u{b56}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bc0}', '\u{bc0}'),
+ ('\u{bcd}', '\u{bcd}'),
+ ('\u{c00}', '\u{c00}'),
+ ('\u{c04}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', '\u{c40}'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', '\u{c81}'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('\u{cbf}', '\u{cbf}'),
+ ('\u{cc6}', '\u{cc6}'),
+ ('\u{ccc}', '\u{ccd}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('\u{d00}', '\u{d01}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d41}', '\u{d44}'),
+ ('\u{d4d}', '\u{d4d}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', '\u{d81}'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dd2}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('ๆ', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('\u{f71}', '\u{f7e}'),
+ ('\u{f80}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('\u{102d}', '\u{1030}'),
+ ('\u{1032}', '\u{1037}'),
+ ('\u{1039}', '\u{103a}'),
+ ('\u{103d}', '\u{103e}'),
+ ('\u{1058}', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{1082}'),
+ ('\u{1085}', '\u{1086}'),
+ ('\u{108d}', '\u{108d}'),
+ ('\u{109d}', '\u{109d}'),
+ ('ჼ', 'ჼ'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '\u{1714}'),
+ ('\u{1732}', '\u{1733}'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17b5}'),
+ ('\u{17b7}', '\u{17bd}'),
+ ('\u{17c6}', '\u{17c6}'),
+ ('\u{17c9}', '\u{17d3}'),
+ ('ៗ', 'ៗ'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180f}'),
+ ('ᡃ', 'ᡃ'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', '\u{1922}'),
+ ('\u{1927}', '\u{1928}'),
+ ('\u{1932}', '\u{1932}'),
+ ('\u{1939}', '\u{193b}'),
+ ('\u{1a17}', '\u{1a18}'),
+ ('\u{1a1b}', '\u{1a1b}'),
+ ('\u{1a56}', '\u{1a56}'),
+ ('\u{1a58}', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a60}'),
+ ('\u{1a62}', '\u{1a62}'),
+ ('\u{1a65}', '\u{1a6c}'),
+ ('\u{1a73}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', '\u{1b03}'),
+ ('\u{1b34}', '\u{1b34}'),
+ ('\u{1b36}', '\u{1b3a}'),
+ ('\u{1b3c}', '\u{1b3c}'),
+ ('\u{1b42}', '\u{1b42}'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '\u{1b81}'),
+ ('\u{1ba2}', '\u{1ba5}'),
+ ('\u{1ba8}', '\u{1ba9}'),
+ ('\u{1bab}', '\u{1bad}'),
+ ('\u{1be6}', '\u{1be6}'),
+ ('\u{1be8}', '\u{1be9}'),
+ ('\u{1bed}', '\u{1bed}'),
+ ('\u{1bef}', '\u{1bf1}'),
+ ('\u{1c2c}', '\u{1c33}'),
+ ('\u{1c36}', '\u{1c37}'),
+ ('ᱸ', 'ᱽ'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce0}'),
+ ('\u{1ce2}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('ᴬ', 'ᵪ'),
+ ('ᵸ', 'ᵸ'),
+ ('ᶛ', '\u{1dff}'),
+ ('᾽', '᾽'),
+ ('᾿', '῁'),
+ ('῍', '῏'),
+ ('῝', '῟'),
+ ('῭', '`'),
+ ('´', '῾'),
+ ('\u{200b}', '\u{200f}'),
+ ('‘', '’'),
+ ('․', '․'),
+ ('‧', '‧'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2060}', '\u{2064}'),
+ ('\u{2066}', '\u{206f}'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('ⱼ', 'ⱽ'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('ⵯ', 'ⵯ'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '々'),
+ ('\u{302a}', '\u{302d}'),
+ ('〱', '〵'),
+ ('〻', '〻'),
+ ('\u{3099}', 'ゞ'),
+ ('ー', 'ヾ'),
+ ('ꀕ', 'ꀕ'),
+ ('ꓸ', 'ꓽ'),
+ ('ꘌ', 'ꘌ'),
+ ('\u{a66f}', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('ꙿ', 'ꙿ'),
+ ('ꚜ', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('꜀', '꜡'),
+ ('ꝰ', 'ꝰ'),
+ ('ꞈ', '꞊'),
+ ('ꟲ', 'ꟴ'),
+ ('ꟸ', 'ꟹ'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('\u{a825}', '\u{a826}'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('\u{a8c4}', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '\u{a951}'),
+ ('\u{a980}', '\u{a982}'),
+ ('\u{a9b3}', '\u{a9b3}'),
+ ('\u{a9b6}', '\u{a9b9}'),
+ ('\u{a9bc}', '\u{a9bd}'),
+ ('ꧏ', 'ꧏ'),
+ ('\u{a9e5}', 'ꧦ'),
+ ('\u{aa29}', '\u{aa2e}'),
+ ('\u{aa31}', '\u{aa32}'),
+ ('\u{aa35}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', '\u{aa4c}'),
+ ('ꩰ', 'ꩰ'),
+ ('\u{aa7c}', '\u{aa7c}'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('ꫝ', 'ꫝ'),
+ ('\u{aaec}', '\u{aaed}'),
+ ('ꫳ', 'ꫴ'),
+ ('\u{aaf6}', '\u{aaf6}'),
+ ('꭛', 'ꭟ'),
+ ('ꭩ', '꭫'),
+ ('\u{abe5}', '\u{abe5}'),
+ ('\u{abe8}', '\u{abe8}'),
+ ('\u{abed}', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('﮲', '﯂'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('︓', '︓'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('﹒', '﹒'),
+ ('﹕', '﹕'),
+ ('\u{feff}', '\u{feff}'),
+ (''', '''),
+ ('.', '.'),
+ (':', ':'),
+ ('^', '^'),
+ ('`', '`'),
+ ('ー', 'ー'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ (' ̄', ' ̄'),
+ ('\u{fff9}', '\u{fffb}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('\u{11001}', '\u{11001}'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '\u{11081}'),
+ ('\u{110b3}', '\u{110b6}'),
+ ('\u{110b9}', '\u{110ba}'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{110cd}', '\u{110cd}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{1112b}'),
+ ('\u{1112d}', '\u{11134}'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '\u{11181}'),
+ ('\u{111b6}', '\u{111be}'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('\u{111cf}', '\u{111cf}'),
+ ('\u{1122f}', '\u{11231}'),
+ ('\u{11234}', '\u{11234}'),
+ ('\u{11236}', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112df}'),
+ ('\u{112e3}', '\u{112ea}'),
+ ('\u{11300}', '\u{11301}'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{11340}', '\u{11340}'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('\u{11438}', '\u{1143f}'),
+ ('\u{11442}', '\u{11444}'),
+ ('\u{11446}', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b3}', '\u{114b8}'),
+ ('\u{114ba}', '\u{114ba}'),
+ ('\u{114bf}', '\u{114c0}'),
+ ('\u{114c2}', '\u{114c3}'),
+ ('\u{115b2}', '\u{115b5}'),
+ ('\u{115bc}', '\u{115bd}'),
+ ('\u{115bf}', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('\u{11633}', '\u{1163a}'),
+ ('\u{1163d}', '\u{1163d}'),
+ ('\u{1163f}', '\u{11640}'),
+ ('\u{116ab}', '\u{116ab}'),
+ ('\u{116ad}', '\u{116ad}'),
+ ('\u{116b0}', '\u{116b5}'),
+ ('\u{116b7}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1171f}'),
+ ('\u{11722}', '\u{11725}'),
+ ('\u{11727}', '\u{1172b}'),
+ ('\u{1182f}', '\u{11837}'),
+ ('\u{11839}', '\u{1183a}'),
+ ('\u{1193b}', '\u{1193c}'),
+ ('\u{1193e}', '\u{1193e}'),
+ ('\u{11943}', '\u{11943}'),
+ ('\u{119d4}', '\u{119d7}'),
+ ('\u{119da}', '\u{119db}'),
+ ('\u{119e0}', '\u{119e0}'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '\u{11a38}'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a56}'),
+ ('\u{11a59}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a96}'),
+ ('\u{11a98}', '\u{11a99}'),
+ ('\u{11c30}', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3d}'),
+ ('\u{11c3f}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('\u{11caa}', '\u{11cb0}'),
+ ('\u{11cb2}', '\u{11cb3}'),
+ ('\u{11cb5}', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('\u{11d95}', '\u{11d95}'),
+ ('\u{11d97}', '\u{11d97}'),
+ ('\u{11ef3}', '\u{11ef4}'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('\u{11f36}', '\u{11f3a}'),
+ ('\u{11f40}', '\u{11f40}'),
+ ('\u{11f42}', '\u{11f42}'),
+ ('\u{13430}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('𖭀', '𖭃'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '\u{16fe4}'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('\u{1d173}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '𞄽'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('𞓫', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '𞥋'),
+ ('🏻', '🏿'),
+ ('\u{e0001}', '\u{e0001}'),
+ ('\u{e0020}', '\u{e007f}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const CASED: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ƺ'),
+ ('Ƽ', 'ƿ'),
+ ('DŽ', 'ʓ'),
+ ('ʕ', 'ʸ'),
+ ('ˀ', 'ˁ'),
+ ('ˠ', 'ˤ'),
+ ('\u{345}', '\u{345}'),
+ ('Ͱ', 'ͳ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՠ', 'ֈ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ჿ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᴀ', 'ᶿ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℴ'),
+ ('ℹ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ⅿ'),
+ ('Ↄ', 'ↄ'),
+ ('Ⓐ', 'ⓩ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('Ꙁ', 'ꙭ'),
+ ('Ꚁ', 'ꚝ'),
+ ('Ꜣ', 'ꞇ'),
+ ('Ꞌ', 'ꞎ'),
+ ('Ꞑ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꟶ'),
+ ('ꟸ', 'ꟺ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('𐐀', '𐑏'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐞀', '𐞀'),
+ ('𐞃', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𑢠', '𑣟'),
+ ('𖹀', '𖹿'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼉'),
+ ('𝼋', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞤀', '𞥃'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+];
+
+pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('µ', 'µ'),
+ ('À', 'Ö'),
+ ('Ø', 'ß'),
+ ('Ā', 'Ā'),
+ ('Ă', 'Ă'),
+ ('Ą', 'Ą'),
+ ('Ć', 'Ć'),
+ ('Ĉ', 'Ĉ'),
+ ('Ċ', 'Ċ'),
+ ('Č', 'Č'),
+ ('Ď', 'Ď'),
+ ('Đ', 'Đ'),
+ ('Ē', 'Ē'),
+ ('Ĕ', 'Ĕ'),
+ ('Ė', 'Ė'),
+ ('Ę', 'Ę'),
+ ('Ě', 'Ě'),
+ ('Ĝ', 'Ĝ'),
+ ('Ğ', 'Ğ'),
+ ('Ġ', 'Ġ'),
+ ('Ģ', 'Ģ'),
+ ('Ĥ', 'Ĥ'),
+ ('Ħ', 'Ħ'),
+ ('Ĩ', 'Ĩ'),
+ ('Ī', 'Ī'),
+ ('Ĭ', 'Ĭ'),
+ ('Į', 'Į'),
+ ('İ', 'İ'),
+ ('IJ', 'IJ'),
+ ('Ĵ', 'Ĵ'),
+ ('Ķ', 'Ķ'),
+ ('Ĺ', 'Ĺ'),
+ ('Ļ', 'Ļ'),
+ ('Ľ', 'Ľ'),
+ ('Ŀ', 'Ŀ'),
+ ('Ł', 'Ł'),
+ ('Ń', 'Ń'),
+ ('Ņ', 'Ņ'),
+ ('Ň', 'Ň'),
+ ('ʼn', 'Ŋ'),
+ ('Ō', 'Ō'),
+ ('Ŏ', 'Ŏ'),
+ ('Ő', 'Ő'),
+ ('Œ', 'Œ'),
+ ('Ŕ', 'Ŕ'),
+ ('Ŗ', 'Ŗ'),
+ ('Ř', 'Ř'),
+ ('Ś', 'Ś'),
+ ('Ŝ', 'Ŝ'),
+ ('Ş', 'Ş'),
+ ('Š', 'Š'),
+ ('Ţ', 'Ţ'),
+ ('Ť', 'Ť'),
+ ('Ŧ', 'Ŧ'),
+ ('Ũ', 'Ũ'),
+ ('Ū', 'Ū'),
+ ('Ŭ', 'Ŭ'),
+ ('Ů', 'Ů'),
+ ('Ű', 'Ű'),
+ ('Ų', 'Ų'),
+ ('Ŵ', 'Ŵ'),
+ ('Ŷ', 'Ŷ'),
+ ('Ÿ', 'Ź'),
+ ('Ż', 'Ż'),
+ ('Ž', 'Ž'),
+ ('ſ', 'ſ'),
+ ('Ɓ', 'Ƃ'),
+ ('Ƅ', 'Ƅ'),
+ ('Ɔ', 'Ƈ'),
+ ('Ɖ', 'Ƌ'),
+ ('Ǝ', 'Ƒ'),
+ ('Ɠ', 'Ɣ'),
+ ('Ɩ', 'Ƙ'),
+ ('Ɯ', 'Ɲ'),
+ ('Ɵ', 'Ơ'),
+ ('Ƣ', 'Ƣ'),
+ ('Ƥ', 'Ƥ'),
+ ('Ʀ', 'Ƨ'),
+ ('Ʃ', 'Ʃ'),
+ ('Ƭ', 'Ƭ'),
+ ('Ʈ', 'Ư'),
+ ('Ʊ', 'Ƴ'),
+ ('Ƶ', 'Ƶ'),
+ ('Ʒ', 'Ƹ'),
+ ('Ƽ', 'Ƽ'),
+ ('DŽ', 'Dž'),
+ ('LJ', 'Lj'),
+ ('NJ', 'Nj'),
+ ('Ǎ', 'Ǎ'),
+ ('Ǐ', 'Ǐ'),
+ ('Ǒ', 'Ǒ'),
+ ('Ǔ', 'Ǔ'),
+ ('Ǖ', 'Ǖ'),
+ ('Ǘ', 'Ǘ'),
+ ('Ǚ', 'Ǚ'),
+ ('Ǜ', 'Ǜ'),
+ ('Ǟ', 'Ǟ'),
+ ('Ǡ', 'Ǡ'),
+ ('Ǣ', 'Ǣ'),
+ ('Ǥ', 'Ǥ'),
+ ('Ǧ', 'Ǧ'),
+ ('Ǩ', 'Ǩ'),
+ ('Ǫ', 'Ǫ'),
+ ('Ǭ', 'Ǭ'),
+ ('Ǯ', 'Ǯ'),
+ ('DZ', 'Dz'),
+ ('Ǵ', 'Ǵ'),
+ ('Ƕ', 'Ǹ'),
+ ('Ǻ', 'Ǻ'),
+ ('Ǽ', 'Ǽ'),
+ ('Ǿ', 'Ǿ'),
+ ('Ȁ', 'Ȁ'),
+ ('Ȃ', 'Ȃ'),
+ ('Ȅ', 'Ȅ'),
+ ('Ȇ', 'Ȇ'),
+ ('Ȉ', 'Ȉ'),
+ ('Ȋ', 'Ȋ'),
+ ('Ȍ', 'Ȍ'),
+ ('Ȏ', 'Ȏ'),
+ ('Ȑ', 'Ȑ'),
+ ('Ȓ', 'Ȓ'),
+ ('Ȕ', 'Ȕ'),
+ ('Ȗ', 'Ȗ'),
+ ('Ș', 'Ș'),
+ ('Ț', 'Ț'),
+ ('Ȝ', 'Ȝ'),
+ ('Ȟ', 'Ȟ'),
+ ('Ƞ', 'Ƞ'),
+ ('Ȣ', 'Ȣ'),
+ ('Ȥ', 'Ȥ'),
+ ('Ȧ', 'Ȧ'),
+ ('Ȩ', 'Ȩ'),
+ ('Ȫ', 'Ȫ'),
+ ('Ȭ', 'Ȭ'),
+ ('Ȯ', 'Ȯ'),
+ ('Ȱ', 'Ȱ'),
+ ('Ȳ', 'Ȳ'),
+ ('Ⱥ', 'Ȼ'),
+ ('Ƚ', 'Ⱦ'),
+ ('Ɂ', 'Ɂ'),
+ ('Ƀ', 'Ɇ'),
+ ('Ɉ', 'Ɉ'),
+ ('Ɋ', 'Ɋ'),
+ ('Ɍ', 'Ɍ'),
+ ('Ɏ', 'Ɏ'),
+ ('\u{345}', '\u{345}'),
+ ('Ͱ', 'Ͱ'),
+ ('Ͳ', 'Ͳ'),
+ ('Ͷ', 'Ͷ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ώ'),
+ ('Α', 'Ρ'),
+ ('Σ', 'Ϋ'),
+ ('ς', 'ς'),
+ ('Ϗ', 'ϑ'),
+ ('ϕ', 'ϖ'),
+ ('Ϙ', 'Ϙ'),
+ ('Ϛ', 'Ϛ'),
+ ('Ϝ', 'Ϝ'),
+ ('Ϟ', 'Ϟ'),
+ ('Ϡ', 'Ϡ'),
+ ('Ϣ', 'Ϣ'),
+ ('Ϥ', 'Ϥ'),
+ ('Ϧ', 'Ϧ'),
+ ('Ϩ', 'Ϩ'),
+ ('Ϫ', 'Ϫ'),
+ ('Ϭ', 'Ϭ'),
+ ('Ϯ', 'Ϯ'),
+ ('ϰ', 'ϱ'),
+ ('ϴ', 'ϵ'),
+ ('Ϸ', 'Ϸ'),
+ ('Ϲ', 'Ϻ'),
+ ('Ͻ', 'Я'),
+ ('Ѡ', 'Ѡ'),
+ ('Ѣ', 'Ѣ'),
+ ('Ѥ', 'Ѥ'),
+ ('Ѧ', 'Ѧ'),
+ ('Ѩ', 'Ѩ'),
+ ('Ѫ', 'Ѫ'),
+ ('Ѭ', 'Ѭ'),
+ ('Ѯ', 'Ѯ'),
+ ('Ѱ', 'Ѱ'),
+ ('Ѳ', 'Ѳ'),
+ ('Ѵ', 'Ѵ'),
+ ('Ѷ', 'Ѷ'),
+ ('Ѹ', 'Ѹ'),
+ ('Ѻ', 'Ѻ'),
+ ('Ѽ', 'Ѽ'),
+ ('Ѿ', 'Ѿ'),
+ ('Ҁ', 'Ҁ'),
+ ('Ҋ', 'Ҋ'),
+ ('Ҍ', 'Ҍ'),
+ ('Ҏ', 'Ҏ'),
+ ('Ґ', 'Ґ'),
+ ('Ғ', 'Ғ'),
+ ('Ҕ', 'Ҕ'),
+ ('Җ', 'Җ'),
+ ('Ҙ', 'Ҙ'),
+ ('Қ', 'Қ'),
+ ('Ҝ', 'Ҝ'),
+ ('Ҟ', 'Ҟ'),
+ ('Ҡ', 'Ҡ'),
+ ('Ң', 'Ң'),
+ ('Ҥ', 'Ҥ'),
+ ('Ҧ', 'Ҧ'),
+ ('Ҩ', 'Ҩ'),
+ ('Ҫ', 'Ҫ'),
+ ('Ҭ', 'Ҭ'),
+ ('Ү', 'Ү'),
+ ('Ұ', 'Ұ'),
+ ('Ҳ', 'Ҳ'),
+ ('Ҵ', 'Ҵ'),
+ ('Ҷ', 'Ҷ'),
+ ('Ҹ', 'Ҹ'),
+ ('Һ', 'Һ'),
+ ('Ҽ', 'Ҽ'),
+ ('Ҿ', 'Ҿ'),
+ ('Ӏ', 'Ӂ'),
+ ('Ӄ', 'Ӄ'),
+ ('Ӆ', 'Ӆ'),
+ ('Ӈ', 'Ӈ'),
+ ('Ӊ', 'Ӊ'),
+ ('Ӌ', 'Ӌ'),
+ ('Ӎ', 'Ӎ'),
+ ('Ӑ', 'Ӑ'),
+ ('Ӓ', 'Ӓ'),
+ ('Ӕ', 'Ӕ'),
+ ('Ӗ', 'Ӗ'),
+ ('Ә', 'Ә'),
+ ('Ӛ', 'Ӛ'),
+ ('Ӝ', 'Ӝ'),
+ ('Ӟ', 'Ӟ'),
+ ('Ӡ', 'Ӡ'),
+ ('Ӣ', 'Ӣ'),
+ ('Ӥ', 'Ӥ'),
+ ('Ӧ', 'Ӧ'),
+ ('Ө', 'Ө'),
+ ('Ӫ', 'Ӫ'),
+ ('Ӭ', 'Ӭ'),
+ ('Ӯ', 'Ӯ'),
+ ('Ӱ', 'Ӱ'),
+ ('Ӳ', 'Ӳ'),
+ ('Ӵ', 'Ӵ'),
+ ('Ӷ', 'Ӷ'),
+ ('Ӹ', 'Ӹ'),
+ ('Ӻ', 'Ӻ'),
+ ('Ӽ', 'Ӽ'),
+ ('Ӿ', 'Ӿ'),
+ ('Ԁ', 'Ԁ'),
+ ('Ԃ', 'Ԃ'),
+ ('Ԅ', 'Ԅ'),
+ ('Ԇ', 'Ԇ'),
+ ('Ԉ', 'Ԉ'),
+ ('Ԋ', 'Ԋ'),
+ ('Ԍ', 'Ԍ'),
+ ('Ԏ', 'Ԏ'),
+ ('Ԑ', 'Ԑ'),
+ ('Ԓ', 'Ԓ'),
+ ('Ԕ', 'Ԕ'),
+ ('Ԗ', 'Ԗ'),
+ ('Ԙ', 'Ԙ'),
+ ('Ԛ', 'Ԛ'),
+ ('Ԝ', 'Ԝ'),
+ ('Ԟ', 'Ԟ'),
+ ('Ԡ', 'Ԡ'),
+ ('Ԣ', 'Ԣ'),
+ ('Ԥ', 'Ԥ'),
+ ('Ԧ', 'Ԧ'),
+ ('Ԩ', 'Ԩ'),
+ ('Ԫ', 'Ԫ'),
+ ('Ԭ', 'Ԭ'),
+ ('Ԯ', 'Ԯ'),
+ ('Ա', 'Ֆ'),
+ ('և', 'և'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('Ḁ', 'Ḁ'),
+ ('Ḃ', 'Ḃ'),
+ ('Ḅ', 'Ḅ'),
+ ('Ḇ', 'Ḇ'),
+ ('Ḉ', 'Ḉ'),
+ ('Ḋ', 'Ḋ'),
+ ('Ḍ', 'Ḍ'),
+ ('Ḏ', 'Ḏ'),
+ ('Ḑ', 'Ḑ'),
+ ('Ḓ', 'Ḓ'),
+ ('Ḕ', 'Ḕ'),
+ ('Ḗ', 'Ḗ'),
+ ('Ḙ', 'Ḙ'),
+ ('Ḛ', 'Ḛ'),
+ ('Ḝ', 'Ḝ'),
+ ('Ḟ', 'Ḟ'),
+ ('Ḡ', 'Ḡ'),
+ ('Ḣ', 'Ḣ'),
+ ('Ḥ', 'Ḥ'),
+ ('Ḧ', 'Ḧ'),
+ ('Ḩ', 'Ḩ'),
+ ('Ḫ', 'Ḫ'),
+ ('Ḭ', 'Ḭ'),
+ ('Ḯ', 'Ḯ'),
+ ('Ḱ', 'Ḱ'),
+ ('Ḳ', 'Ḳ'),
+ ('Ḵ', 'Ḵ'),
+ ('Ḷ', 'Ḷ'),
+ ('Ḹ', 'Ḹ'),
+ ('Ḻ', 'Ḻ'),
+ ('Ḽ', 'Ḽ'),
+ ('Ḿ', 'Ḿ'),
+ ('Ṁ', 'Ṁ'),
+ ('Ṃ', 'Ṃ'),
+ ('Ṅ', 'Ṅ'),
+ ('Ṇ', 'Ṇ'),
+ ('Ṉ', 'Ṉ'),
+ ('Ṋ', 'Ṋ'),
+ ('Ṍ', 'Ṍ'),
+ ('Ṏ', 'Ṏ'),
+ ('Ṑ', 'Ṑ'),
+ ('Ṓ', 'Ṓ'),
+ ('Ṕ', 'Ṕ'),
+ ('Ṗ', 'Ṗ'),
+ ('Ṙ', 'Ṙ'),
+ ('Ṛ', 'Ṛ'),
+ ('Ṝ', 'Ṝ'),
+ ('Ṟ', 'Ṟ'),
+ ('Ṡ', 'Ṡ'),
+ ('Ṣ', 'Ṣ'),
+ ('Ṥ', 'Ṥ'),
+ ('Ṧ', 'Ṧ'),
+ ('Ṩ', 'Ṩ'),
+ ('Ṫ', 'Ṫ'),
+ ('Ṭ', 'Ṭ'),
+ ('Ṯ', 'Ṯ'),
+ ('Ṱ', 'Ṱ'),
+ ('Ṳ', 'Ṳ'),
+ ('Ṵ', 'Ṵ'),
+ ('Ṷ', 'Ṷ'),
+ ('Ṹ', 'Ṹ'),
+ ('Ṻ', 'Ṻ'),
+ ('Ṽ', 'Ṽ'),
+ ('Ṿ', 'Ṿ'),
+ ('Ẁ', 'Ẁ'),
+ ('Ẃ', 'Ẃ'),
+ ('Ẅ', 'Ẅ'),
+ ('Ẇ', 'Ẇ'),
+ ('Ẉ', 'Ẉ'),
+ ('Ẋ', 'Ẋ'),
+ ('Ẍ', 'Ẍ'),
+ ('Ẏ', 'Ẏ'),
+ ('Ẑ', 'Ẑ'),
+ ('Ẓ', 'Ẓ'),
+ ('Ẕ', 'Ẕ'),
+ ('ẚ', 'ẛ'),
+ ('ẞ', 'ẞ'),
+ ('Ạ', 'Ạ'),
+ ('Ả', 'Ả'),
+ ('Ấ', 'Ấ'),
+ ('Ầ', 'Ầ'),
+ ('Ẩ', 'Ẩ'),
+ ('Ẫ', 'Ẫ'),
+ ('Ậ', 'Ậ'),
+ ('Ắ', 'Ắ'),
+ ('Ằ', 'Ằ'),
+ ('Ẳ', 'Ẳ'),
+ ('Ẵ', 'Ẵ'),
+ ('Ặ', 'Ặ'),
+ ('Ẹ', 'Ẹ'),
+ ('Ẻ', 'Ẻ'),
+ ('Ẽ', 'Ẽ'),
+ ('Ế', 'Ế'),
+ ('Ề', 'Ề'),
+ ('Ể', 'Ể'),
+ ('Ễ', 'Ễ'),
+ ('Ệ', 'Ệ'),
+ ('Ỉ', 'Ỉ'),
+ ('Ị', 'Ị'),
+ ('Ọ', 'Ọ'),
+ ('Ỏ', 'Ỏ'),
+ ('Ố', 'Ố'),
+ ('Ồ', 'Ồ'),
+ ('Ổ', 'Ổ'),
+ ('Ỗ', 'Ỗ'),
+ ('Ộ', 'Ộ'),
+ ('Ớ', 'Ớ'),
+ ('Ờ', 'Ờ'),
+ ('Ở', 'Ở'),
+ ('Ỡ', 'Ỡ'),
+ ('Ợ', 'Ợ'),
+ ('Ụ', 'Ụ'),
+ ('Ủ', 'Ủ'),
+ ('Ứ', 'Ứ'),
+ ('Ừ', 'Ừ'),
+ ('Ử', 'Ử'),
+ ('Ữ', 'Ữ'),
+ ('Ự', 'Ự'),
+ ('Ỳ', 'Ỳ'),
+ ('Ỵ', 'Ỵ'),
+ ('Ỷ', 'Ỷ'),
+ ('Ỹ', 'Ỹ'),
+ ('Ỻ', 'Ỻ'),
+ ('Ỽ', 'Ỽ'),
+ ('Ỿ', 'Ỿ'),
+ ('Ἀ', 'Ἇ'),
+ ('Ἐ', 'Ἕ'),
+ ('Ἠ', 'Ἧ'),
+ ('Ἰ', 'Ἷ'),
+ ('Ὀ', 'Ὅ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'Ὗ'),
+ ('Ὠ', 'Ὧ'),
+ ('ᾀ', 'ᾯ'),
+ ('ᾲ', 'ᾴ'),
+ ('ᾷ', 'ᾼ'),
+ ('ῂ', 'ῄ'),
+ ('ῇ', 'ῌ'),
+ ('Ῐ', 'Ί'),
+ ('Ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῷ', 'ῼ'),
+ ('Ω', 'Ω'),
+ ('K', 'Å'),
+ ('Ⅎ', 'Ⅎ'),
+ ('Ⅰ', 'Ⅿ'),
+ ('Ↄ', 'Ↄ'),
+ ('Ⓐ', 'Ⓩ'),
+ ('Ⰰ', 'Ⱟ'),
+ ('Ⱡ', 'Ⱡ'),
+ ('Ɫ', 'Ɽ'),
+ ('Ⱨ', 'Ⱨ'),
+ ('Ⱪ', 'Ⱪ'),
+ ('Ⱬ', 'Ⱬ'),
+ ('Ɑ', 'Ɒ'),
+ ('Ⱳ', 'Ⱳ'),
+ ('Ⱶ', 'Ⱶ'),
+ ('Ȿ', 'Ⲁ'),
+ ('Ⲃ', 'Ⲃ'),
+ ('Ⲅ', 'Ⲅ'),
+ ('Ⲇ', 'Ⲇ'),
+ ('Ⲉ', 'Ⲉ'),
+ ('Ⲋ', 'Ⲋ'),
+ ('Ⲍ', 'Ⲍ'),
+ ('Ⲏ', 'Ⲏ'),
+ ('Ⲑ', 'Ⲑ'),
+ ('Ⲓ', 'Ⲓ'),
+ ('Ⲕ', 'Ⲕ'),
+ ('Ⲗ', 'Ⲗ'),
+ ('Ⲙ', 'Ⲙ'),
+ ('Ⲛ', 'Ⲛ'),
+ ('Ⲝ', 'Ⲝ'),
+ ('Ⲟ', 'Ⲟ'),
+ ('Ⲡ', 'Ⲡ'),
+ ('Ⲣ', 'Ⲣ'),
+ ('Ⲥ', 'Ⲥ'),
+ ('Ⲧ', 'Ⲧ'),
+ ('Ⲩ', 'Ⲩ'),
+ ('Ⲫ', 'Ⲫ'),
+ ('Ⲭ', 'Ⲭ'),
+ ('Ⲯ', 'Ⲯ'),
+ ('Ⲱ', 'Ⲱ'),
+ ('Ⲳ', 'Ⲳ'),
+ ('Ⲵ', 'Ⲵ'),
+ ('Ⲷ', 'Ⲷ'),
+ ('Ⲹ', 'Ⲹ'),
+ ('Ⲻ', 'Ⲻ'),
+ ('Ⲽ', 'Ⲽ'),
+ ('Ⲿ', 'Ⲿ'),
+ ('Ⳁ', 'Ⳁ'),
+ ('Ⳃ', 'Ⳃ'),
+ ('Ⳅ', 'Ⳅ'),
+ ('Ⳇ', 'Ⳇ'),
+ ('Ⳉ', 'Ⳉ'),
+ ('Ⳋ', 'Ⳋ'),
+ ('Ⳍ', 'Ⳍ'),
+ ('Ⳏ', 'Ⳏ'),
+ ('Ⳑ', 'Ⳑ'),
+ ('Ⳓ', 'Ⳓ'),
+ ('Ⳕ', 'Ⳕ'),
+ ('Ⳗ', 'Ⳗ'),
+ ('Ⳙ', 'Ⳙ'),
+ ('Ⳛ', 'Ⳛ'),
+ ('Ⳝ', 'Ⳝ'),
+ ('Ⳟ', 'Ⳟ'),
+ ('Ⳡ', 'Ⳡ'),
+ ('Ⳣ', 'Ⳣ'),
+ ('Ⳬ', 'Ⳬ'),
+ ('Ⳮ', 'Ⳮ'),
+ ('Ⳳ', 'Ⳳ'),
+ ('Ꙁ', 'Ꙁ'),
+ ('Ꙃ', 'Ꙃ'),
+ ('Ꙅ', 'Ꙅ'),
+ ('Ꙇ', 'Ꙇ'),
+ ('Ꙉ', 'Ꙉ'),
+ ('Ꙋ', 'Ꙋ'),
+ ('Ꙍ', 'Ꙍ'),
+ ('Ꙏ', 'Ꙏ'),
+ ('Ꙑ', 'Ꙑ'),
+ ('Ꙓ', 'Ꙓ'),
+ ('Ꙕ', 'Ꙕ'),
+ ('Ꙗ', 'Ꙗ'),
+ ('Ꙙ', 'Ꙙ'),
+ ('Ꙛ', 'Ꙛ'),
+ ('Ꙝ', 'Ꙝ'),
+ ('Ꙟ', 'Ꙟ'),
+ ('Ꙡ', 'Ꙡ'),
+ ('Ꙣ', 'Ꙣ'),
+ ('Ꙥ', 'Ꙥ'),
+ ('Ꙧ', 'Ꙧ'),
+ ('Ꙩ', 'Ꙩ'),
+ ('Ꙫ', 'Ꙫ'),
+ ('Ꙭ', 'Ꙭ'),
+ ('Ꚁ', 'Ꚁ'),
+ ('Ꚃ', 'Ꚃ'),
+ ('Ꚅ', 'Ꚅ'),
+ ('Ꚇ', 'Ꚇ'),
+ ('Ꚉ', 'Ꚉ'),
+ ('Ꚋ', 'Ꚋ'),
+ ('Ꚍ', 'Ꚍ'),
+ ('Ꚏ', 'Ꚏ'),
+ ('Ꚑ', 'Ꚑ'),
+ ('Ꚓ', 'Ꚓ'),
+ ('Ꚕ', 'Ꚕ'),
+ ('Ꚗ', 'Ꚗ'),
+ ('Ꚙ', 'Ꚙ'),
+ ('Ꚛ', 'Ꚛ'),
+ ('Ꜣ', 'Ꜣ'),
+ ('Ꜥ', 'Ꜥ'),
+ ('Ꜧ', 'Ꜧ'),
+ ('Ꜩ', 'Ꜩ'),
+ ('Ꜫ', 'Ꜫ'),
+ ('Ꜭ', 'Ꜭ'),
+ ('Ꜯ', 'Ꜯ'),
+ ('Ꜳ', 'Ꜳ'),
+ ('Ꜵ', 'Ꜵ'),
+ ('Ꜷ', 'Ꜷ'),
+ ('Ꜹ', 'Ꜹ'),
+ ('Ꜻ', 'Ꜻ'),
+ ('Ꜽ', 'Ꜽ'),
+ ('Ꜿ', 'Ꜿ'),
+ ('Ꝁ', 'Ꝁ'),
+ ('Ꝃ', 'Ꝃ'),
+ ('Ꝅ', 'Ꝅ'),
+ ('Ꝇ', 'Ꝇ'),
+ ('Ꝉ', 'Ꝉ'),
+ ('Ꝋ', 'Ꝋ'),
+ ('Ꝍ', 'Ꝍ'),
+ ('Ꝏ', 'Ꝏ'),
+ ('Ꝑ', 'Ꝑ'),
+ ('Ꝓ', 'Ꝓ'),
+ ('Ꝕ', 'Ꝕ'),
+ ('Ꝗ', 'Ꝗ'),
+ ('Ꝙ', 'Ꝙ'),
+ ('Ꝛ', 'Ꝛ'),
+ ('Ꝝ', 'Ꝝ'),
+ ('Ꝟ', 'Ꝟ'),
+ ('Ꝡ', 'Ꝡ'),
+ ('Ꝣ', 'Ꝣ'),
+ ('Ꝥ', 'Ꝥ'),
+ ('Ꝧ', 'Ꝧ'),
+ ('Ꝩ', 'Ꝩ'),
+ ('Ꝫ', 'Ꝫ'),
+ ('Ꝭ', 'Ꝭ'),
+ ('Ꝯ', 'Ꝯ'),
+ ('Ꝺ', 'Ꝺ'),
+ ('Ꝼ', 'Ꝼ'),
+ ('Ᵹ', 'Ꝿ'),
+ ('Ꞁ', 'Ꞁ'),
+ ('Ꞃ', 'Ꞃ'),
+ ('Ꞅ', 'Ꞅ'),
+ ('Ꞇ', 'Ꞇ'),
+ ('Ꞌ', 'Ꞌ'),
+ ('Ɥ', 'Ɥ'),
+ ('Ꞑ', 'Ꞑ'),
+ ('Ꞓ', 'Ꞓ'),
+ ('Ꞗ', 'Ꞗ'),
+ ('Ꞙ', 'Ꞙ'),
+ ('Ꞛ', 'Ꞛ'),
+ ('Ꞝ', 'Ꞝ'),
+ ('Ꞟ', 'Ꞟ'),
+ ('Ꞡ', 'Ꞡ'),
+ ('Ꞣ', 'Ꞣ'),
+ ('Ꞥ', 'Ꞥ'),
+ ('Ꞧ', 'Ꞧ'),
+ ('Ꞩ', 'Ꞩ'),
+ ('Ɦ', 'Ɪ'),
+ ('Ʞ', 'Ꞵ'),
+ ('Ꞷ', 'Ꞷ'),
+ ('Ꞹ', 'Ꞹ'),
+ ('Ꞻ', 'Ꞻ'),
+ ('Ꞽ', 'Ꞽ'),
+ ('Ꞿ', 'Ꞿ'),
+ ('Ꟁ', 'Ꟁ'),
+ ('Ꟃ', 'Ꟃ'),
+ ('Ꞔ', 'Ꟈ'),
+ ('Ꟊ', 'Ꟊ'),
+ ('Ꟑ', 'Ꟑ'),
+ ('Ꟗ', 'Ꟗ'),
+ ('Ꟙ', 'Ꟙ'),
+ ('Ꟶ', 'Ꟶ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('A', 'Z'),
+ ('𐐀', '𐐧'),
+ ('𐒰', '𐓓'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐲀', '𐲲'),
+ ('𑢠', '𑢿'),
+ ('𖹀', '𖹟'),
+ ('𞤀', '𞤡'),
+];
+
+pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('µ', 'µ'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ķ'),
+ ('Ĺ', 'ƌ'),
+ ('Ǝ', 'ƚ'),
+ ('Ɯ', 'Ʃ'),
+ ('Ƭ', 'ƹ'),
+ ('Ƽ', 'ƽ'),
+ ('ƿ', 'ƿ'),
+ ('DŽ', 'Ƞ'),
+ ('Ȣ', 'ȳ'),
+ ('Ⱥ', 'ɔ'),
+ ('ɖ', 'ɗ'),
+ ('ə', 'ə'),
+ ('ɛ', 'ɜ'),
+ ('ɠ', 'ɡ'),
+ ('ɣ', 'ɣ'),
+ ('ɥ', 'ɦ'),
+ ('ɨ', 'ɬ'),
+ ('ɯ', 'ɯ'),
+ ('ɱ', 'ɲ'),
+ ('ɵ', 'ɵ'),
+ ('ɽ', 'ɽ'),
+ ('ʀ', 'ʀ'),
+ ('ʂ', 'ʃ'),
+ ('ʇ', 'ʌ'),
+ ('ʒ', 'ʒ'),
+ ('ʝ', 'ʞ'),
+ ('\u{345}', '\u{345}'),
+ ('Ͱ', 'ͳ'),
+ ('Ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϑ'),
+ ('ϕ', 'ϵ'),
+ ('Ϸ', 'ϻ'),
+ ('Ͻ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ա', 'և'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჽ', 'ჿ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᵹ', 'ᵹ'),
+ ('ᵽ', 'ᵽ'),
+ ('ᶎ', 'ᶎ'),
+ ('Ḁ', 'ẛ'),
+ ('ẞ', 'ẞ'),
+ ('Ạ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('Ω', 'Ω'),
+ ('K', 'Å'),
+ ('Ⅎ', 'Ⅎ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ⅿ'),
+ ('Ↄ', 'ↄ'),
+ ('Ⓐ', 'ⓩ'),
+ ('Ⰰ', 'Ɒ'),
+ ('Ⱳ', 'ⱳ'),
+ ('Ⱶ', 'ⱶ'),
+ ('Ȿ', 'ⳣ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('Ꙁ', 'ꙭ'),
+ ('Ꚁ', 'ꚛ'),
+ ('Ꜣ', 'ꜯ'),
+ ('Ꜳ', 'ꝯ'),
+ ('Ꝺ', 'ꞇ'),
+ ('Ꞌ', 'Ɥ'),
+ ('Ꞑ', 'ꞔ'),
+ ('Ꞗ', 'Ɪ'),
+ ('Ʞ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('Ꟗ', 'ꟙ'),
+ ('Ꟶ', 'ꟶ'),
+ ('ꭓ', 'ꭓ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('𐐀', '𐑏'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𑢠', '𑣟'),
+ ('𖹀', '𖹿'),
+ ('𞤀', '𞥃'),
+];
+
+pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('À', 'Ö'),
+ ('Ø', 'Þ'),
+ ('Ā', 'Ā'),
+ ('Ă', 'Ă'),
+ ('Ą', 'Ą'),
+ ('Ć', 'Ć'),
+ ('Ĉ', 'Ĉ'),
+ ('Ċ', 'Ċ'),
+ ('Č', 'Č'),
+ ('Ď', 'Ď'),
+ ('Đ', 'Đ'),
+ ('Ē', 'Ē'),
+ ('Ĕ', 'Ĕ'),
+ ('Ė', 'Ė'),
+ ('Ę', 'Ę'),
+ ('Ě', 'Ě'),
+ ('Ĝ', 'Ĝ'),
+ ('Ğ', 'Ğ'),
+ ('Ġ', 'Ġ'),
+ ('Ģ', 'Ģ'),
+ ('Ĥ', 'Ĥ'),
+ ('Ħ', 'Ħ'),
+ ('Ĩ', 'Ĩ'),
+ ('Ī', 'Ī'),
+ ('Ĭ', 'Ĭ'),
+ ('Į', 'Į'),
+ ('İ', 'İ'),
+ ('IJ', 'IJ'),
+ ('Ĵ', 'Ĵ'),
+ ('Ķ', 'Ķ'),
+ ('Ĺ', 'Ĺ'),
+ ('Ļ', 'Ļ'),
+ ('Ľ', 'Ľ'),
+ ('Ŀ', 'Ŀ'),
+ ('Ł', 'Ł'),
+ ('Ń', 'Ń'),
+ ('Ņ', 'Ņ'),
+ ('Ň', 'Ň'),
+ ('Ŋ', 'Ŋ'),
+ ('Ō', 'Ō'),
+ ('Ŏ', 'Ŏ'),
+ ('Ő', 'Ő'),
+ ('Œ', 'Œ'),
+ ('Ŕ', 'Ŕ'),
+ ('Ŗ', 'Ŗ'),
+ ('Ř', 'Ř'),
+ ('Ś', 'Ś'),
+ ('Ŝ', 'Ŝ'),
+ ('Ş', 'Ş'),
+ ('Š', 'Š'),
+ ('Ţ', 'Ţ'),
+ ('Ť', 'Ť'),
+ ('Ŧ', 'Ŧ'),
+ ('Ũ', 'Ũ'),
+ ('Ū', 'Ū'),
+ ('Ŭ', 'Ŭ'),
+ ('Ů', 'Ů'),
+ ('Ű', 'Ű'),
+ ('Ų', 'Ų'),
+ ('Ŵ', 'Ŵ'),
+ ('Ŷ', 'Ŷ'),
+ ('Ÿ', 'Ź'),
+ ('Ż', 'Ż'),
+ ('Ž', 'Ž'),
+ ('Ɓ', 'Ƃ'),
+ ('Ƅ', 'Ƅ'),
+ ('Ɔ', 'Ƈ'),
+ ('Ɖ', 'Ƌ'),
+ ('Ǝ', 'Ƒ'),
+ ('Ɠ', 'Ɣ'),
+ ('Ɩ', 'Ƙ'),
+ ('Ɯ', 'Ɲ'),
+ ('Ɵ', 'Ơ'),
+ ('Ƣ', 'Ƣ'),
+ ('Ƥ', 'Ƥ'),
+ ('Ʀ', 'Ƨ'),
+ ('Ʃ', 'Ʃ'),
+ ('Ƭ', 'Ƭ'),
+ ('Ʈ', 'Ư'),
+ ('Ʊ', 'Ƴ'),
+ ('Ƶ', 'Ƶ'),
+ ('Ʒ', 'Ƹ'),
+ ('Ƽ', 'Ƽ'),
+ ('DŽ', 'Dž'),
+ ('LJ', 'Lj'),
+ ('NJ', 'Nj'),
+ ('Ǎ', 'Ǎ'),
+ ('Ǐ', 'Ǐ'),
+ ('Ǒ', 'Ǒ'),
+ ('Ǔ', 'Ǔ'),
+ ('Ǖ', 'Ǖ'),
+ ('Ǘ', 'Ǘ'),
+ ('Ǚ', 'Ǚ'),
+ ('Ǜ', 'Ǜ'),
+ ('Ǟ', 'Ǟ'),
+ ('Ǡ', 'Ǡ'),
+ ('Ǣ', 'Ǣ'),
+ ('Ǥ', 'Ǥ'),
+ ('Ǧ', 'Ǧ'),
+ ('Ǩ', 'Ǩ'),
+ ('Ǫ', 'Ǫ'),
+ ('Ǭ', 'Ǭ'),
+ ('Ǯ', 'Ǯ'),
+ ('DZ', 'Dz'),
+ ('Ǵ', 'Ǵ'),
+ ('Ƕ', 'Ǹ'),
+ ('Ǻ', 'Ǻ'),
+ ('Ǽ', 'Ǽ'),
+ ('Ǿ', 'Ǿ'),
+ ('Ȁ', 'Ȁ'),
+ ('Ȃ', 'Ȃ'),
+ ('Ȅ', 'Ȅ'),
+ ('Ȇ', 'Ȇ'),
+ ('Ȉ', 'Ȉ'),
+ ('Ȋ', 'Ȋ'),
+ ('Ȍ', 'Ȍ'),
+ ('Ȏ', 'Ȏ'),
+ ('Ȑ', 'Ȑ'),
+ ('Ȓ', 'Ȓ'),
+ ('Ȕ', 'Ȕ'),
+ ('Ȗ', 'Ȗ'),
+ ('Ș', 'Ș'),
+ ('Ț', 'Ț'),
+ ('Ȝ', 'Ȝ'),
+ ('Ȟ', 'Ȟ'),
+ ('Ƞ', 'Ƞ'),
+ ('Ȣ', 'Ȣ'),
+ ('Ȥ', 'Ȥ'),
+ ('Ȧ', 'Ȧ'),
+ ('Ȩ', 'Ȩ'),
+ ('Ȫ', 'Ȫ'),
+ ('Ȭ', 'Ȭ'),
+ ('Ȯ', 'Ȯ'),
+ ('Ȱ', 'Ȱ'),
+ ('Ȳ', 'Ȳ'),
+ ('Ⱥ', 'Ȼ'),
+ ('Ƚ', 'Ⱦ'),
+ ('Ɂ', 'Ɂ'),
+ ('Ƀ', 'Ɇ'),
+ ('Ɉ', 'Ɉ'),
+ ('Ɋ', 'Ɋ'),
+ ('Ɍ', 'Ɍ'),
+ ('Ɏ', 'Ɏ'),
+ ('Ͱ', 'Ͱ'),
+ ('Ͳ', 'Ͳ'),
+ ('Ͷ', 'Ͷ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ώ'),
+ ('Α', 'Ρ'),
+ ('Σ', 'Ϋ'),
+ ('Ϗ', 'Ϗ'),
+ ('Ϙ', 'Ϙ'),
+ ('Ϛ', 'Ϛ'),
+ ('Ϝ', 'Ϝ'),
+ ('Ϟ', 'Ϟ'),
+ ('Ϡ', 'Ϡ'),
+ ('Ϣ', 'Ϣ'),
+ ('Ϥ', 'Ϥ'),
+ ('Ϧ', 'Ϧ'),
+ ('Ϩ', 'Ϩ'),
+ ('Ϫ', 'Ϫ'),
+ ('Ϭ', 'Ϭ'),
+ ('Ϯ', 'Ϯ'),
+ ('ϴ', 'ϴ'),
+ ('Ϸ', 'Ϸ'),
+ ('Ϲ', 'Ϻ'),
+ ('Ͻ', 'Я'),
+ ('Ѡ', 'Ѡ'),
+ ('Ѣ', 'Ѣ'),
+ ('Ѥ', 'Ѥ'),
+ ('Ѧ', 'Ѧ'),
+ ('Ѩ', 'Ѩ'),
+ ('Ѫ', 'Ѫ'),
+ ('Ѭ', 'Ѭ'),
+ ('Ѯ', 'Ѯ'),
+ ('Ѱ', 'Ѱ'),
+ ('Ѳ', 'Ѳ'),
+ ('Ѵ', 'Ѵ'),
+ ('Ѷ', 'Ѷ'),
+ ('Ѹ', 'Ѹ'),
+ ('Ѻ', 'Ѻ'),
+ ('Ѽ', 'Ѽ'),
+ ('Ѿ', 'Ѿ'),
+ ('Ҁ', 'Ҁ'),
+ ('Ҋ', 'Ҋ'),
+ ('Ҍ', 'Ҍ'),
+ ('Ҏ', 'Ҏ'),
+ ('Ґ', 'Ґ'),
+ ('Ғ', 'Ғ'),
+ ('Ҕ', 'Ҕ'),
+ ('Җ', 'Җ'),
+ ('Ҙ', 'Ҙ'),
+ ('Қ', 'Қ'),
+ ('Ҝ', 'Ҝ'),
+ ('Ҟ', 'Ҟ'),
+ ('Ҡ', 'Ҡ'),
+ ('Ң', 'Ң'),
+ ('Ҥ', 'Ҥ'),
+ ('Ҧ', 'Ҧ'),
+ ('Ҩ', 'Ҩ'),
+ ('Ҫ', 'Ҫ'),
+ ('Ҭ', 'Ҭ'),
+ ('Ү', 'Ү'),
+ ('Ұ', 'Ұ'),
+ ('Ҳ', 'Ҳ'),
+ ('Ҵ', 'Ҵ'),
+ ('Ҷ', 'Ҷ'),
+ ('Ҹ', 'Ҹ'),
+ ('Һ', 'Һ'),
+ ('Ҽ', 'Ҽ'),
+ ('Ҿ', 'Ҿ'),
+ ('Ӏ', 'Ӂ'),
+ ('Ӄ', 'Ӄ'),
+ ('Ӆ', 'Ӆ'),
+ ('Ӈ', 'Ӈ'),
+ ('Ӊ', 'Ӊ'),
+ ('Ӌ', 'Ӌ'),
+ ('Ӎ', 'Ӎ'),
+ ('Ӑ', 'Ӑ'),
+ ('Ӓ', 'Ӓ'),
+ ('Ӕ', 'Ӕ'),
+ ('Ӗ', 'Ӗ'),
+ ('Ә', 'Ә'),
+ ('Ӛ', 'Ӛ'),
+ ('Ӝ', 'Ӝ'),
+ ('Ӟ', 'Ӟ'),
+ ('Ӡ', 'Ӡ'),
+ ('Ӣ', 'Ӣ'),
+ ('Ӥ', 'Ӥ'),
+ ('Ӧ', 'Ӧ'),
+ ('Ө', 'Ө'),
+ ('Ӫ', 'Ӫ'),
+ ('Ӭ', 'Ӭ'),
+ ('Ӯ', 'Ӯ'),
+ ('Ӱ', 'Ӱ'),
+ ('Ӳ', 'Ӳ'),
+ ('Ӵ', 'Ӵ'),
+ ('Ӷ', 'Ӷ'),
+ ('Ӹ', 'Ӹ'),
+ ('Ӻ', 'Ӻ'),
+ ('Ӽ', 'Ӽ'),
+ ('Ӿ', 'Ӿ'),
+ ('Ԁ', 'Ԁ'),
+ ('Ԃ', 'Ԃ'),
+ ('Ԅ', 'Ԅ'),
+ ('Ԇ', 'Ԇ'),
+ ('Ԉ', 'Ԉ'),
+ ('Ԋ', 'Ԋ'),
+ ('Ԍ', 'Ԍ'),
+ ('Ԏ', 'Ԏ'),
+ ('Ԑ', 'Ԑ'),
+ ('Ԓ', 'Ԓ'),
+ ('Ԕ', 'Ԕ'),
+ ('Ԗ', 'Ԗ'),
+ ('Ԙ', 'Ԙ'),
+ ('Ԛ', 'Ԛ'),
+ ('Ԝ', 'Ԝ'),
+ ('Ԟ', 'Ԟ'),
+ ('Ԡ', 'Ԡ'),
+ ('Ԣ', 'Ԣ'),
+ ('Ԥ', 'Ԥ'),
+ ('Ԧ', 'Ԧ'),
+ ('Ԩ', 'Ԩ'),
+ ('Ԫ', 'Ԫ'),
+ ('Ԭ', 'Ԭ'),
+ ('Ԯ', 'Ԯ'),
+ ('Ա', 'Ֆ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('Ḁ', 'Ḁ'),
+ ('Ḃ', 'Ḃ'),
+ ('Ḅ', 'Ḅ'),
+ ('Ḇ', 'Ḇ'),
+ ('Ḉ', 'Ḉ'),
+ ('Ḋ', 'Ḋ'),
+ ('Ḍ', 'Ḍ'),
+ ('Ḏ', 'Ḏ'),
+ ('Ḑ', 'Ḑ'),
+ ('Ḓ', 'Ḓ'),
+ ('Ḕ', 'Ḕ'),
+ ('Ḗ', 'Ḗ'),
+ ('Ḙ', 'Ḙ'),
+ ('Ḛ', 'Ḛ'),
+ ('Ḝ', 'Ḝ'),
+ ('Ḟ', 'Ḟ'),
+ ('Ḡ', 'Ḡ'),
+ ('Ḣ', 'Ḣ'),
+ ('Ḥ', 'Ḥ'),
+ ('Ḧ', 'Ḧ'),
+ ('Ḩ', 'Ḩ'),
+ ('Ḫ', 'Ḫ'),
+ ('Ḭ', 'Ḭ'),
+ ('Ḯ', 'Ḯ'),
+ ('Ḱ', 'Ḱ'),
+ ('Ḳ', 'Ḳ'),
+ ('Ḵ', 'Ḵ'),
+ ('Ḷ', 'Ḷ'),
+ ('Ḹ', 'Ḹ'),
+ ('Ḻ', 'Ḻ'),
+ ('Ḽ', 'Ḽ'),
+ ('Ḿ', 'Ḿ'),
+ ('Ṁ', 'Ṁ'),
+ ('Ṃ', 'Ṃ'),
+ ('Ṅ', 'Ṅ'),
+ ('Ṇ', 'Ṇ'),
+ ('Ṉ', 'Ṉ'),
+ ('Ṋ', 'Ṋ'),
+ ('Ṍ', 'Ṍ'),
+ ('Ṏ', 'Ṏ'),
+ ('Ṑ', 'Ṑ'),
+ ('Ṓ', 'Ṓ'),
+ ('Ṕ', 'Ṕ'),
+ ('Ṗ', 'Ṗ'),
+ ('Ṙ', 'Ṙ'),
+ ('Ṛ', 'Ṛ'),
+ ('Ṝ', 'Ṝ'),
+ ('Ṟ', 'Ṟ'),
+ ('Ṡ', 'Ṡ'),
+ ('Ṣ', 'Ṣ'),
+ ('Ṥ', 'Ṥ'),
+ ('Ṧ', 'Ṧ'),
+ ('Ṩ', 'Ṩ'),
+ ('Ṫ', 'Ṫ'),
+ ('Ṭ', 'Ṭ'),
+ ('Ṯ', 'Ṯ'),
+ ('Ṱ', 'Ṱ'),
+ ('Ṳ', 'Ṳ'),
+ ('Ṵ', 'Ṵ'),
+ ('Ṷ', 'Ṷ'),
+ ('Ṹ', 'Ṹ'),
+ ('Ṻ', 'Ṻ'),
+ ('Ṽ', 'Ṽ'),
+ ('Ṿ', 'Ṿ'),
+ ('Ẁ', 'Ẁ'),
+ ('Ẃ', 'Ẃ'),
+ ('Ẅ', 'Ẅ'),
+ ('Ẇ', 'Ẇ'),
+ ('Ẉ', 'Ẉ'),
+ ('Ẋ', 'Ẋ'),
+ ('Ẍ', 'Ẍ'),
+ ('Ẏ', 'Ẏ'),
+ ('Ẑ', 'Ẑ'),
+ ('Ẓ', 'Ẓ'),
+ ('Ẕ', 'Ẕ'),
+ ('ẞ', 'ẞ'),
+ ('Ạ', 'Ạ'),
+ ('Ả', 'Ả'),
+ ('Ấ', 'Ấ'),
+ ('Ầ', 'Ầ'),
+ ('Ẩ', 'Ẩ'),
+ ('Ẫ', 'Ẫ'),
+ ('Ậ', 'Ậ'),
+ ('Ắ', 'Ắ'),
+ ('Ằ', 'Ằ'),
+ ('Ẳ', 'Ẳ'),
+ ('Ẵ', 'Ẵ'),
+ ('Ặ', 'Ặ'),
+ ('Ẹ', 'Ẹ'),
+ ('Ẻ', 'Ẻ'),
+ ('Ẽ', 'Ẽ'),
+ ('Ế', 'Ế'),
+ ('Ề', 'Ề'),
+ ('Ể', 'Ể'),
+ ('Ễ', 'Ễ'),
+ ('Ệ', 'Ệ'),
+ ('Ỉ', 'Ỉ'),
+ ('Ị', 'Ị'),
+ ('Ọ', 'Ọ'),
+ ('Ỏ', 'Ỏ'),
+ ('Ố', 'Ố'),
+ ('Ồ', 'Ồ'),
+ ('Ổ', 'Ổ'),
+ ('Ỗ', 'Ỗ'),
+ ('Ộ', 'Ộ'),
+ ('Ớ', 'Ớ'),
+ ('Ờ', 'Ờ'),
+ ('Ở', 'Ở'),
+ ('Ỡ', 'Ỡ'),
+ ('Ợ', 'Ợ'),
+ ('Ụ', 'Ụ'),
+ ('Ủ', 'Ủ'),
+ ('Ứ', 'Ứ'),
+ ('Ừ', 'Ừ'),
+ ('Ử', 'Ử'),
+ ('Ữ', 'Ữ'),
+ ('Ự', 'Ự'),
+ ('Ỳ', 'Ỳ'),
+ ('Ỵ', 'Ỵ'),
+ ('Ỷ', 'Ỷ'),
+ ('Ỹ', 'Ỹ'),
+ ('Ỻ', 'Ỻ'),
+ ('Ỽ', 'Ỽ'),
+ ('Ỿ', 'Ỿ'),
+ ('Ἀ', 'Ἇ'),
+ ('Ἐ', 'Ἕ'),
+ ('Ἠ', 'Ἧ'),
+ ('Ἰ', 'Ἷ'),
+ ('Ὀ', 'Ὅ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'Ὗ'),
+ ('Ὠ', 'Ὧ'),
+ ('ᾈ', 'ᾏ'),
+ ('ᾘ', 'ᾟ'),
+ ('ᾨ', 'ᾯ'),
+ ('Ᾰ', 'ᾼ'),
+ ('Ὲ', 'ῌ'),
+ ('Ῐ', 'Ί'),
+ ('Ῠ', 'Ῥ'),
+ ('Ὸ', 'ῼ'),
+ ('Ω', 'Ω'),
+ ('K', 'Å'),
+ ('Ⅎ', 'Ⅎ'),
+ ('Ⅰ', 'Ⅿ'),
+ ('Ↄ', 'Ↄ'),
+ ('Ⓐ', 'Ⓩ'),
+ ('Ⰰ', 'Ⱟ'),
+ ('Ⱡ', 'Ⱡ'),
+ ('Ɫ', 'Ɽ'),
+ ('Ⱨ', 'Ⱨ'),
+ ('Ⱪ', 'Ⱪ'),
+ ('Ⱬ', 'Ⱬ'),
+ ('Ɑ', 'Ɒ'),
+ ('Ⱳ', 'Ⱳ'),
+ ('Ⱶ', 'Ⱶ'),
+ ('Ȿ', 'Ⲁ'),
+ ('Ⲃ', 'Ⲃ'),
+ ('Ⲅ', 'Ⲅ'),
+ ('Ⲇ', 'Ⲇ'),
+ ('Ⲉ', 'Ⲉ'),
+ ('Ⲋ', 'Ⲋ'),
+ ('Ⲍ', 'Ⲍ'),
+ ('Ⲏ', 'Ⲏ'),
+ ('Ⲑ', 'Ⲑ'),
+ ('Ⲓ', 'Ⲓ'),
+ ('Ⲕ', 'Ⲕ'),
+ ('Ⲗ', 'Ⲗ'),
+ ('Ⲙ', 'Ⲙ'),
+ ('Ⲛ', 'Ⲛ'),
+ ('Ⲝ', 'Ⲝ'),
+ ('Ⲟ', 'Ⲟ'),
+ ('Ⲡ', 'Ⲡ'),
+ ('Ⲣ', 'Ⲣ'),
+ ('Ⲥ', 'Ⲥ'),
+ ('Ⲧ', 'Ⲧ'),
+ ('Ⲩ', 'Ⲩ'),
+ ('Ⲫ', 'Ⲫ'),
+ ('Ⲭ', 'Ⲭ'),
+ ('Ⲯ', 'Ⲯ'),
+ ('Ⲱ', 'Ⲱ'),
+ ('Ⲳ', 'Ⲳ'),
+ ('Ⲵ', 'Ⲵ'),
+ ('Ⲷ', 'Ⲷ'),
+ ('Ⲹ', 'Ⲹ'),
+ ('Ⲻ', 'Ⲻ'),
+ ('Ⲽ', 'Ⲽ'),
+ ('Ⲿ', 'Ⲿ'),
+ ('Ⳁ', 'Ⳁ'),
+ ('Ⳃ', 'Ⳃ'),
+ ('Ⳅ', 'Ⳅ'),
+ ('Ⳇ', 'Ⳇ'),
+ ('Ⳉ', 'Ⳉ'),
+ ('Ⳋ', 'Ⳋ'),
+ ('Ⳍ', 'Ⳍ'),
+ ('Ⳏ', 'Ⳏ'),
+ ('Ⳑ', 'Ⳑ'),
+ ('Ⳓ', 'Ⳓ'),
+ ('Ⳕ', 'Ⳕ'),
+ ('Ⳗ', 'Ⳗ'),
+ ('Ⳙ', 'Ⳙ'),
+ ('Ⳛ', 'Ⳛ'),
+ ('Ⳝ', 'Ⳝ'),
+ ('Ⳟ', 'Ⳟ'),
+ ('Ⳡ', 'Ⳡ'),
+ ('Ⳣ', 'Ⳣ'),
+ ('Ⳬ', 'Ⳬ'),
+ ('Ⳮ', 'Ⳮ'),
+ ('Ⳳ', 'Ⳳ'),
+ ('Ꙁ', 'Ꙁ'),
+ ('Ꙃ', 'Ꙃ'),
+ ('Ꙅ', 'Ꙅ'),
+ ('Ꙇ', 'Ꙇ'),
+ ('Ꙉ', 'Ꙉ'),
+ ('Ꙋ', 'Ꙋ'),
+ ('Ꙍ', 'Ꙍ'),
+ ('Ꙏ', 'Ꙏ'),
+ ('Ꙑ', 'Ꙑ'),
+ ('Ꙓ', 'Ꙓ'),
+ ('Ꙕ', 'Ꙕ'),
+ ('Ꙗ', 'Ꙗ'),
+ ('Ꙙ', 'Ꙙ'),
+ ('Ꙛ', 'Ꙛ'),
+ ('Ꙝ', 'Ꙝ'),
+ ('Ꙟ', 'Ꙟ'),
+ ('Ꙡ', 'Ꙡ'),
+ ('Ꙣ', 'Ꙣ'),
+ ('Ꙥ', 'Ꙥ'),
+ ('Ꙧ', 'Ꙧ'),
+ ('Ꙩ', 'Ꙩ'),
+ ('Ꙫ', 'Ꙫ'),
+ ('Ꙭ', 'Ꙭ'),
+ ('Ꚁ', 'Ꚁ'),
+ ('Ꚃ', 'Ꚃ'),
+ ('Ꚅ', 'Ꚅ'),
+ ('Ꚇ', 'Ꚇ'),
+ ('Ꚉ', 'Ꚉ'),
+ ('Ꚋ', 'Ꚋ'),
+ ('Ꚍ', 'Ꚍ'),
+ ('Ꚏ', 'Ꚏ'),
+ ('Ꚑ', 'Ꚑ'),
+ ('Ꚓ', 'Ꚓ'),
+ ('Ꚕ', 'Ꚕ'),
+ ('Ꚗ', 'Ꚗ'),
+ ('Ꚙ', 'Ꚙ'),
+ ('Ꚛ', 'Ꚛ'),
+ ('Ꜣ', 'Ꜣ'),
+ ('Ꜥ', 'Ꜥ'),
+ ('Ꜧ', 'Ꜧ'),
+ ('Ꜩ', 'Ꜩ'),
+ ('Ꜫ', 'Ꜫ'),
+ ('Ꜭ', 'Ꜭ'),
+ ('Ꜯ', 'Ꜯ'),
+ ('Ꜳ', 'Ꜳ'),
+ ('Ꜵ', 'Ꜵ'),
+ ('Ꜷ', 'Ꜷ'),
+ ('Ꜹ', 'Ꜹ'),
+ ('Ꜻ', 'Ꜻ'),
+ ('Ꜽ', 'Ꜽ'),
+ ('Ꜿ', 'Ꜿ'),
+ ('Ꝁ', 'Ꝁ'),
+ ('Ꝃ', 'Ꝃ'),
+ ('Ꝅ', 'Ꝅ'),
+ ('Ꝇ', 'Ꝇ'),
+ ('Ꝉ', 'Ꝉ'),
+ ('Ꝋ', 'Ꝋ'),
+ ('Ꝍ', 'Ꝍ'),
+ ('Ꝏ', 'Ꝏ'),
+ ('Ꝑ', 'Ꝑ'),
+ ('Ꝓ', 'Ꝓ'),
+ ('Ꝕ', 'Ꝕ'),
+ ('Ꝗ', 'Ꝗ'),
+ ('Ꝙ', 'Ꝙ'),
+ ('Ꝛ', 'Ꝛ'),
+ ('Ꝝ', 'Ꝝ'),
+ ('Ꝟ', 'Ꝟ'),
+ ('Ꝡ', 'Ꝡ'),
+ ('Ꝣ', 'Ꝣ'),
+ ('Ꝥ', 'Ꝥ'),
+ ('Ꝧ', 'Ꝧ'),
+ ('Ꝩ', 'Ꝩ'),
+ ('Ꝫ', 'Ꝫ'),
+ ('Ꝭ', 'Ꝭ'),
+ ('Ꝯ', 'Ꝯ'),
+ ('Ꝺ', 'Ꝺ'),
+ ('Ꝼ', 'Ꝼ'),
+ ('Ᵹ', 'Ꝿ'),
+ ('Ꞁ', 'Ꞁ'),
+ ('Ꞃ', 'Ꞃ'),
+ ('Ꞅ', 'Ꞅ'),
+ ('Ꞇ', 'Ꞇ'),
+ ('Ꞌ', 'Ꞌ'),
+ ('Ɥ', 'Ɥ'),
+ ('Ꞑ', 'Ꞑ'),
+ ('Ꞓ', 'Ꞓ'),
+ ('Ꞗ', 'Ꞗ'),
+ ('Ꞙ', 'Ꞙ'),
+ ('Ꞛ', 'Ꞛ'),
+ ('Ꞝ', 'Ꞝ'),
+ ('Ꞟ', 'Ꞟ'),
+ ('Ꞡ', 'Ꞡ'),
+ ('Ꞣ', 'Ꞣ'),
+ ('Ꞥ', 'Ꞥ'),
+ ('Ꞧ', 'Ꞧ'),
+ ('Ꞩ', 'Ꞩ'),
+ ('Ɦ', 'Ɪ'),
+ ('Ʞ', 'Ꞵ'),
+ ('Ꞷ', 'Ꞷ'),
+ ('Ꞹ', 'Ꞹ'),
+ ('Ꞻ', 'Ꞻ'),
+ ('Ꞽ', 'Ꞽ'),
+ ('Ꞿ', 'Ꞿ'),
+ ('Ꟁ', 'Ꟁ'),
+ ('Ꟃ', 'Ꟃ'),
+ ('Ꞔ', 'Ꟈ'),
+ ('Ꟊ', 'Ꟊ'),
+ ('Ꟑ', 'Ꟑ'),
+ ('Ꟗ', 'Ꟗ'),
+ ('Ꟙ', 'Ꟙ'),
+ ('Ꟶ', 'Ꟶ'),
+ ('A', 'Z'),
+ ('𐐀', '𐐧'),
+ ('𐒰', '𐓓'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐲀', '𐲲'),
+ ('𑢠', '𑢿'),
+ ('𖹀', '𖹟'),
+ ('𞤀', '𞤡'),
+];
+
+pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[
+ ('a', 'z'),
+ ('µ', 'µ'),
+ ('ß', 'ö'),
+ ('ø', 'ÿ'),
+ ('ā', 'ā'),
+ ('ă', 'ă'),
+ ('ą', 'ą'),
+ ('ć', 'ć'),
+ ('ĉ', 'ĉ'),
+ ('ċ', 'ċ'),
+ ('č', 'č'),
+ ('ď', 'ď'),
+ ('đ', 'đ'),
+ ('ē', 'ē'),
+ ('ĕ', 'ĕ'),
+ ('ė', 'ė'),
+ ('ę', 'ę'),
+ ('ě', 'ě'),
+ ('ĝ', 'ĝ'),
+ ('ğ', 'ğ'),
+ ('ġ', 'ġ'),
+ ('ģ', 'ģ'),
+ ('ĥ', 'ĥ'),
+ ('ħ', 'ħ'),
+ ('ĩ', 'ĩ'),
+ ('ī', 'ī'),
+ ('ĭ', 'ĭ'),
+ ('į', 'į'),
+ ('ı', 'ı'),
+ ('ij', 'ij'),
+ ('ĵ', 'ĵ'),
+ ('ķ', 'ķ'),
+ ('ĺ', 'ĺ'),
+ ('ļ', 'ļ'),
+ ('ľ', 'ľ'),
+ ('ŀ', 'ŀ'),
+ ('ł', 'ł'),
+ ('ń', 'ń'),
+ ('ņ', 'ņ'),
+ ('ň', 'ʼn'),
+ ('ŋ', 'ŋ'),
+ ('ō', 'ō'),
+ ('ŏ', 'ŏ'),
+ ('ő', 'ő'),
+ ('œ', 'œ'),
+ ('ŕ', 'ŕ'),
+ ('ŗ', 'ŗ'),
+ ('ř', 'ř'),
+ ('ś', 'ś'),
+ ('ŝ', 'ŝ'),
+ ('ş', 'ş'),
+ ('š', 'š'),
+ ('ţ', 'ţ'),
+ ('ť', 'ť'),
+ ('ŧ', 'ŧ'),
+ ('ũ', 'ũ'),
+ ('ū', 'ū'),
+ ('ŭ', 'ŭ'),
+ ('ů', 'ů'),
+ ('ű', 'ű'),
+ ('ų', 'ų'),
+ ('ŵ', 'ŵ'),
+ ('ŷ', 'ŷ'),
+ ('ź', 'ź'),
+ ('ż', 'ż'),
+ ('ž', 'ƀ'),
+ ('ƃ', 'ƃ'),
+ ('ƅ', 'ƅ'),
+ ('ƈ', 'ƈ'),
+ ('ƌ', 'ƌ'),
+ ('ƒ', 'ƒ'),
+ ('ƕ', 'ƕ'),
+ ('ƙ', 'ƚ'),
+ ('ƞ', 'ƞ'),
+ ('ơ', 'ơ'),
+ ('ƣ', 'ƣ'),
+ ('ƥ', 'ƥ'),
+ ('ƨ', 'ƨ'),
+ ('ƭ', 'ƭ'),
+ ('ư', 'ư'),
+ ('ƴ', 'ƴ'),
+ ('ƶ', 'ƶ'),
+ ('ƹ', 'ƹ'),
+ ('ƽ', 'ƽ'),
+ ('ƿ', 'ƿ'),
+ ('DŽ', 'DŽ'),
+ ('dž', 'LJ'),
+ ('lj', 'NJ'),
+ ('nj', 'nj'),
+ ('ǎ', 'ǎ'),
+ ('ǐ', 'ǐ'),
+ ('ǒ', 'ǒ'),
+ ('ǔ', 'ǔ'),
+ ('ǖ', 'ǖ'),
+ ('ǘ', 'ǘ'),
+ ('ǚ', 'ǚ'),
+ ('ǜ', 'ǝ'),
+ ('ǟ', 'ǟ'),
+ ('ǡ', 'ǡ'),
+ ('ǣ', 'ǣ'),
+ ('ǥ', 'ǥ'),
+ ('ǧ', 'ǧ'),
+ ('ǩ', 'ǩ'),
+ ('ǫ', 'ǫ'),
+ ('ǭ', 'ǭ'),
+ ('ǯ', 'DZ'),
+ ('dz', 'dz'),
+ ('ǵ', 'ǵ'),
+ ('ǹ', 'ǹ'),
+ ('ǻ', 'ǻ'),
+ ('ǽ', 'ǽ'),
+ ('ǿ', 'ǿ'),
+ ('ȁ', 'ȁ'),
+ ('ȃ', 'ȃ'),
+ ('ȅ', 'ȅ'),
+ ('ȇ', 'ȇ'),
+ ('ȉ', 'ȉ'),
+ ('ȋ', 'ȋ'),
+ ('ȍ', 'ȍ'),
+ ('ȏ', 'ȏ'),
+ ('ȑ', 'ȑ'),
+ ('ȓ', 'ȓ'),
+ ('ȕ', 'ȕ'),
+ ('ȗ', 'ȗ'),
+ ('ș', 'ș'),
+ ('ț', 'ț'),
+ ('ȝ', 'ȝ'),
+ ('ȟ', 'ȟ'),
+ ('ȣ', 'ȣ'),
+ ('ȥ', 'ȥ'),
+ ('ȧ', 'ȧ'),
+ ('ȩ', 'ȩ'),
+ ('ȫ', 'ȫ'),
+ ('ȭ', 'ȭ'),
+ ('ȯ', 'ȯ'),
+ ('ȱ', 'ȱ'),
+ ('ȳ', 'ȳ'),
+ ('ȼ', 'ȼ'),
+ ('ȿ', 'ɀ'),
+ ('ɂ', 'ɂ'),
+ ('ɇ', 'ɇ'),
+ ('ɉ', 'ɉ'),
+ ('ɋ', 'ɋ'),
+ ('ɍ', 'ɍ'),
+ ('ɏ', 'ɔ'),
+ ('ɖ', 'ɗ'),
+ ('ə', 'ə'),
+ ('ɛ', 'ɜ'),
+ ('ɠ', 'ɡ'),
+ ('ɣ', 'ɣ'),
+ ('ɥ', 'ɦ'),
+ ('ɨ', 'ɬ'),
+ ('ɯ', 'ɯ'),
+ ('ɱ', 'ɲ'),
+ ('ɵ', 'ɵ'),
+ ('ɽ', 'ɽ'),
+ ('ʀ', 'ʀ'),
+ ('ʂ', 'ʃ'),
+ ('ʇ', 'ʌ'),
+ ('ʒ', 'ʒ'),
+ ('ʝ', 'ʞ'),
+ ('\u{345}', '\u{345}'),
+ ('ͱ', 'ͱ'),
+ ('ͳ', 'ͳ'),
+ ('ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('ΐ', 'ΐ'),
+ ('ά', 'ώ'),
+ ('ϐ', 'ϑ'),
+ ('ϕ', 'ϗ'),
+ ('ϙ', 'ϙ'),
+ ('ϛ', 'ϛ'),
+ ('ϝ', 'ϝ'),
+ ('ϟ', 'ϟ'),
+ ('ϡ', 'ϡ'),
+ ('ϣ', 'ϣ'),
+ ('ϥ', 'ϥ'),
+ ('ϧ', 'ϧ'),
+ ('ϩ', 'ϩ'),
+ ('ϫ', 'ϫ'),
+ ('ϭ', 'ϭ'),
+ ('ϯ', 'ϳ'),
+ ('ϵ', 'ϵ'),
+ ('ϸ', 'ϸ'),
+ ('ϻ', 'ϻ'),
+ ('а', 'џ'),
+ ('ѡ', 'ѡ'),
+ ('ѣ', 'ѣ'),
+ ('ѥ', 'ѥ'),
+ ('ѧ', 'ѧ'),
+ ('ѩ', 'ѩ'),
+ ('ѫ', 'ѫ'),
+ ('ѭ', 'ѭ'),
+ ('ѯ', 'ѯ'),
+ ('ѱ', 'ѱ'),
+ ('ѳ', 'ѳ'),
+ ('ѵ', 'ѵ'),
+ ('ѷ', 'ѷ'),
+ ('ѹ', 'ѹ'),
+ ('ѻ', 'ѻ'),
+ ('ѽ', 'ѽ'),
+ ('ѿ', 'ѿ'),
+ ('ҁ', 'ҁ'),
+ ('ҋ', 'ҋ'),
+ ('ҍ', 'ҍ'),
+ ('ҏ', 'ҏ'),
+ ('ґ', 'ґ'),
+ ('ғ', 'ғ'),
+ ('ҕ', 'ҕ'),
+ ('җ', 'җ'),
+ ('ҙ', 'ҙ'),
+ ('қ', 'қ'),
+ ('ҝ', 'ҝ'),
+ ('ҟ', 'ҟ'),
+ ('ҡ', 'ҡ'),
+ ('ң', 'ң'),
+ ('ҥ', 'ҥ'),
+ ('ҧ', 'ҧ'),
+ ('ҩ', 'ҩ'),
+ ('ҫ', 'ҫ'),
+ ('ҭ', 'ҭ'),
+ ('ү', 'ү'),
+ ('ұ', 'ұ'),
+ ('ҳ', 'ҳ'),
+ ('ҵ', 'ҵ'),
+ ('ҷ', 'ҷ'),
+ ('ҹ', 'ҹ'),
+ ('һ', 'һ'),
+ ('ҽ', 'ҽ'),
+ ('ҿ', 'ҿ'),
+ ('ӂ', 'ӂ'),
+ ('ӄ', 'ӄ'),
+ ('ӆ', 'ӆ'),
+ ('ӈ', 'ӈ'),
+ ('ӊ', 'ӊ'),
+ ('ӌ', 'ӌ'),
+ ('ӎ', 'ӏ'),
+ ('ӑ', 'ӑ'),
+ ('ӓ', 'ӓ'),
+ ('ӕ', 'ӕ'),
+ ('ӗ', 'ӗ'),
+ ('ә', 'ә'),
+ ('ӛ', 'ӛ'),
+ ('ӝ', 'ӝ'),
+ ('ӟ', 'ӟ'),
+ ('ӡ', 'ӡ'),
+ ('ӣ', 'ӣ'),
+ ('ӥ', 'ӥ'),
+ ('ӧ', 'ӧ'),
+ ('ө', 'ө'),
+ ('ӫ', 'ӫ'),
+ ('ӭ', 'ӭ'),
+ ('ӯ', 'ӯ'),
+ ('ӱ', 'ӱ'),
+ ('ӳ', 'ӳ'),
+ ('ӵ', 'ӵ'),
+ ('ӷ', 'ӷ'),
+ ('ӹ', 'ӹ'),
+ ('ӻ', 'ӻ'),
+ ('ӽ', 'ӽ'),
+ ('ӿ', 'ӿ'),
+ ('ԁ', 'ԁ'),
+ ('ԃ', 'ԃ'),
+ ('ԅ', 'ԅ'),
+ ('ԇ', 'ԇ'),
+ ('ԉ', 'ԉ'),
+ ('ԋ', 'ԋ'),
+ ('ԍ', 'ԍ'),
+ ('ԏ', 'ԏ'),
+ ('ԑ', 'ԑ'),
+ ('ԓ', 'ԓ'),
+ ('ԕ', 'ԕ'),
+ ('ԗ', 'ԗ'),
+ ('ԙ', 'ԙ'),
+ ('ԛ', 'ԛ'),
+ ('ԝ', 'ԝ'),
+ ('ԟ', 'ԟ'),
+ ('ԡ', 'ԡ'),
+ ('ԣ', 'ԣ'),
+ ('ԥ', 'ԥ'),
+ ('ԧ', 'ԧ'),
+ ('ԩ', 'ԩ'),
+ ('ԫ', 'ԫ'),
+ ('ԭ', 'ԭ'),
+ ('ԯ', 'ԯ'),
+ ('ա', 'և'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᵹ', 'ᵹ'),
+ ('ᵽ', 'ᵽ'),
+ ('ᶎ', 'ᶎ'),
+ ('ḁ', 'ḁ'),
+ ('ḃ', 'ḃ'),
+ ('ḅ', 'ḅ'),
+ ('ḇ', 'ḇ'),
+ ('ḉ', 'ḉ'),
+ ('ḋ', 'ḋ'),
+ ('ḍ', 'ḍ'),
+ ('ḏ', 'ḏ'),
+ ('ḑ', 'ḑ'),
+ ('ḓ', 'ḓ'),
+ ('ḕ', 'ḕ'),
+ ('ḗ', 'ḗ'),
+ ('ḙ', 'ḙ'),
+ ('ḛ', 'ḛ'),
+ ('ḝ', 'ḝ'),
+ ('ḟ', 'ḟ'),
+ ('ḡ', 'ḡ'),
+ ('ḣ', 'ḣ'),
+ ('ḥ', 'ḥ'),
+ ('ḧ', 'ḧ'),
+ ('ḩ', 'ḩ'),
+ ('ḫ', 'ḫ'),
+ ('ḭ', 'ḭ'),
+ ('ḯ', 'ḯ'),
+ ('ḱ', 'ḱ'),
+ ('ḳ', 'ḳ'),
+ ('ḵ', 'ḵ'),
+ ('ḷ', 'ḷ'),
+ ('ḹ', 'ḹ'),
+ ('ḻ', 'ḻ'),
+ ('ḽ', 'ḽ'),
+ ('ḿ', 'ḿ'),
+ ('ṁ', 'ṁ'),
+ ('ṃ', 'ṃ'),
+ ('ṅ', 'ṅ'),
+ ('ṇ', 'ṇ'),
+ ('ṉ', 'ṉ'),
+ ('ṋ', 'ṋ'),
+ ('ṍ', 'ṍ'),
+ ('ṏ', 'ṏ'),
+ ('ṑ', 'ṑ'),
+ ('ṓ', 'ṓ'),
+ ('ṕ', 'ṕ'),
+ ('ṗ', 'ṗ'),
+ ('ṙ', 'ṙ'),
+ ('ṛ', 'ṛ'),
+ ('ṝ', 'ṝ'),
+ ('ṟ', 'ṟ'),
+ ('ṡ', 'ṡ'),
+ ('ṣ', 'ṣ'),
+ ('ṥ', 'ṥ'),
+ ('ṧ', 'ṧ'),
+ ('ṩ', 'ṩ'),
+ ('ṫ', 'ṫ'),
+ ('ṭ', 'ṭ'),
+ ('ṯ', 'ṯ'),
+ ('ṱ', 'ṱ'),
+ ('ṳ', 'ṳ'),
+ ('ṵ', 'ṵ'),
+ ('ṷ', 'ṷ'),
+ ('ṹ', 'ṹ'),
+ ('ṻ', 'ṻ'),
+ ('ṽ', 'ṽ'),
+ ('ṿ', 'ṿ'),
+ ('ẁ', 'ẁ'),
+ ('ẃ', 'ẃ'),
+ ('ẅ', 'ẅ'),
+ ('ẇ', 'ẇ'),
+ ('ẉ', 'ẉ'),
+ ('ẋ', 'ẋ'),
+ ('ẍ', 'ẍ'),
+ ('ẏ', 'ẏ'),
+ ('ẑ', 'ẑ'),
+ ('ẓ', 'ẓ'),
+ ('ẕ', 'ẛ'),
+ ('ạ', 'ạ'),
+ ('ả', 'ả'),
+ ('ấ', 'ấ'),
+ ('ầ', 'ầ'),
+ ('ẩ', 'ẩ'),
+ ('ẫ', 'ẫ'),
+ ('ậ', 'ậ'),
+ ('ắ', 'ắ'),
+ ('ằ', 'ằ'),
+ ('ẳ', 'ẳ'),
+ ('ẵ', 'ẵ'),
+ ('ặ', 'ặ'),
+ ('ẹ', 'ẹ'),
+ ('ẻ', 'ẻ'),
+ ('ẽ', 'ẽ'),
+ ('ế', 'ế'),
+ ('ề', 'ề'),
+ ('ể', 'ể'),
+ ('ễ', 'ễ'),
+ ('ệ', 'ệ'),
+ ('ỉ', 'ỉ'),
+ ('ị', 'ị'),
+ ('ọ', 'ọ'),
+ ('ỏ', 'ỏ'),
+ ('ố', 'ố'),
+ ('ồ', 'ồ'),
+ ('ổ', 'ổ'),
+ ('ỗ', 'ỗ'),
+ ('ộ', 'ộ'),
+ ('ớ', 'ớ'),
+ ('ờ', 'ờ'),
+ ('ở', 'ở'),
+ ('ỡ', 'ỡ'),
+ ('ợ', 'ợ'),
+ ('ụ', 'ụ'),
+ ('ủ', 'ủ'),
+ ('ứ', 'ứ'),
+ ('ừ', 'ừ'),
+ ('ử', 'ử'),
+ ('ữ', 'ữ'),
+ ('ự', 'ự'),
+ ('ỳ', 'ỳ'),
+ ('ỵ', 'ỵ'),
+ ('ỷ', 'ỷ'),
+ ('ỹ', 'ỹ'),
+ ('ỻ', 'ỻ'),
+ ('ỽ', 'ỽ'),
+ ('ỿ', 'ἇ'),
+ ('ἐ', 'ἕ'),
+ ('ἠ', 'ἧ'),
+ ('ἰ', 'ἷ'),
+ ('ὀ', 'ὅ'),
+ ('ὐ', 'ὗ'),
+ ('ὠ', 'ὧ'),
+ ('ὰ', 'ώ'),
+ ('ᾀ', 'ᾇ'),
+ ('ᾐ', 'ᾗ'),
+ ('ᾠ', 'ᾧ'),
+ ('ᾰ', 'ᾴ'),
+ ('ᾶ', 'ᾷ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῇ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'ῗ'),
+ ('ῠ', 'ῧ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῷ'),
+ ('ⅎ', 'ⅎ'),
+ ('ⅰ', 'ⅿ'),
+ ('ↄ', 'ↄ'),
+ ('ⓐ', 'ⓩ'),
+ ('ⰰ', 'ⱟ'),
+ ('ⱡ', 'ⱡ'),
+ ('ⱥ', 'ⱦ'),
+ ('ⱨ', 'ⱨ'),
+ ('ⱪ', 'ⱪ'),
+ ('ⱬ', 'ⱬ'),
+ ('ⱳ', 'ⱳ'),
+ ('ⱶ', 'ⱶ'),
+ ('ⲁ', 'ⲁ'),
+ ('ⲃ', 'ⲃ'),
+ ('ⲅ', 'ⲅ'),
+ ('ⲇ', 'ⲇ'),
+ ('ⲉ', 'ⲉ'),
+ ('ⲋ', 'ⲋ'),
+ ('ⲍ', 'ⲍ'),
+ ('ⲏ', 'ⲏ'),
+ ('ⲑ', 'ⲑ'),
+ ('ⲓ', 'ⲓ'),
+ ('ⲕ', 'ⲕ'),
+ ('ⲗ', 'ⲗ'),
+ ('ⲙ', 'ⲙ'),
+ ('ⲛ', 'ⲛ'),
+ ('ⲝ', 'ⲝ'),
+ ('ⲟ', 'ⲟ'),
+ ('ⲡ', 'ⲡ'),
+ ('ⲣ', 'ⲣ'),
+ ('ⲥ', 'ⲥ'),
+ ('ⲧ', 'ⲧ'),
+ ('ⲩ', 'ⲩ'),
+ ('ⲫ', 'ⲫ'),
+ ('ⲭ', 'ⲭ'),
+ ('ⲯ', 'ⲯ'),
+ ('ⲱ', 'ⲱ'),
+ ('ⲳ', 'ⲳ'),
+ ('ⲵ', 'ⲵ'),
+ ('ⲷ', 'ⲷ'),
+ ('ⲹ', 'ⲹ'),
+ ('ⲻ', 'ⲻ'),
+ ('ⲽ', 'ⲽ'),
+ ('ⲿ', 'ⲿ'),
+ ('ⳁ', 'ⳁ'),
+ ('ⳃ', 'ⳃ'),
+ ('ⳅ', 'ⳅ'),
+ ('ⳇ', 'ⳇ'),
+ ('ⳉ', 'ⳉ'),
+ ('ⳋ', 'ⳋ'),
+ ('ⳍ', 'ⳍ'),
+ ('ⳏ', 'ⳏ'),
+ ('ⳑ', 'ⳑ'),
+ ('ⳓ', 'ⳓ'),
+ ('ⳕ', 'ⳕ'),
+ ('ⳗ', 'ⳗ'),
+ ('ⳙ', 'ⳙ'),
+ ('ⳛ', 'ⳛ'),
+ ('ⳝ', 'ⳝ'),
+ ('ⳟ', 'ⳟ'),
+ ('ⳡ', 'ⳡ'),
+ ('ⳣ', 'ⳣ'),
+ ('ⳬ', 'ⳬ'),
+ ('ⳮ', 'ⳮ'),
+ ('ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ꙁ', 'ꙁ'),
+ ('ꙃ', 'ꙃ'),
+ ('ꙅ', 'ꙅ'),
+ ('ꙇ', 'ꙇ'),
+ ('ꙉ', 'ꙉ'),
+ ('ꙋ', 'ꙋ'),
+ ('ꙍ', 'ꙍ'),
+ ('ꙏ', 'ꙏ'),
+ ('ꙑ', 'ꙑ'),
+ ('ꙓ', 'ꙓ'),
+ ('ꙕ', 'ꙕ'),
+ ('ꙗ', 'ꙗ'),
+ ('ꙙ', 'ꙙ'),
+ ('ꙛ', 'ꙛ'),
+ ('ꙝ', 'ꙝ'),
+ ('ꙟ', 'ꙟ'),
+ ('ꙡ', 'ꙡ'),
+ ('ꙣ', 'ꙣ'),
+ ('ꙥ', 'ꙥ'),
+ ('ꙧ', 'ꙧ'),
+ ('ꙩ', 'ꙩ'),
+ ('ꙫ', 'ꙫ'),
+ ('ꙭ', 'ꙭ'),
+ ('ꚁ', 'ꚁ'),
+ ('ꚃ', 'ꚃ'),
+ ('ꚅ', 'ꚅ'),
+ ('ꚇ', 'ꚇ'),
+ ('ꚉ', 'ꚉ'),
+ ('ꚋ', 'ꚋ'),
+ ('ꚍ', 'ꚍ'),
+ ('ꚏ', 'ꚏ'),
+ ('ꚑ', 'ꚑ'),
+ ('ꚓ', 'ꚓ'),
+ ('ꚕ', 'ꚕ'),
+ ('ꚗ', 'ꚗ'),
+ ('ꚙ', 'ꚙ'),
+ ('ꚛ', 'ꚛ'),
+ ('ꜣ', 'ꜣ'),
+ ('ꜥ', 'ꜥ'),
+ ('ꜧ', 'ꜧ'),
+ ('ꜩ', 'ꜩ'),
+ ('ꜫ', 'ꜫ'),
+ ('ꜭ', 'ꜭ'),
+ ('ꜯ', 'ꜯ'),
+ ('ꜳ', 'ꜳ'),
+ ('ꜵ', 'ꜵ'),
+ ('ꜷ', 'ꜷ'),
+ ('ꜹ', 'ꜹ'),
+ ('ꜻ', 'ꜻ'),
+ ('ꜽ', 'ꜽ'),
+ ('ꜿ', 'ꜿ'),
+ ('ꝁ', 'ꝁ'),
+ ('ꝃ', 'ꝃ'),
+ ('ꝅ', 'ꝅ'),
+ ('ꝇ', 'ꝇ'),
+ ('ꝉ', 'ꝉ'),
+ ('ꝋ', 'ꝋ'),
+ ('ꝍ', 'ꝍ'),
+ ('ꝏ', 'ꝏ'),
+ ('ꝑ', 'ꝑ'),
+ ('ꝓ', 'ꝓ'),
+ ('ꝕ', 'ꝕ'),
+ ('ꝗ', 'ꝗ'),
+ ('ꝙ', 'ꝙ'),
+ ('ꝛ', 'ꝛ'),
+ ('ꝝ', 'ꝝ'),
+ ('ꝟ', 'ꝟ'),
+ ('ꝡ', 'ꝡ'),
+ ('ꝣ', 'ꝣ'),
+ ('ꝥ', 'ꝥ'),
+ ('ꝧ', 'ꝧ'),
+ ('ꝩ', 'ꝩ'),
+ ('ꝫ', 'ꝫ'),
+ ('ꝭ', 'ꝭ'),
+ ('ꝯ', 'ꝯ'),
+ ('ꝺ', 'ꝺ'),
+ ('ꝼ', 'ꝼ'),
+ ('ꝿ', 'ꝿ'),
+ ('ꞁ', 'ꞁ'),
+ ('ꞃ', 'ꞃ'),
+ ('ꞅ', 'ꞅ'),
+ ('ꞇ', 'ꞇ'),
+ ('ꞌ', 'ꞌ'),
+ ('ꞑ', 'ꞑ'),
+ ('ꞓ', 'ꞔ'),
+ ('ꞗ', 'ꞗ'),
+ ('ꞙ', 'ꞙ'),
+ ('ꞛ', 'ꞛ'),
+ ('ꞝ', 'ꞝ'),
+ ('ꞟ', 'ꞟ'),
+ ('ꞡ', 'ꞡ'),
+ ('ꞣ', 'ꞣ'),
+ ('ꞥ', 'ꞥ'),
+ ('ꞧ', 'ꞧ'),
+ ('ꞩ', 'ꞩ'),
+ ('ꞵ', 'ꞵ'),
+ ('ꞷ', 'ꞷ'),
+ ('ꞹ', 'ꞹ'),
+ ('ꞻ', 'ꞻ'),
+ ('ꞽ', 'ꞽ'),
+ ('ꞿ', 'ꞿ'),
+ ('ꟁ', 'ꟁ'),
+ ('ꟃ', 'ꟃ'),
+ ('ꟈ', 'ꟈ'),
+ ('ꟊ', 'ꟊ'),
+ ('ꟑ', 'ꟑ'),
+ ('ꟗ', 'ꟗ'),
+ ('ꟙ', 'ꟙ'),
+ ('ꟶ', 'ꟶ'),
+ ('ꭓ', 'ꭓ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('a', 'z'),
+ ('𐐨', '𐑏'),
+ ('𐓘', '𐓻'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐳀', '𐳲'),
+ ('𑣀', '𑣟'),
+ ('𖹠', '𖹿'),
+ ('𞤢', '𞥃'),
+];
+
+pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[
+ ('a', 'z'),
+ ('µ', 'µ'),
+ ('ß', 'ö'),
+ ('ø', 'ÿ'),
+ ('ā', 'ā'),
+ ('ă', 'ă'),
+ ('ą', 'ą'),
+ ('ć', 'ć'),
+ ('ĉ', 'ĉ'),
+ ('ċ', 'ċ'),
+ ('č', 'č'),
+ ('ď', 'ď'),
+ ('đ', 'đ'),
+ ('ē', 'ē'),
+ ('ĕ', 'ĕ'),
+ ('ė', 'ė'),
+ ('ę', 'ę'),
+ ('ě', 'ě'),
+ ('ĝ', 'ĝ'),
+ ('ğ', 'ğ'),
+ ('ġ', 'ġ'),
+ ('ģ', 'ģ'),
+ ('ĥ', 'ĥ'),
+ ('ħ', 'ħ'),
+ ('ĩ', 'ĩ'),
+ ('ī', 'ī'),
+ ('ĭ', 'ĭ'),
+ ('į', 'į'),
+ ('ı', 'ı'),
+ ('ij', 'ij'),
+ ('ĵ', 'ĵ'),
+ ('ķ', 'ķ'),
+ ('ĺ', 'ĺ'),
+ ('ļ', 'ļ'),
+ ('ľ', 'ľ'),
+ ('ŀ', 'ŀ'),
+ ('ł', 'ł'),
+ ('ń', 'ń'),
+ ('ņ', 'ņ'),
+ ('ň', 'ʼn'),
+ ('ŋ', 'ŋ'),
+ ('ō', 'ō'),
+ ('ŏ', 'ŏ'),
+ ('ő', 'ő'),
+ ('œ', 'œ'),
+ ('ŕ', 'ŕ'),
+ ('ŗ', 'ŗ'),
+ ('ř', 'ř'),
+ ('ś', 'ś'),
+ ('ŝ', 'ŝ'),
+ ('ş', 'ş'),
+ ('š', 'š'),
+ ('ţ', 'ţ'),
+ ('ť', 'ť'),
+ ('ŧ', 'ŧ'),
+ ('ũ', 'ũ'),
+ ('ū', 'ū'),
+ ('ŭ', 'ŭ'),
+ ('ů', 'ů'),
+ ('ű', 'ű'),
+ ('ų', 'ų'),
+ ('ŵ', 'ŵ'),
+ ('ŷ', 'ŷ'),
+ ('ź', 'ź'),
+ ('ż', 'ż'),
+ ('ž', 'ƀ'),
+ ('ƃ', 'ƃ'),
+ ('ƅ', 'ƅ'),
+ ('ƈ', 'ƈ'),
+ ('ƌ', 'ƌ'),
+ ('ƒ', 'ƒ'),
+ ('ƕ', 'ƕ'),
+ ('ƙ', 'ƚ'),
+ ('ƞ', 'ƞ'),
+ ('ơ', 'ơ'),
+ ('ƣ', 'ƣ'),
+ ('ƥ', 'ƥ'),
+ ('ƨ', 'ƨ'),
+ ('ƭ', 'ƭ'),
+ ('ư', 'ư'),
+ ('ƴ', 'ƴ'),
+ ('ƶ', 'ƶ'),
+ ('ƹ', 'ƹ'),
+ ('ƽ', 'ƽ'),
+ ('ƿ', 'ƿ'),
+ ('Dž', 'dž'),
+ ('Lj', 'lj'),
+ ('Nj', 'nj'),
+ ('ǎ', 'ǎ'),
+ ('ǐ', 'ǐ'),
+ ('ǒ', 'ǒ'),
+ ('ǔ', 'ǔ'),
+ ('ǖ', 'ǖ'),
+ ('ǘ', 'ǘ'),
+ ('ǚ', 'ǚ'),
+ ('ǜ', 'ǝ'),
+ ('ǟ', 'ǟ'),
+ ('ǡ', 'ǡ'),
+ ('ǣ', 'ǣ'),
+ ('ǥ', 'ǥ'),
+ ('ǧ', 'ǧ'),
+ ('ǩ', 'ǩ'),
+ ('ǫ', 'ǫ'),
+ ('ǭ', 'ǭ'),
+ ('ǯ', 'ǰ'),
+ ('Dz', 'dz'),
+ ('ǵ', 'ǵ'),
+ ('ǹ', 'ǹ'),
+ ('ǻ', 'ǻ'),
+ ('ǽ', 'ǽ'),
+ ('ǿ', 'ǿ'),
+ ('ȁ', 'ȁ'),
+ ('ȃ', 'ȃ'),
+ ('ȅ', 'ȅ'),
+ ('ȇ', 'ȇ'),
+ ('ȉ', 'ȉ'),
+ ('ȋ', 'ȋ'),
+ ('ȍ', 'ȍ'),
+ ('ȏ', 'ȏ'),
+ ('ȑ', 'ȑ'),
+ ('ȓ', 'ȓ'),
+ ('ȕ', 'ȕ'),
+ ('ȗ', 'ȗ'),
+ ('ș', 'ș'),
+ ('ț', 'ț'),
+ ('ȝ', 'ȝ'),
+ ('ȟ', 'ȟ'),
+ ('ȣ', 'ȣ'),
+ ('ȥ', 'ȥ'),
+ ('ȧ', 'ȧ'),
+ ('ȩ', 'ȩ'),
+ ('ȫ', 'ȫ'),
+ ('ȭ', 'ȭ'),
+ ('ȯ', 'ȯ'),
+ ('ȱ', 'ȱ'),
+ ('ȳ', 'ȳ'),
+ ('ȼ', 'ȼ'),
+ ('ȿ', 'ɀ'),
+ ('ɂ', 'ɂ'),
+ ('ɇ', 'ɇ'),
+ ('ɉ', 'ɉ'),
+ ('ɋ', 'ɋ'),
+ ('ɍ', 'ɍ'),
+ ('ɏ', 'ɔ'),
+ ('ɖ', 'ɗ'),
+ ('ə', 'ə'),
+ ('ɛ', 'ɜ'),
+ ('ɠ', 'ɡ'),
+ ('ɣ', 'ɣ'),
+ ('ɥ', 'ɦ'),
+ ('ɨ', 'ɬ'),
+ ('ɯ', 'ɯ'),
+ ('ɱ', 'ɲ'),
+ ('ɵ', 'ɵ'),
+ ('ɽ', 'ɽ'),
+ ('ʀ', 'ʀ'),
+ ('ʂ', 'ʃ'),
+ ('ʇ', 'ʌ'),
+ ('ʒ', 'ʒ'),
+ ('ʝ', 'ʞ'),
+ ('\u{345}', '\u{345}'),
+ ('ͱ', 'ͱ'),
+ ('ͳ', 'ͳ'),
+ ('ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('ΐ', 'ΐ'),
+ ('ά', 'ώ'),
+ ('ϐ', 'ϑ'),
+ ('ϕ', 'ϗ'),
+ ('ϙ', 'ϙ'),
+ ('ϛ', 'ϛ'),
+ ('ϝ', 'ϝ'),
+ ('ϟ', 'ϟ'),
+ ('ϡ', 'ϡ'),
+ ('ϣ', 'ϣ'),
+ ('ϥ', 'ϥ'),
+ ('ϧ', 'ϧ'),
+ ('ϩ', 'ϩ'),
+ ('ϫ', 'ϫ'),
+ ('ϭ', 'ϭ'),
+ ('ϯ', 'ϳ'),
+ ('ϵ', 'ϵ'),
+ ('ϸ', 'ϸ'),
+ ('ϻ', 'ϻ'),
+ ('а', 'џ'),
+ ('ѡ', 'ѡ'),
+ ('ѣ', 'ѣ'),
+ ('ѥ', 'ѥ'),
+ ('ѧ', 'ѧ'),
+ ('ѩ', 'ѩ'),
+ ('ѫ', 'ѫ'),
+ ('ѭ', 'ѭ'),
+ ('ѯ', 'ѯ'),
+ ('ѱ', 'ѱ'),
+ ('ѳ', 'ѳ'),
+ ('ѵ', 'ѵ'),
+ ('ѷ', 'ѷ'),
+ ('ѹ', 'ѹ'),
+ ('ѻ', 'ѻ'),
+ ('ѽ', 'ѽ'),
+ ('ѿ', 'ѿ'),
+ ('ҁ', 'ҁ'),
+ ('ҋ', 'ҋ'),
+ ('ҍ', 'ҍ'),
+ ('ҏ', 'ҏ'),
+ ('ґ', 'ґ'),
+ ('ғ', 'ғ'),
+ ('ҕ', 'ҕ'),
+ ('җ', 'җ'),
+ ('ҙ', 'ҙ'),
+ ('қ', 'қ'),
+ ('ҝ', 'ҝ'),
+ ('ҟ', 'ҟ'),
+ ('ҡ', 'ҡ'),
+ ('ң', 'ң'),
+ ('ҥ', 'ҥ'),
+ ('ҧ', 'ҧ'),
+ ('ҩ', 'ҩ'),
+ ('ҫ', 'ҫ'),
+ ('ҭ', 'ҭ'),
+ ('ү', 'ү'),
+ ('ұ', 'ұ'),
+ ('ҳ', 'ҳ'),
+ ('ҵ', 'ҵ'),
+ ('ҷ', 'ҷ'),
+ ('ҹ', 'ҹ'),
+ ('һ', 'һ'),
+ ('ҽ', 'ҽ'),
+ ('ҿ', 'ҿ'),
+ ('ӂ', 'ӂ'),
+ ('ӄ', 'ӄ'),
+ ('ӆ', 'ӆ'),
+ ('ӈ', 'ӈ'),
+ ('ӊ', 'ӊ'),
+ ('ӌ', 'ӌ'),
+ ('ӎ', 'ӏ'),
+ ('ӑ', 'ӑ'),
+ ('ӓ', 'ӓ'),
+ ('ӕ', 'ӕ'),
+ ('ӗ', 'ӗ'),
+ ('ә', 'ә'),
+ ('ӛ', 'ӛ'),
+ ('ӝ', 'ӝ'),
+ ('ӟ', 'ӟ'),
+ ('ӡ', 'ӡ'),
+ ('ӣ', 'ӣ'),
+ ('ӥ', 'ӥ'),
+ ('ӧ', 'ӧ'),
+ ('ө', 'ө'),
+ ('ӫ', 'ӫ'),
+ ('ӭ', 'ӭ'),
+ ('ӯ', 'ӯ'),
+ ('ӱ', 'ӱ'),
+ ('ӳ', 'ӳ'),
+ ('ӵ', 'ӵ'),
+ ('ӷ', 'ӷ'),
+ ('ӹ', 'ӹ'),
+ ('ӻ', 'ӻ'),
+ ('ӽ', 'ӽ'),
+ ('ӿ', 'ӿ'),
+ ('ԁ', 'ԁ'),
+ ('ԃ', 'ԃ'),
+ ('ԅ', 'ԅ'),
+ ('ԇ', 'ԇ'),
+ ('ԉ', 'ԉ'),
+ ('ԋ', 'ԋ'),
+ ('ԍ', 'ԍ'),
+ ('ԏ', 'ԏ'),
+ ('ԑ', 'ԑ'),
+ ('ԓ', 'ԓ'),
+ ('ԕ', 'ԕ'),
+ ('ԗ', 'ԗ'),
+ ('ԙ', 'ԙ'),
+ ('ԛ', 'ԛ'),
+ ('ԝ', 'ԝ'),
+ ('ԟ', 'ԟ'),
+ ('ԡ', 'ԡ'),
+ ('ԣ', 'ԣ'),
+ ('ԥ', 'ԥ'),
+ ('ԧ', 'ԧ'),
+ ('ԩ', 'ԩ'),
+ ('ԫ', 'ԫ'),
+ ('ԭ', 'ԭ'),
+ ('ԯ', 'ԯ'),
+ ('ա', 'և'),
+ ('ა', 'ჺ'),
+ ('ჽ', 'ჿ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᵹ', 'ᵹ'),
+ ('ᵽ', 'ᵽ'),
+ ('ᶎ', 'ᶎ'),
+ ('ḁ', 'ḁ'),
+ ('ḃ', 'ḃ'),
+ ('ḅ', 'ḅ'),
+ ('ḇ', 'ḇ'),
+ ('ḉ', 'ḉ'),
+ ('ḋ', 'ḋ'),
+ ('ḍ', 'ḍ'),
+ ('ḏ', 'ḏ'),
+ ('ḑ', 'ḑ'),
+ ('ḓ', 'ḓ'),
+ ('ḕ', 'ḕ'),
+ ('ḗ', 'ḗ'),
+ ('ḙ', 'ḙ'),
+ ('ḛ', 'ḛ'),
+ ('ḝ', 'ḝ'),
+ ('ḟ', 'ḟ'),
+ ('ḡ', 'ḡ'),
+ ('ḣ', 'ḣ'),
+ ('ḥ', 'ḥ'),
+ ('ḧ', 'ḧ'),
+ ('ḩ', 'ḩ'),
+ ('ḫ', 'ḫ'),
+ ('ḭ', 'ḭ'),
+ ('ḯ', 'ḯ'),
+ ('ḱ', 'ḱ'),
+ ('ḳ', 'ḳ'),
+ ('ḵ', 'ḵ'),
+ ('ḷ', 'ḷ'),
+ ('ḹ', 'ḹ'),
+ ('ḻ', 'ḻ'),
+ ('ḽ', 'ḽ'),
+ ('ḿ', 'ḿ'),
+ ('ṁ', 'ṁ'),
+ ('ṃ', 'ṃ'),
+ ('ṅ', 'ṅ'),
+ ('ṇ', 'ṇ'),
+ ('ṉ', 'ṉ'),
+ ('ṋ', 'ṋ'),
+ ('ṍ', 'ṍ'),
+ ('ṏ', 'ṏ'),
+ ('ṑ', 'ṑ'),
+ ('ṓ', 'ṓ'),
+ ('ṕ', 'ṕ'),
+ ('ṗ', 'ṗ'),
+ ('ṙ', 'ṙ'),
+ ('ṛ', 'ṛ'),
+ ('ṝ', 'ṝ'),
+ ('ṟ', 'ṟ'),
+ ('ṡ', 'ṡ'),
+ ('ṣ', 'ṣ'),
+ ('ṥ', 'ṥ'),
+ ('ṧ', 'ṧ'),
+ ('ṩ', 'ṩ'),
+ ('ṫ', 'ṫ'),
+ ('ṭ', 'ṭ'),
+ ('ṯ', 'ṯ'),
+ ('ṱ', 'ṱ'),
+ ('ṳ', 'ṳ'),
+ ('ṵ', 'ṵ'),
+ ('ṷ', 'ṷ'),
+ ('ṹ', 'ṹ'),
+ ('ṻ', 'ṻ'),
+ ('ṽ', 'ṽ'),
+ ('ṿ', 'ṿ'),
+ ('ẁ', 'ẁ'),
+ ('ẃ', 'ẃ'),
+ ('ẅ', 'ẅ'),
+ ('ẇ', 'ẇ'),
+ ('ẉ', 'ẉ'),
+ ('ẋ', 'ẋ'),
+ ('ẍ', 'ẍ'),
+ ('ẏ', 'ẏ'),
+ ('ẑ', 'ẑ'),
+ ('ẓ', 'ẓ'),
+ ('ẕ', 'ẛ'),
+ ('ạ', 'ạ'),
+ ('ả', 'ả'),
+ ('ấ', 'ấ'),
+ ('ầ', 'ầ'),
+ ('ẩ', 'ẩ'),
+ ('ẫ', 'ẫ'),
+ ('ậ', 'ậ'),
+ ('ắ', 'ắ'),
+ ('ằ', 'ằ'),
+ ('ẳ', 'ẳ'),
+ ('ẵ', 'ẵ'),
+ ('ặ', 'ặ'),
+ ('ẹ', 'ẹ'),
+ ('ẻ', 'ẻ'),
+ ('ẽ', 'ẽ'),
+ ('ế', 'ế'),
+ ('ề', 'ề'),
+ ('ể', 'ể'),
+ ('ễ', 'ễ'),
+ ('ệ', 'ệ'),
+ ('ỉ', 'ỉ'),
+ ('ị', 'ị'),
+ ('ọ', 'ọ'),
+ ('ỏ', 'ỏ'),
+ ('ố', 'ố'),
+ ('ồ', 'ồ'),
+ ('ổ', 'ổ'),
+ ('ỗ', 'ỗ'),
+ ('ộ', 'ộ'),
+ ('ớ', 'ớ'),
+ ('ờ', 'ờ'),
+ ('ở', 'ở'),
+ ('ỡ', 'ỡ'),
+ ('ợ', 'ợ'),
+ ('ụ', 'ụ'),
+ ('ủ', 'ủ'),
+ ('ứ', 'ứ'),
+ ('ừ', 'ừ'),
+ ('ử', 'ử'),
+ ('ữ', 'ữ'),
+ ('ự', 'ự'),
+ ('ỳ', 'ỳ'),
+ ('ỵ', 'ỵ'),
+ ('ỷ', 'ỷ'),
+ ('ỹ', 'ỹ'),
+ ('ỻ', 'ỻ'),
+ ('ỽ', 'ỽ'),
+ ('ỿ', 'ἇ'),
+ ('ἐ', 'ἕ'),
+ ('ἠ', 'ἧ'),
+ ('ἰ', 'ἷ'),
+ ('ὀ', 'ὅ'),
+ ('ὐ', 'ὗ'),
+ ('ὠ', 'ὧ'),
+ ('ὰ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾷ'),
+ ('ᾼ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῇ'),
+ ('ῌ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'ῗ'),
+ ('ῠ', 'ῧ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῷ'),
+ ('ῼ', 'ῼ'),
+ ('ⅎ', 'ⅎ'),
+ ('ⅰ', 'ⅿ'),
+ ('ↄ', 'ↄ'),
+ ('ⓐ', 'ⓩ'),
+ ('ⰰ', 'ⱟ'),
+ ('ⱡ', 'ⱡ'),
+ ('ⱥ', 'ⱦ'),
+ ('ⱨ', 'ⱨ'),
+ ('ⱪ', 'ⱪ'),
+ ('ⱬ', 'ⱬ'),
+ ('ⱳ', 'ⱳ'),
+ ('ⱶ', 'ⱶ'),
+ ('ⲁ', 'ⲁ'),
+ ('ⲃ', 'ⲃ'),
+ ('ⲅ', 'ⲅ'),
+ ('ⲇ', 'ⲇ'),
+ ('ⲉ', 'ⲉ'),
+ ('ⲋ', 'ⲋ'),
+ ('ⲍ', 'ⲍ'),
+ ('ⲏ', 'ⲏ'),
+ ('ⲑ', 'ⲑ'),
+ ('ⲓ', 'ⲓ'),
+ ('ⲕ', 'ⲕ'),
+ ('ⲗ', 'ⲗ'),
+ ('ⲙ', 'ⲙ'),
+ ('ⲛ', 'ⲛ'),
+ ('ⲝ', 'ⲝ'),
+ ('ⲟ', 'ⲟ'),
+ ('ⲡ', 'ⲡ'),
+ ('ⲣ', 'ⲣ'),
+ ('ⲥ', 'ⲥ'),
+ ('ⲧ', 'ⲧ'),
+ ('ⲩ', 'ⲩ'),
+ ('ⲫ', 'ⲫ'),
+ ('ⲭ', 'ⲭ'),
+ ('ⲯ', 'ⲯ'),
+ ('ⲱ', 'ⲱ'),
+ ('ⲳ', 'ⲳ'),
+ ('ⲵ', 'ⲵ'),
+ ('ⲷ', 'ⲷ'),
+ ('ⲹ', 'ⲹ'),
+ ('ⲻ', 'ⲻ'),
+ ('ⲽ', 'ⲽ'),
+ ('ⲿ', 'ⲿ'),
+ ('ⳁ', 'ⳁ'),
+ ('ⳃ', 'ⳃ'),
+ ('ⳅ', 'ⳅ'),
+ ('ⳇ', 'ⳇ'),
+ ('ⳉ', 'ⳉ'),
+ ('ⳋ', 'ⳋ'),
+ ('ⳍ', 'ⳍ'),
+ ('ⳏ', 'ⳏ'),
+ ('ⳑ', 'ⳑ'),
+ ('ⳓ', 'ⳓ'),
+ ('ⳕ', 'ⳕ'),
+ ('ⳗ', 'ⳗ'),
+ ('ⳙ', 'ⳙ'),
+ ('ⳛ', 'ⳛ'),
+ ('ⳝ', 'ⳝ'),
+ ('ⳟ', 'ⳟ'),
+ ('ⳡ', 'ⳡ'),
+ ('ⳣ', 'ⳣ'),
+ ('ⳬ', 'ⳬ'),
+ ('ⳮ', 'ⳮ'),
+ ('ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ꙁ', 'ꙁ'),
+ ('ꙃ', 'ꙃ'),
+ ('ꙅ', 'ꙅ'),
+ ('ꙇ', 'ꙇ'),
+ ('ꙉ', 'ꙉ'),
+ ('ꙋ', 'ꙋ'),
+ ('ꙍ', 'ꙍ'),
+ ('ꙏ', 'ꙏ'),
+ ('ꙑ', 'ꙑ'),
+ ('ꙓ', 'ꙓ'),
+ ('ꙕ', 'ꙕ'),
+ ('ꙗ', 'ꙗ'),
+ ('ꙙ', 'ꙙ'),
+ ('ꙛ', 'ꙛ'),
+ ('ꙝ', 'ꙝ'),
+ ('ꙟ', 'ꙟ'),
+ ('ꙡ', 'ꙡ'),
+ ('ꙣ', 'ꙣ'),
+ ('ꙥ', 'ꙥ'),
+ ('ꙧ', 'ꙧ'),
+ ('ꙩ', 'ꙩ'),
+ ('ꙫ', 'ꙫ'),
+ ('ꙭ', 'ꙭ'),
+ ('ꚁ', 'ꚁ'),
+ ('ꚃ', 'ꚃ'),
+ ('ꚅ', 'ꚅ'),
+ ('ꚇ', 'ꚇ'),
+ ('ꚉ', 'ꚉ'),
+ ('ꚋ', 'ꚋ'),
+ ('ꚍ', 'ꚍ'),
+ ('ꚏ', 'ꚏ'),
+ ('ꚑ', 'ꚑ'),
+ ('ꚓ', 'ꚓ'),
+ ('ꚕ', 'ꚕ'),
+ ('ꚗ', 'ꚗ'),
+ ('ꚙ', 'ꚙ'),
+ ('ꚛ', 'ꚛ'),
+ ('ꜣ', 'ꜣ'),
+ ('ꜥ', 'ꜥ'),
+ ('ꜧ', 'ꜧ'),
+ ('ꜩ', 'ꜩ'),
+ ('ꜫ', 'ꜫ'),
+ ('ꜭ', 'ꜭ'),
+ ('ꜯ', 'ꜯ'),
+ ('ꜳ', 'ꜳ'),
+ ('ꜵ', 'ꜵ'),
+ ('ꜷ', 'ꜷ'),
+ ('ꜹ', 'ꜹ'),
+ ('ꜻ', 'ꜻ'),
+ ('ꜽ', 'ꜽ'),
+ ('ꜿ', 'ꜿ'),
+ ('ꝁ', 'ꝁ'),
+ ('ꝃ', 'ꝃ'),
+ ('ꝅ', 'ꝅ'),
+ ('ꝇ', 'ꝇ'),
+ ('ꝉ', 'ꝉ'),
+ ('ꝋ', 'ꝋ'),
+ ('ꝍ', 'ꝍ'),
+ ('ꝏ', 'ꝏ'),
+ ('ꝑ', 'ꝑ'),
+ ('ꝓ', 'ꝓ'),
+ ('ꝕ', 'ꝕ'),
+ ('ꝗ', 'ꝗ'),
+ ('ꝙ', 'ꝙ'),
+ ('ꝛ', 'ꝛ'),
+ ('ꝝ', 'ꝝ'),
+ ('ꝟ', 'ꝟ'),
+ ('ꝡ', 'ꝡ'),
+ ('ꝣ', 'ꝣ'),
+ ('ꝥ', 'ꝥ'),
+ ('ꝧ', 'ꝧ'),
+ ('ꝩ', 'ꝩ'),
+ ('ꝫ', 'ꝫ'),
+ ('ꝭ', 'ꝭ'),
+ ('ꝯ', 'ꝯ'),
+ ('ꝺ', 'ꝺ'),
+ ('ꝼ', 'ꝼ'),
+ ('ꝿ', 'ꝿ'),
+ ('ꞁ', 'ꞁ'),
+ ('ꞃ', 'ꞃ'),
+ ('ꞅ', 'ꞅ'),
+ ('ꞇ', 'ꞇ'),
+ ('ꞌ', 'ꞌ'),
+ ('ꞑ', 'ꞑ'),
+ ('ꞓ', 'ꞔ'),
+ ('ꞗ', 'ꞗ'),
+ ('ꞙ', 'ꞙ'),
+ ('ꞛ', 'ꞛ'),
+ ('ꞝ', 'ꞝ'),
+ ('ꞟ', 'ꞟ'),
+ ('ꞡ', 'ꞡ'),
+ ('ꞣ', 'ꞣ'),
+ ('ꞥ', 'ꞥ'),
+ ('ꞧ', 'ꞧ'),
+ ('ꞩ', 'ꞩ'),
+ ('ꞵ', 'ꞵ'),
+ ('ꞷ', 'ꞷ'),
+ ('ꞹ', 'ꞹ'),
+ ('ꞻ', 'ꞻ'),
+ ('ꞽ', 'ꞽ'),
+ ('ꞿ', 'ꞿ'),
+ ('ꟁ', 'ꟁ'),
+ ('ꟃ', 'ꟃ'),
+ ('ꟈ', 'ꟈ'),
+ ('ꟊ', 'ꟊ'),
+ ('ꟑ', 'ꟑ'),
+ ('ꟗ', 'ꟗ'),
+ ('ꟙ', 'ꟙ'),
+ ('ꟶ', 'ꟶ'),
+ ('ꭓ', 'ꭓ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('a', 'z'),
+ ('𐐨', '𐑏'),
+ ('𐓘', '𐓻'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐳀', '𐳲'),
+ ('𑣀', '𑣟'),
+ ('𖹠', '𖹿'),
+ ('𞤢', '𞥃'),
+];
+
+pub const DASH: &'static [(char, char)] = &[
+ ('-', '-'),
+ ('֊', '֊'),
+ ('־', '־'),
+ ('᐀', '᐀'),
+ ('᠆', '᠆'),
+ ('‐', '―'),
+ ('⁓', '⁓'),
+ ('⁻', '⁻'),
+ ('₋', '₋'),
+ ('−', '−'),
+ ('⸗', '⸗'),
+ ('⸚', '⸚'),
+ ('⸺', '⸻'),
+ ('⹀', '⹀'),
+ ('⹝', '⹝'),
+ ('〜', '〜'),
+ ('〰', '〰'),
+ ('゠', '゠'),
+ ('︱', '︲'),
+ ('﹘', '﹘'),
+ ('﹣', '﹣'),
+ ('-', '-'),
+ ('𐺭', '𐺭'),
+];
+
+pub const DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[
+ ('\u{ad}', '\u{ad}'),
+ ('\u{34f}', '\u{34f}'),
+ ('\u{61c}', '\u{61c}'),
+ ('ᅟ', 'ᅠ'),
+ ('\u{17b4}', '\u{17b5}'),
+ ('\u{180b}', '\u{180f}'),
+ ('\u{200b}', '\u{200f}'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2060}', '\u{206f}'),
+ ('ㅤ', 'ㅤ'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{feff}', '\u{feff}'),
+ ('ᅠ', 'ᅠ'),
+ ('\u{fff0}', '\u{fff8}'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('\u{1d173}', '\u{1d17a}'),
+ ('\u{e0000}', '\u{e0fff}'),
+];
+
+pub const DEPRECATED: &'static [(char, char)] = &[
+ ('ʼn', 'ʼn'),
+ ('ٳ', 'ٳ'),
+ ('\u{f77}', '\u{f77}'),
+ ('\u{f79}', '\u{f79}'),
+ ('ឣ', 'ឤ'),
+ ('\u{206a}', '\u{206f}'),
+ ('〈', '〉'),
+ ('\u{e0001}', '\u{e0001}'),
+];
+
+pub const DIACRITIC: &'static [(char, char)] = &[
+ ('^', '^'),
+ ('`', '`'),
+ ('¨', '¨'),
+ ('¯', '¯'),
+ ('´', '´'),
+ ('·', '¸'),
+ ('ʰ', '\u{34e}'),
+ ('\u{350}', '\u{357}'),
+ ('\u{35d}', '\u{362}'),
+ ('ʹ', '͵'),
+ ('ͺ', 'ͺ'),
+ ('΄', '΅'),
+ ('\u{483}', '\u{487}'),
+ ('ՙ', 'ՙ'),
+ ('\u{591}', '\u{5a1}'),
+ ('\u{5a3}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c4}'),
+ ('\u{64b}', '\u{652}'),
+ ('\u{657}', '\u{658}'),
+ ('\u{6df}', '\u{6e0}'),
+ ('ۥ', 'ۦ'),
+ ('\u{6ea}', '\u{6ec}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', 'ߵ'),
+ ('\u{818}', '\u{819}'),
+ ('\u{898}', '\u{89f}'),
+ ('ࣉ', '\u{8d2}'),
+ ('\u{8e3}', '\u{8fe}'),
+ ('\u{93c}', '\u{93c}'),
+ ('\u{94d}', '\u{94d}'),
+ ('\u{951}', '\u{954}'),
+ ('ॱ', 'ॱ'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9cd}', '\u{9cd}'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('\u{a4d}', '\u{a4d}'),
+ ('\u{abc}', '\u{abc}'),
+ ('\u{acd}', '\u{acd}'),
+ ('\u{afd}', '\u{aff}'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b4d}', '\u{b4d}'),
+ ('\u{b55}', '\u{b55}'),
+ ('\u{bcd}', '\u{bcd}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c4d}', '\u{c4d}'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('\u{ccd}', '\u{ccd}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d4d}', '\u{d4d}'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{e47}', '\u{e4c}'),
+ ('\u{e4e}', '\u{e4e}'),
+ ('\u{eba}', '\u{eba}'),
+ ('\u{ec8}', '\u{ecc}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', '༿'),
+ ('\u{f82}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('\u{1037}', '\u{1037}'),
+ ('\u{1039}', '\u{103a}'),
+ ('ၣ', 'ၤ'),
+ ('ၩ', 'ၭ'),
+ ('ႇ', '\u{108d}'),
+ ('ႏ', 'ႏ'),
+ ('ႚ', 'ႛ'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1714}', '᜕'),
+ ('\u{17c9}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{1939}', '\u{193b}'),
+ ('\u{1a75}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1abe}'),
+ ('\u{1ac1}', '\u{1acb}'),
+ ('\u{1b34}', '\u{1b34}'),
+ ('᭄', '᭄'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('᮪', '\u{1bab}'),
+ ('\u{1c36}', '\u{1c37}'),
+ ('ᱸ', 'ᱽ'),
+ ('\u{1cd0}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('᳷', '\u{1cf9}'),
+ ('ᴬ', 'ᵪ'),
+ ('\u{1dc4}', '\u{1dcf}'),
+ ('\u{1df5}', '\u{1dff}'),
+ ('᾽', '᾽'),
+ ('᾿', '῁'),
+ ('῍', '῏'),
+ ('῝', '῟'),
+ ('῭', '`'),
+ ('´', '῾'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('ⸯ', 'ⸯ'),
+ ('\u{302a}', '\u{302f}'),
+ ('\u{3099}', '゜'),
+ ('ー', 'ー'),
+ ('\u{a66f}', '\u{a66f}'),
+ ('\u{a67c}', '\u{a67d}'),
+ ('ꙿ', 'ꙿ'),
+ ('ꚜ', 'ꚝ'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('꜀', '꜡'),
+ ('ꞈ', '꞊'),
+ ('ꟸ', 'ꟹ'),
+ ('\u{a8c4}', '\u{a8c4}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a92b}', '꤮'),
+ ('꥓', '꥓'),
+ ('\u{a9b3}', '\u{a9b3}'),
+ ('꧀', '꧀'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('ꩻ', 'ꩽ'),
+ ('\u{aabf}', 'ꫂ'),
+ ('\u{aaf6}', '\u{aaf6}'),
+ ('꭛', 'ꭟ'),
+ ('ꭩ', '꭫'),
+ ('꯬', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('^', '^'),
+ ('`', '`'),
+ ('ー', 'ー'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ (' ̄', ' ̄'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('𐴢', '\u{10d27}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('\u{11046}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{110b9}', '\u{110ba}'),
+ ('\u{11133}', '\u{11134}'),
+ ('\u{11173}', '\u{11173}'),
+ ('𑇀', '𑇀'),
+ ('\u{111ca}', '\u{111cc}'),
+ ('𑈵', '\u{11236}'),
+ ('\u{112e9}', '\u{112ea}'),
+ ('\u{1133c}', '\u{1133c}'),
+ ('𑍍', '𑍍'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('\u{11442}', '\u{11442}'),
+ ('\u{11446}', '\u{11446}'),
+ ('\u{114c2}', '\u{114c3}'),
+ ('\u{115bf}', '\u{115c0}'),
+ ('\u{1163f}', '\u{1163f}'),
+ ('𑚶', '\u{116b7}'),
+ ('\u{1172b}', '\u{1172b}'),
+ ('\u{11839}', '\u{1183a}'),
+ ('𑤽', '\u{1193e}'),
+ ('\u{11943}', '\u{11943}'),
+ ('\u{119e0}', '\u{119e0}'),
+ ('\u{11a34}', '\u{11a34}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a99}', '\u{11a99}'),
+ ('\u{11c3f}', '\u{11c3f}'),
+ ('\u{11d42}', '\u{11d42}'),
+ ('\u{11d44}', '\u{11d45}'),
+ ('\u{11d97}', '\u{11d97}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿰', '𖿱'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e946}'),
+ ('\u{1e948}', '\u{1e94a}'),
+];
+
+pub const EMOJI: &'static [(char, char)] = &[
+ ('#', '#'),
+ ('*', '*'),
+ ('0', '9'),
+ ('©', '©'),
+ ('®', '®'),
+ ('‼', '‼'),
+ ('⁉', '⁉'),
+ ('™', '™'),
+ ('ℹ', 'ℹ'),
+ ('↔', '↙'),
+ ('↩', '↪'),
+ ('⌚', '⌛'),
+ ('⌨', '⌨'),
+ ('⏏', '⏏'),
+ ('⏩', '⏳'),
+ ('⏸', '⏺'),
+ ('Ⓜ', 'Ⓜ'),
+ ('▪', '▫'),
+ ('▶', '▶'),
+ ('◀', '◀'),
+ ('◻', '◾'),
+ ('☀', '☄'),
+ ('☎', '☎'),
+ ('☑', '☑'),
+ ('☔', '☕'),
+ ('☘', '☘'),
+ ('☝', '☝'),
+ ('☠', '☠'),
+ ('☢', '☣'),
+ ('☦', '☦'),
+ ('☪', '☪'),
+ ('☮', '☯'),
+ ('☸', '☺'),
+ ('♀', '♀'),
+ ('♂', '♂'),
+ ('♈', '♓'),
+ ('♟', '♠'),
+ ('♣', '♣'),
+ ('♥', '♦'),
+ ('♨', '♨'),
+ ('♻', '♻'),
+ ('♾', '♿'),
+ ('⚒', '⚗'),
+ ('⚙', '⚙'),
+ ('⚛', '⚜'),
+ ('⚠', '⚡'),
+ ('⚧', '⚧'),
+ ('⚪', '⚫'),
+ ('⚰', '⚱'),
+ ('⚽', '⚾'),
+ ('⛄', '⛅'),
+ ('⛈', '⛈'),
+ ('⛎', '⛏'),
+ ('⛑', '⛑'),
+ ('⛓', '⛔'),
+ ('⛩', '⛪'),
+ ('⛰', '⛵'),
+ ('⛷', '⛺'),
+ ('⛽', '⛽'),
+ ('✂', '✂'),
+ ('✅', '✅'),
+ ('✈', '✍'),
+ ('✏', '✏'),
+ ('✒', '✒'),
+ ('✔', '✔'),
+ ('✖', '✖'),
+ ('✝', '✝'),
+ ('✡', '✡'),
+ ('✨', '✨'),
+ ('✳', '✴'),
+ ('❄', '❄'),
+ ('❇', '❇'),
+ ('❌', '❌'),
+ ('❎', '❎'),
+ ('❓', '❕'),
+ ('❗', '❗'),
+ ('❣', '❤'),
+ ('➕', '➗'),
+ ('➡', '➡'),
+ ('➰', '➰'),
+ ('➿', '➿'),
+ ('⤴', '⤵'),
+ ('⬅', '⬇'),
+ ('⬛', '⬜'),
+ ('⭐', '⭐'),
+ ('⭕', '⭕'),
+ ('〰', '〰'),
+ ('〽', '〽'),
+ ('㊗', '㊗'),
+ ('㊙', '㊙'),
+ ('🀄', '🀄'),
+ ('🃏', '🃏'),
+ ('🅰', '🅱'),
+ ('🅾', '🅿'),
+ ('🆎', '🆎'),
+ ('🆑', '🆚'),
+ ('🇦', '🇿'),
+ ('🈁', '🈂'),
+ ('🈚', '🈚'),
+ ('🈯', '🈯'),
+ ('🈲', '🈺'),
+ ('🉐', '🉑'),
+ ('🌀', '🌡'),
+ ('🌤', '🎓'),
+ ('🎖', '🎗'),
+ ('🎙', '🎛'),
+ ('🎞', '🏰'),
+ ('🏳', '🏵'),
+ ('🏷', '📽'),
+ ('📿', '🔽'),
+ ('🕉', '🕎'),
+ ('🕐', '🕧'),
+ ('🕯', '🕰'),
+ ('🕳', '🕺'),
+ ('🖇', '🖇'),
+ ('🖊', '🖍'),
+ ('🖐', '🖐'),
+ ('🖕', '🖖'),
+ ('🖤', '🖥'),
+ ('🖨', '🖨'),
+ ('🖱', '🖲'),
+ ('🖼', '🖼'),
+ ('🗂', '🗄'),
+ ('🗑', '🗓'),
+ ('🗜', '🗞'),
+ ('🗡', '🗡'),
+ ('🗣', '🗣'),
+ ('🗨', '🗨'),
+ ('🗯', '🗯'),
+ ('🗳', '🗳'),
+ ('🗺', '🙏'),
+ ('🚀', '🛅'),
+ ('🛋', '🛒'),
+ ('🛕', '🛗'),
+ ('🛜', '🛥'),
+ ('🛩', '🛩'),
+ ('🛫', '🛬'),
+ ('🛰', '🛰'),
+ ('🛳', '🛼'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🤌', '🤺'),
+ ('🤼', '🥅'),
+ ('🥇', '🧿'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+];
+
+pub const EMOJI_COMPONENT: &'static [(char, char)] = &[
+ ('#', '#'),
+ ('*', '*'),
+ ('0', '9'),
+ ('\u{200d}', '\u{200d}'),
+ ('\u{20e3}', '\u{20e3}'),
+ ('\u{fe0f}', '\u{fe0f}'),
+ ('🇦', '🇿'),
+ ('🏻', '🏿'),
+ ('🦰', '🦳'),
+ ('\u{e0020}', '\u{e007f}'),
+];
+
+pub const EMOJI_MODIFIER: &'static [(char, char)] = &[('🏻', '🏿')];
+
+pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[
+ ('☝', '☝'),
+ ('⛹', '⛹'),
+ ('✊', '✍'),
+ ('🎅', '🎅'),
+ ('🏂', '🏄'),
+ ('🏇', '🏇'),
+ ('🏊', '🏌'),
+ ('👂', '👃'),
+ ('👆', '👐'),
+ ('👦', '👸'),
+ ('👼', '👼'),
+ ('💁', '💃'),
+ ('💅', '💇'),
+ ('💏', '💏'),
+ ('💑', '💑'),
+ ('💪', '💪'),
+ ('🕴', '🕵'),
+ ('🕺', '🕺'),
+ ('🖐', '🖐'),
+ ('🖕', '🖖'),
+ ('🙅', '🙇'),
+ ('🙋', '🙏'),
+ ('🚣', '🚣'),
+ ('🚴', '🚶'),
+ ('🛀', '🛀'),
+ ('🛌', '🛌'),
+ ('🤌', '🤌'),
+ ('🤏', '🤏'),
+ ('🤘', '🤟'),
+ ('🤦', '🤦'),
+ ('🤰', '🤹'),
+ ('🤼', '🤾'),
+ ('🥷', '🥷'),
+ ('🦵', '🦶'),
+ ('🦸', '🦹'),
+ ('🦻', '🦻'),
+ ('🧍', '🧏'),
+ ('🧑', '🧝'),
+ ('🫃', '🫅'),
+ ('🫰', '🫸'),
+];
+
+pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[
+ ('⌚', '⌛'),
+ ('⏩', '⏬'),
+ ('⏰', '⏰'),
+ ('⏳', '⏳'),
+ ('◽', '◾'),
+ ('☔', '☕'),
+ ('♈', '♓'),
+ ('♿', '♿'),
+ ('⚓', '⚓'),
+ ('⚡', '⚡'),
+ ('⚪', '⚫'),
+ ('⚽', '⚾'),
+ ('⛄', '⛅'),
+ ('⛎', '⛎'),
+ ('⛔', '⛔'),
+ ('⛪', '⛪'),
+ ('⛲', '⛳'),
+ ('⛵', '⛵'),
+ ('⛺', '⛺'),
+ ('⛽', '⛽'),
+ ('✅', '✅'),
+ ('✊', '✋'),
+ ('✨', '✨'),
+ ('❌', '❌'),
+ ('❎', '❎'),
+ ('❓', '❕'),
+ ('❗', '❗'),
+ ('➕', '➗'),
+ ('➰', '➰'),
+ ('➿', '➿'),
+ ('⬛', '⬜'),
+ ('⭐', '⭐'),
+ ('⭕', '⭕'),
+ ('🀄', '🀄'),
+ ('🃏', '🃏'),
+ ('🆎', '🆎'),
+ ('🆑', '🆚'),
+ ('🇦', '🇿'),
+ ('🈁', '🈁'),
+ ('🈚', '🈚'),
+ ('🈯', '🈯'),
+ ('🈲', '🈶'),
+ ('🈸', '🈺'),
+ ('🉐', '🉑'),
+ ('🌀', '🌠'),
+ ('🌭', '🌵'),
+ ('🌷', '🍼'),
+ ('🍾', '🎓'),
+ ('🎠', '🏊'),
+ ('🏏', '🏓'),
+ ('🏠', '🏰'),
+ ('🏴', '🏴'),
+ ('🏸', '🐾'),
+ ('👀', '👀'),
+ ('👂', '📼'),
+ ('📿', '🔽'),
+ ('🕋', '🕎'),
+ ('🕐', '🕧'),
+ ('🕺', '🕺'),
+ ('🖕', '🖖'),
+ ('🖤', '🖤'),
+ ('🗻', '🙏'),
+ ('🚀', '🛅'),
+ ('🛌', '🛌'),
+ ('🛐', '🛒'),
+ ('🛕', '🛗'),
+ ('🛜', '🛟'),
+ ('🛫', '🛬'),
+ ('🛴', '🛼'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🤌', '🤺'),
+ ('🤼', '🥅'),
+ ('🥇', '🧿'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+];
+
+pub const EXTENDED_PICTOGRAPHIC: &'static [(char, char)] = &[
+ ('©', '©'),
+ ('®', '®'),
+ ('‼', '‼'),
+ ('⁉', '⁉'),
+ ('™', '™'),
+ ('ℹ', 'ℹ'),
+ ('↔', '↙'),
+ ('↩', '↪'),
+ ('⌚', '⌛'),
+ ('⌨', '⌨'),
+ ('⎈', '⎈'),
+ ('⏏', '⏏'),
+ ('⏩', '⏳'),
+ ('⏸', '⏺'),
+ ('Ⓜ', 'Ⓜ'),
+ ('▪', '▫'),
+ ('▶', '▶'),
+ ('◀', '◀'),
+ ('◻', '◾'),
+ ('☀', '★'),
+ ('☇', '☒'),
+ ('☔', '⚅'),
+ ('⚐', '✅'),
+ ('✈', '✒'),
+ ('✔', '✔'),
+ ('✖', '✖'),
+ ('✝', '✝'),
+ ('✡', '✡'),
+ ('✨', '✨'),
+ ('✳', '✴'),
+ ('❄', '❄'),
+ ('❇', '❇'),
+ ('❌', '❌'),
+ ('❎', '❎'),
+ ('❓', '❕'),
+ ('❗', '❗'),
+ ('❣', '❧'),
+ ('➕', '➗'),
+ ('➡', '➡'),
+ ('➰', '➰'),
+ ('➿', '➿'),
+ ('⤴', '⤵'),
+ ('⬅', '⬇'),
+ ('⬛', '⬜'),
+ ('⭐', '⭐'),
+ ('⭕', '⭕'),
+ ('〰', '〰'),
+ ('〽', '〽'),
+ ('㊗', '㊗'),
+ ('㊙', '㊙'),
+ ('🀀', '\u{1f0ff}'),
+ ('🄍', '🄏'),
+ ('🄯', '🄯'),
+ ('🅬', '🅱'),
+ ('🅾', '🅿'),
+ ('🆎', '🆎'),
+ ('🆑', '🆚'),
+ ('🆭', '\u{1f1e5}'),
+ ('🈁', '\u{1f20f}'),
+ ('🈚', '🈚'),
+ ('🈯', '🈯'),
+ ('🈲', '🈺'),
+ ('\u{1f23c}', '\u{1f23f}'),
+ ('\u{1f249}', '🏺'),
+ ('🐀', '🔽'),
+ ('🕆', '🙏'),
+ ('🚀', '\u{1f6ff}'),
+ ('🝴', '🝿'),
+ ('🟕', '\u{1f7ff}'),
+ ('\u{1f80c}', '\u{1f80f}'),
+ ('\u{1f848}', '\u{1f84f}'),
+ ('\u{1f85a}', '\u{1f85f}'),
+ ('\u{1f888}', '\u{1f88f}'),
+ ('\u{1f8ae}', '\u{1f8ff}'),
+ ('🤌', '🤺'),
+ ('🤼', '🥅'),
+ ('🥇', '\u{1faff}'),
+ ('\u{1fc00}', '\u{1fffd}'),
+];
+
+pub const EXTENDER: &'static [(char, char)] = &[
+ ('·', '·'),
+ ('ː', 'ˑ'),
+ ('ـ', 'ـ'),
+ ('ߺ', 'ߺ'),
+ ('\u{b55}', '\u{b55}'),
+ ('ๆ', 'ๆ'),
+ ('ໆ', 'ໆ'),
+ ('᠊', '᠊'),
+ ('ᡃ', 'ᡃ'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1c36}', '\u{1c36}'),
+ ('ᱻ', 'ᱻ'),
+ ('々', '々'),
+ ('〱', '〵'),
+ ('ゝ', 'ゞ'),
+ ('ー', 'ヾ'),
+ ('ꀕ', 'ꀕ'),
+ ('ꘌ', 'ꘌ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧦ', 'ꧦ'),
+ ('ꩰ', 'ꩰ'),
+ ('ꫝ', 'ꫝ'),
+ ('ꫳ', 'ꫴ'),
+ ('ー', 'ー'),
+ ('𐞁', '𐞂'),
+ ('𑍝', '𑍝'),
+ ('𑗆', '𑗈'),
+ ('\u{11a98}', '\u{11a98}'),
+ ('𖭂', '𖭃'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𞄼', '𞄽'),
+ ('\u{1e944}', '\u{1e946}'),
+];
+
+pub const GRAPHEME_BASE: &'static [(char, char)] = &[
+ (' ', '~'),
+ ('\u{a0}', '¬'),
+ ('®', '˿'),
+ ('Ͱ', 'ͷ'),
+ ('ͺ', 'Ϳ'),
+ ('΄', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', '҂'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', '֊'),
+ ('֍', '֏'),
+ ('־', '־'),
+ ('׀', '׀'),
+ ('׃', '׃'),
+ ('׆', '׆'),
+ ('א', 'ת'),
+ ('ׯ', '״'),
+ ('؆', '؏'),
+ ('؛', '؛'),
+ ('؝', 'ي'),
+ ('٠', 'ٯ'),
+ ('ٱ', 'ە'),
+ ('۞', '۞'),
+ ('ۥ', 'ۦ'),
+ ('۩', '۩'),
+ ('ۮ', '܍'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('߀', 'ߪ'),
+ ('ߴ', 'ߺ'),
+ ('߾', 'ࠕ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('࠰', '࠾'),
+ ('ࡀ', 'ࡘ'),
+ ('࡞', '࡞'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('ः', 'ह'),
+ ('ऻ', 'ऻ'),
+ ('ऽ', 'ी'),
+ ('ॉ', 'ौ'),
+ ('ॎ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('।', 'ঀ'),
+ ('ং', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ি', 'ী'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৌ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('০', '৽'),
+ ('ਃ', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਾ', 'ੀ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '੯'),
+ ('ੲ', 'ੴ'),
+ ('੶', '੶'),
+ ('ઃ', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ી'),
+ ('ૉ', 'ૉ'),
+ ('ો', 'ૌ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('૦', '૱'),
+ ('ૹ', 'ૹ'),
+ ('ଂ', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ୀ', 'ୀ'),
+ ('େ', 'ୈ'),
+ ('ୋ', 'ୌ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('୦', '୷'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ி', 'ி'),
+ ('ு', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', 'ௌ'),
+ ('ௐ', 'ௐ'),
+ ('௦', '௺'),
+ ('ఁ', 'ః'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ు', 'ౄ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('౦', '౯'),
+ ('౷', 'ಀ'),
+ ('ಂ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಾ'),
+ ('ೀ', 'ು'),
+ ('ೃ', 'ೄ'),
+ ('ೇ', 'ೈ'),
+ ('ೊ', 'ೋ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+ ('ം', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ി', 'ീ'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൌ'),
+ ('ൎ', '൏'),
+ ('ൔ', 'ൖ'),
+ ('൘', 'ൡ'),
+ ('൦', 'ൿ'),
+ ('ං', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ැ', 'ෑ'),
+ ('ෘ', 'ෞ'),
+ ('෦', '෯'),
+ ('ෲ', '෴'),
+ ('ก', 'ะ'),
+ ('า', 'ำ'),
+ ('฿', 'ๆ'),
+ ('๏', '๛'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ະ'),
+ ('າ', 'ຳ'),
+ ('ຽ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', '༗'),
+ ('༚', '༴'),
+ ('༶', '༶'),
+ ('༸', '༸'),
+ ('༺', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ཿ', 'ཿ'),
+ ('྅', '྅'),
+ ('ྈ', 'ྌ'),
+ ('྾', '࿅'),
+ ('࿇', '࿌'),
+ ('࿎', '࿚'),
+ ('က', 'ာ'),
+ ('ေ', 'ေ'),
+ ('း', 'း'),
+ ('ျ', 'ြ'),
+ ('ဿ', 'ၗ'),
+ ('ၚ', 'ၝ'),
+ ('ၡ', 'ၰ'),
+ ('ၵ', 'ႁ'),
+ ('ႃ', 'ႄ'),
+ ('ႇ', 'ႌ'),
+ ('ႎ', 'ႜ'),
+ ('႞', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('፠', '፼'),
+ ('ᎀ', '᎙'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('᐀', '᚜'),
+ ('ᚠ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('᜕', '᜕'),
+ ('ᜟ', 'ᜱ'),
+ ('᜴', '᜶'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ក', 'ឳ'),
+ ('ា', 'ា'),
+ ('ើ', 'ៅ'),
+ ('ះ', 'ៈ'),
+ ('។', 'ៜ'),
+ ('០', '៩'),
+ ('៰', '៹'),
+ ('᠀', '᠊'),
+ ('᠐', '᠙'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢄ'),
+ ('ᢇ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᤣ', 'ᤦ'),
+ ('ᤩ', 'ᤫ'),
+ ('ᤰ', 'ᤱ'),
+ ('ᤳ', 'ᤸ'),
+ ('᥀', '᥀'),
+ ('᥄', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('᧐', '᧚'),
+ ('᧞', 'ᨖ'),
+ ('ᨙ', 'ᨚ'),
+ ('᨞', 'ᩕ'),
+ ('ᩗ', 'ᩗ'),
+ ('ᩡ', 'ᩡ'),
+ ('ᩣ', 'ᩤ'),
+ ('ᩭ', 'ᩲ'),
+ ('᪀', '᪉'),
+ ('᪐', '᪙'),
+ ('᪠', '᪭'),
+ ('ᬄ', 'ᬳ'),
+ ('ᬻ', 'ᬻ'),
+ ('ᬽ', 'ᭁ'),
+ ('ᭃ', 'ᭌ'),
+ ('᭐', '᭪'),
+ ('᭴', '᭾'),
+ ('ᮂ', 'ᮡ'),
+ ('ᮦ', 'ᮧ'),
+ ('᮪', '᮪'),
+ ('ᮮ', 'ᯥ'),
+ ('ᯧ', 'ᯧ'),
+ ('ᯪ', 'ᯬ'),
+ ('ᯮ', 'ᯮ'),
+ ('᯲', '᯳'),
+ ('᯼', 'ᰫ'),
+ ('ᰴ', 'ᰵ'),
+ ('᰻', '᱉'),
+ ('ᱍ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', '᳇'),
+ ('᳓', '᳓'),
+ ('᳡', '᳡'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', '᳷'),
+ ('ᳺ', 'ᳺ'),
+ ('ᴀ', 'ᶿ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ῄ'),
+ ('ῆ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('῝', '`'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', '῾'),
+ ('\u{2000}', '\u{200a}'),
+ ('‐', '‧'),
+ ('\u{202f}', '\u{205f}'),
+ ('⁰', 'ⁱ'),
+ ('⁴', '₎'),
+ ('ₐ', 'ₜ'),
+ ('₠', '⃀'),
+ ('℀', '↋'),
+ ('←', '␦'),
+ ('⑀', '⑊'),
+ ('①', '⭳'),
+ ('⭶', '⮕'),
+ ('⮗', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('⳹', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', '⵰'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('⸀', '⹝'),
+ ('⺀', '⺙'),
+ ('⺛', '⻳'),
+ ('⼀', '⿕'),
+ ('⿰', '⿻'),
+ ('\u{3000}', '〩'),
+ ('〰', '〿'),
+ ('ぁ', 'ゖ'),
+ ('゛', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('㆐', '㇣'),
+ ('ㇰ', '㈞'),
+ ('㈠', 'ꒌ'),
+ ('꒐', '꓆'),
+ ('ꓐ', 'ꘫ'),
+ ('Ꙁ', 'ꙮ'),
+ ('꙳', '꙳'),
+ ('꙾', 'ꚝ'),
+ ('ꚠ', 'ꛯ'),
+ ('꛲', '꛷'),
+ ('꜀', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠤ'),
+ ('ꠧ', '꠫'),
+ ('꠰', '꠹'),
+ ('ꡀ', '꡷'),
+ ('ꢀ', 'ꣃ'),
+ ('꣎', '꣙'),
+ ('ꣲ', 'ꣾ'),
+ ('꤀', 'ꤥ'),
+ ('꤮', 'ꥆ'),
+ ('ꥒ', '꥓'),
+ ('꥟', 'ꥼ'),
+ ('ꦃ', 'ꦲ'),
+ ('ꦴ', 'ꦵ'),
+ ('ꦺ', 'ꦻ'),
+ ('ꦾ', '꧍'),
+ ('ꧏ', '꧙'),
+ ('꧞', 'ꧤ'),
+ ('ꧦ', 'ꧾ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꨯ', 'ꨰ'),
+ ('ꨳ', 'ꨴ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꩍ', 'ꩍ'),
+ ('꩐', '꩙'),
+ ('꩜', 'ꩻ'),
+ ('ꩽ', 'ꪯ'),
+ ('ꪱ', 'ꪱ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪽ'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫫ'),
+ ('ꫮ', 'ꫵ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', '꭫'),
+ ('ꭰ', 'ꯤ'),
+ ('ꯦ', 'ꯧ'),
+ ('ꯩ', '꯬'),
+ ('꯰', '꯹'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', '﯂'),
+ ('ﯓ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('﷏', '﷏'),
+ ('ﷰ', '﷿'),
+ ('︐', '︙'),
+ ('︰', '﹒'),
+ ('﹔', '﹦'),
+ ('﹨', '﹫'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('!', 'ン'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('¢', '₩'),
+ ('│', '○'),
+ ('', '�'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐄀', '𐄂'),
+ ('𐄇', '𐄳'),
+ ('𐄷', '𐆎'),
+ ('𐆐', '𐆜'),
+ ('𐆠', '𐆠'),
+ ('𐇐', '𐇼'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐋡', '𐋻'),
+ ('𐌀', '𐌣'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎟', '𐏃'),
+ ('𐏈', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒠', '𐒩'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕯', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡗', '𐢞'),
+ ('𐢧', '𐢯'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐣻', '𐤛'),
+ ('𐤟', '𐤹'),
+ ('𐤿', '𐤿'),
+ ('𐦀', '𐦷'),
+ ('𐦼', '𐧏'),
+ ('𐧒', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩀', '𐩈'),
+ ('𐩐', '𐩘'),
+ ('𐩠', '𐪟'),
+ ('𐫀', '𐫤'),
+ ('𐫫', '𐫶'),
+ ('𐬀', '𐬵'),
+ ('𐬹', '𐭕'),
+ ('𐭘', '𐭲'),
+ ('𐭸', '𐮑'),
+ ('𐮙', '𐮜'),
+ ('𐮩', '𐮯'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐳺', '𐴣'),
+ ('𐴰', '𐴹'),
+ ('𐹠', '𐹾'),
+ ('𐺀', '𐺩'),
+ ('𐺭', '𐺭'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽑', '𐽙'),
+ ('𐽰', '𐾁'),
+ ('𐾆', '𐾉'),
+ ('𐾰', '𐿋'),
+ ('𐿠', '𐿶'),
+ ('𑀀', '𑀀'),
+ ('𑀂', '𑀷'),
+ ('𑁇', '𑁍'),
+ ('𑁒', '𑁯'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂂', '𑂲'),
+ ('𑂷', '𑂸'),
+ ('𑂻', '𑂼'),
+ ('𑂾', '𑃁'),
+ ('𑃐', '𑃨'),
+ ('𑃰', '𑃹'),
+ ('𑄃', '𑄦'),
+ ('𑄬', '𑄬'),
+ ('𑄶', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅴', '𑅶'),
+ ('𑆂', '𑆵'),
+ ('𑆿', '𑇈'),
+ ('𑇍', '𑇎'),
+ ('𑇐', '𑇟'),
+ ('𑇡', '𑇴'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈮'),
+ ('𑈲', '𑈳'),
+ ('𑈵', '𑈵'),
+ ('𑈸', '𑈽'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊩'),
+ ('𑊰', '𑋞'),
+ ('𑋠', '𑋢'),
+ ('𑋰', '𑋹'),
+ ('𑌂', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑌿', '𑌿'),
+ ('𑍁', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍣'),
+ ('𑐀', '𑐷'),
+ ('𑑀', '𑑁'),
+ ('𑑅', '𑑅'),
+ ('𑑇', '𑑛'),
+ ('𑑝', '𑑝'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑒱', '𑒲'),
+ ('𑒹', '𑒹'),
+ ('𑒻', '𑒼'),
+ ('𑒾', '𑒾'),
+ ('𑓁', '𑓁'),
+ ('𑓄', '𑓇'),
+ ('𑓐', '𑓙'),
+ ('𑖀', '𑖮'),
+ ('𑖰', '𑖱'),
+ ('𑖸', '𑖻'),
+ ('𑖾', '𑖾'),
+ ('𑗁', '𑗛'),
+ ('𑘀', '𑘲'),
+ ('𑘻', '𑘼'),
+ ('𑘾', '𑘾'),
+ ('𑙁', '𑙄'),
+ ('𑙐', '𑙙'),
+ ('𑙠', '𑙬'),
+ ('𑚀', '𑚪'),
+ ('𑚬', '𑚬'),
+ ('𑚮', '𑚯'),
+ ('𑚶', '𑚶'),
+ ('𑚸', '𑚹'),
+ ('𑛀', '𑛉'),
+ ('𑜀', '𑜚'),
+ ('𑜠', '𑜡'),
+ ('𑜦', '𑜦'),
+ ('𑜰', '𑝆'),
+ ('𑠀', '𑠮'),
+ ('𑠸', '𑠸'),
+ ('𑠻', '𑠻'),
+ ('𑢠', '𑣲'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤱', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('𑤽', '𑤽'),
+ ('𑤿', '𑥂'),
+ ('𑥄', '𑥆'),
+ ('𑥐', '𑥙'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧓'),
+ ('𑧜', '𑧟'),
+ ('𑧡', '𑧤'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨹', '𑨺'),
+ ('𑨿', '𑩆'),
+ ('𑩐', '𑩐'),
+ ('𑩗', '𑩘'),
+ ('𑩜', '𑪉'),
+ ('𑪗', '𑪗'),
+ ('𑪚', '𑪢'),
+ ('𑪰', '𑫸'),
+ ('𑬀', '𑬉'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰯'),
+ ('𑰾', '𑰾'),
+ ('𑱀', '𑱅'),
+ ('𑱐', '𑱬'),
+ ('𑱰', '𑲏'),
+ ('𑲩', '𑲩'),
+ ('𑲱', '𑲱'),
+ ('𑲴', '𑲴'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵐', '𑵙'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('𑶓', '𑶔'),
+ ('𑶖', '𑶖'),
+ ('𑶘', '𑶘'),
+ ('𑶠', '𑶩'),
+ ('𑻠', '𑻲'),
+ ('𑻵', '𑻸'),
+ ('𑼂', '𑼐'),
+ ('𑼒', '𑼵'),
+ ('𑼾', '𑼿'),
+ ('𑽁', '𑽁'),
+ ('𑽃', '𑽙'),
+ ('𑾰', '𑾰'),
+ ('𑿀', '𑿱'),
+ ('𑿿', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒑰', '𒑴'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿲'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩠', '𖩩'),
+ ('𖩮', '𖪾'),
+ ('𖫀', '𖫉'),
+ ('𖫐', '𖫭'),
+ ('𖫵', '𖫵'),
+ ('𖬀', '𖬯'),
+ ('𖬷', '𖭅'),
+ ('𖭐', '𖭙'),
+ ('𖭛', '𖭡'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖺚'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖾇'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿣'),
+ ('𖿰', '𖿱'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𛲜', '𛲜'),
+ ('𛲟', '𛲟'),
+ ('𜽐', '𜿃'),
+ ('𝀀', '𝃵'),
+ ('𝄀', '𝄦'),
+ ('𝄩', '𝅘𝅥𝅲'),
+ ('𝅦', '𝅦'),
+ ('𝅪', '𝅭'),
+ ('𝆃', '𝆄'),
+ ('𝆌', '𝆩'),
+ ('𝆮', '𝇪'),
+ ('𝈀', '𝉁'),
+ ('𝉅', '𝉅'),
+ ('𝋀', '𝋓'),
+ ('𝋠', '𝋳'),
+ ('𝌀', '𝍖'),
+ ('𝍠', '𝍸'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝟋'),
+ ('𝟎', '𝧿'),
+ ('𝨷', '𝨺'),
+ ('𝩭', '𝩴'),
+ ('𝩶', '𝪃'),
+ ('𝪅', '𝪋'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅀', '𞅉'),
+ ('𞅎', '𞅏'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞋰', '𞋹'),
+ ('𞋿', '𞋿'),
+ ('𞓐', '𞓫'),
+ ('𞓰', '𞓹'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞣇', '𞣏'),
+ ('𞤀', '𞥃'),
+ ('𞥋', '𞥋'),
+ ('𞥐', '𞥙'),
+ ('𞥞', '𞥟'),
+ ('𞱱', '𞲴'),
+ ('𞴁', '𞴽'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𞻰', '𞻱'),
+ ('🀀', '🀫'),
+ ('🀰', '🂓'),
+ ('🂠', '🂮'),
+ ('🂱', '🂿'),
+ ('🃁', '🃏'),
+ ('🃑', '🃵'),
+ ('🄀', '🆭'),
+ ('🇦', '🈂'),
+ ('🈐', '🈻'),
+ ('🉀', '🉈'),
+ ('🉐', '🉑'),
+ ('🉠', '🉥'),
+ ('🌀', '🛗'),
+ ('🛜', '🛬'),
+ ('🛰', '🛼'),
+ ('🜀', '🝶'),
+ ('🝻', '🟙'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🠀', '🠋'),
+ ('🠐', '🡇'),
+ ('🡐', '🡙'),
+ ('🡠', '🢇'),
+ ('🢐', '🢭'),
+ ('🢰', '🢱'),
+ ('🤀', '🩓'),
+ ('🩠', '🩭'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+ ('🬀', '🮒'),
+ ('🮔', '🯊'),
+ ('🯰', '🯹'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{483}', '\u{489}'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6df}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', '\u{7f3}'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{819}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('\u{898}', '\u{89f}'),
+ ('\u{8ca}', '\u{8e1}'),
+ ('\u{8e3}', '\u{902}'),
+ ('\u{93a}', '\u{93a}'),
+ ('\u{93c}', '\u{93c}'),
+ ('\u{941}', '\u{948}'),
+ ('\u{94d}', '\u{94d}'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', '\u{981}'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9be}', '\u{9be}'),
+ ('\u{9c1}', '\u{9c4}'),
+ ('\u{9cd}', '\u{9cd}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', '\u{a02}'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('\u{a41}', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', '\u{a82}'),
+ ('\u{abc}', '\u{abc}'),
+ ('\u{ac1}', '\u{ac5}'),
+ ('\u{ac7}', '\u{ac8}'),
+ ('\u{acd}', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', '\u{b01}'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3e}', '\u{b3f}'),
+ ('\u{b41}', '\u{b44}'),
+ ('\u{b4d}', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bbe}', '\u{bbe}'),
+ ('\u{bc0}', '\u{bc0}'),
+ ('\u{bcd}', '\u{bcd}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', '\u{c00}'),
+ ('\u{c04}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', '\u{c40}'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', '\u{c81}'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('\u{cbf}', '\u{cbf}'),
+ ('\u{cc2}', '\u{cc2}'),
+ ('\u{cc6}', '\u{cc6}'),
+ ('\u{ccc}', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('\u{d00}', '\u{d01}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d3e}', '\u{d3e}'),
+ ('\u{d41}', '\u{d44}'),
+ ('\u{d4d}', '\u{d4d}'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', '\u{d81}'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dcf}'),
+ ('\u{dd2}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('\u{ddf}', '\u{ddf}'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e47}', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('\u{f71}', '\u{f7e}'),
+ ('\u{f80}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('\u{102d}', '\u{1030}'),
+ ('\u{1032}', '\u{1037}'),
+ ('\u{1039}', '\u{103a}'),
+ ('\u{103d}', '\u{103e}'),
+ ('\u{1058}', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{1082}'),
+ ('\u{1085}', '\u{1086}'),
+ ('\u{108d}', '\u{108d}'),
+ ('\u{109d}', '\u{109d}'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '\u{1714}'),
+ ('\u{1732}', '\u{1733}'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17b5}'),
+ ('\u{17b7}', '\u{17bd}'),
+ ('\u{17c6}', '\u{17c6}'),
+ ('\u{17c9}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', '\u{1922}'),
+ ('\u{1927}', '\u{1928}'),
+ ('\u{1932}', '\u{1932}'),
+ ('\u{1939}', '\u{193b}'),
+ ('\u{1a17}', '\u{1a18}'),
+ ('\u{1a1b}', '\u{1a1b}'),
+ ('\u{1a56}', '\u{1a56}'),
+ ('\u{1a58}', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a60}'),
+ ('\u{1a62}', '\u{1a62}'),
+ ('\u{1a65}', '\u{1a6c}'),
+ ('\u{1a73}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', '\u{1b03}'),
+ ('\u{1b34}', '\u{1b3a}'),
+ ('\u{1b3c}', '\u{1b3c}'),
+ ('\u{1b42}', '\u{1b42}'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '\u{1b81}'),
+ ('\u{1ba2}', '\u{1ba5}'),
+ ('\u{1ba8}', '\u{1ba9}'),
+ ('\u{1bab}', '\u{1bad}'),
+ ('\u{1be6}', '\u{1be6}'),
+ ('\u{1be8}', '\u{1be9}'),
+ ('\u{1bed}', '\u{1bed}'),
+ ('\u{1bef}', '\u{1bf1}'),
+ ('\u{1c2c}', '\u{1c33}'),
+ ('\u{1c36}', '\u{1c37}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce0}'),
+ ('\u{1ce2}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{200c}', '\u{200c}'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{302a}', '\u{302f}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{a66f}', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('\u{a825}', '\u{a826}'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('\u{a8c4}', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '\u{a951}'),
+ ('\u{a980}', '\u{a982}'),
+ ('\u{a9b3}', '\u{a9b3}'),
+ ('\u{a9b6}', '\u{a9b9}'),
+ ('\u{a9bc}', '\u{a9bd}'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa2e}'),
+ ('\u{aa31}', '\u{aa32}'),
+ ('\u{aa35}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', '\u{aa4c}'),
+ ('\u{aa7c}', '\u{aa7c}'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('\u{aaec}', '\u{aaed}'),
+ ('\u{aaf6}', '\u{aaf6}'),
+ ('\u{abe5}', '\u{abe5}'),
+ ('\u{abe8}', '\u{abe8}'),
+ ('\u{abed}', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('\u{11001}', '\u{11001}'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '\u{11081}'),
+ ('\u{110b3}', '\u{110b6}'),
+ ('\u{110b9}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{1112b}'),
+ ('\u{1112d}', '\u{11134}'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '\u{11181}'),
+ ('\u{111b6}', '\u{111be}'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('\u{111cf}', '\u{111cf}'),
+ ('\u{1122f}', '\u{11231}'),
+ ('\u{11234}', '\u{11234}'),
+ ('\u{11236}', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112df}'),
+ ('\u{112e3}', '\u{112ea}'),
+ ('\u{11300}', '\u{11301}'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{1133e}', '\u{1133e}'),
+ ('\u{11340}', '\u{11340}'),
+ ('\u{11357}', '\u{11357}'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('\u{11438}', '\u{1143f}'),
+ ('\u{11442}', '\u{11444}'),
+ ('\u{11446}', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b0}', '\u{114b0}'),
+ ('\u{114b3}', '\u{114b8}'),
+ ('\u{114ba}', '\u{114ba}'),
+ ('\u{114bd}', '\u{114bd}'),
+ ('\u{114bf}', '\u{114c0}'),
+ ('\u{114c2}', '\u{114c3}'),
+ ('\u{115af}', '\u{115af}'),
+ ('\u{115b2}', '\u{115b5}'),
+ ('\u{115bc}', '\u{115bd}'),
+ ('\u{115bf}', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('\u{11633}', '\u{1163a}'),
+ ('\u{1163d}', '\u{1163d}'),
+ ('\u{1163f}', '\u{11640}'),
+ ('\u{116ab}', '\u{116ab}'),
+ ('\u{116ad}', '\u{116ad}'),
+ ('\u{116b0}', '\u{116b5}'),
+ ('\u{116b7}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1171f}'),
+ ('\u{11722}', '\u{11725}'),
+ ('\u{11727}', '\u{1172b}'),
+ ('\u{1182f}', '\u{11837}'),
+ ('\u{11839}', '\u{1183a}'),
+ ('\u{11930}', '\u{11930}'),
+ ('\u{1193b}', '\u{1193c}'),
+ ('\u{1193e}', '\u{1193e}'),
+ ('\u{11943}', '\u{11943}'),
+ ('\u{119d4}', '\u{119d7}'),
+ ('\u{119da}', '\u{119db}'),
+ ('\u{119e0}', '\u{119e0}'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '\u{11a38}'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a56}'),
+ ('\u{11a59}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a96}'),
+ ('\u{11a98}', '\u{11a99}'),
+ ('\u{11c30}', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3d}'),
+ ('\u{11c3f}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('\u{11caa}', '\u{11cb0}'),
+ ('\u{11cb2}', '\u{11cb3}'),
+ ('\u{11cb5}', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('\u{11d95}', '\u{11d95}'),
+ ('\u{11d97}', '\u{11d97}'),
+ ('\u{11ef3}', '\u{11ef4}'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('\u{11f36}', '\u{11f3a}'),
+ ('\u{11f40}', '\u{11f40}'),
+ ('\u{11f42}', '\u{11f42}'),
+ ('\u{13440}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d165}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('\u{1d16e}', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e4ec}', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e94a}'),
+ ('\u{e0020}', '\u{e007f}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const GRAPHEME_LINK: &'static [(char, char)] = &[
+ ('\u{94d}', '\u{94d}'),
+ ('\u{9cd}', '\u{9cd}'),
+ ('\u{a4d}', '\u{a4d}'),
+ ('\u{acd}', '\u{acd}'),
+ ('\u{b4d}', '\u{b4d}'),
+ ('\u{bcd}', '\u{bcd}'),
+ ('\u{c4d}', '\u{c4d}'),
+ ('\u{ccd}', '\u{ccd}'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d4d}', '\u{d4d}'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{e3a}', '\u{e3a}'),
+ ('\u{eba}', '\u{eba}'),
+ ('\u{f84}', '\u{f84}'),
+ ('\u{1039}', '\u{103a}'),
+ ('\u{1714}', '᜕'),
+ ('᜴', '᜴'),
+ ('\u{17d2}', '\u{17d2}'),
+ ('\u{1a60}', '\u{1a60}'),
+ ('᭄', '᭄'),
+ ('᮪', '\u{1bab}'),
+ ('᯲', '᯳'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('\u{a8c4}', '\u{a8c4}'),
+ ('꥓', '꥓'),
+ ('꧀', '꧀'),
+ ('\u{aaf6}', '\u{aaf6}'),
+ ('\u{abed}', '\u{abed}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{11046}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{1107f}', '\u{1107f}'),
+ ('\u{110b9}', '\u{110b9}'),
+ ('\u{11133}', '\u{11134}'),
+ ('𑇀', '𑇀'),
+ ('𑈵', '𑈵'),
+ ('\u{112ea}', '\u{112ea}'),
+ ('𑍍', '𑍍'),
+ ('\u{11442}', '\u{11442}'),
+ ('\u{114c2}', '\u{114c2}'),
+ ('\u{115bf}', '\u{115bf}'),
+ ('\u{1163f}', '\u{1163f}'),
+ ('𑚶', '𑚶'),
+ ('\u{1172b}', '\u{1172b}'),
+ ('\u{11839}', '\u{11839}'),
+ ('𑤽', '\u{1193e}'),
+ ('\u{119e0}', '\u{119e0}'),
+ ('\u{11a34}', '\u{11a34}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a99}', '\u{11a99}'),
+ ('\u{11c3f}', '\u{11c3f}'),
+ ('\u{11d44}', '\u{11d45}'),
+ ('\u{11d97}', '\u{11d97}'),
+ ('𑽁', '\u{11f42}'),
+];
+
+pub const HEX_DIGIT: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('A', 'F'),
+ ('a', 'f'),
+ ('0', '9'),
+ ('A', 'F'),
+ ('a', 'f'),
+];
+
+pub const HYPHEN: &'static [(char, char)] = &[
+ ('-', '-'),
+ ('\u{ad}', '\u{ad}'),
+ ('֊', '֊'),
+ ('᠆', '᠆'),
+ ('‐', '‑'),
+ ('⸗', '⸗'),
+ ('・', '・'),
+ ('﹣', '﹣'),
+ ('-', '-'),
+ ('・', '・'),
+];
+
+pub const IDS_BINARY_OPERATOR: &'static [(char, char)] =
+ &[('⿰', '⿱'), ('⿴', '⿻')];
+
+pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[('⿲', '⿳')];
+
+pub const ID_CONTINUE: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('·', '·'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('\u{300}', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('\u{483}', '\u{487}'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('\u{610}', '\u{61a}'),
+ ('ؠ', '٩'),
+ ('ٮ', 'ۓ'),
+ ('ە', '\u{6dc}'),
+ ('\u{6df}', '\u{6e8}'),
+ ('\u{6ea}', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', '\u{74a}'),
+ ('ݍ', 'ޱ'),
+ ('߀', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('ࠀ', '\u{82d}'),
+ ('ࡀ', '\u{85b}'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('\u{898}', '\u{8e1}'),
+ ('\u{8e3}', '\u{963}'),
+ ('०', '९'),
+ ('ॱ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('૦', '૯'),
+ ('ૹ', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('\u{b3c}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('୦', '୯'),
+ ('ୱ', 'ୱ'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௦', '௯'),
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('\u{c3c}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('౦', '౯'),
+ ('ಀ', 'ಃ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('\u{cbc}', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൎ'),
+ ('ൔ', '\u{d57}'),
+ ('ൟ', '\u{d63}'),
+ ('൦', '൯'),
+ ('ൺ', 'ൿ'),
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('෦', '෯'),
+ ('ෲ', 'ෳ'),
+ ('ก', '\u{e3a}'),
+ ('เ', '\u{e4e}'),
+ ('๐', '๙'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('\u{f18}', '\u{f19}'),
+ ('༠', '༩'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('က', '၉'),
+ ('ၐ', '\u{109d}'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('\u{135d}', '\u{135f}'),
+ ('፩', '፱'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', '᜕'),
+ ('ᜟ', '᜴'),
+ ('ᝀ', '\u{1753}'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('\u{1772}', '\u{1773}'),
+ ('ក', '\u{17d3}'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', '\u{17dd}'),
+ ('០', '៩'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '᠙'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥆', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('᧐', '᧚'),
+ ('ᨀ', '\u{1a1b}'),
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1ab0}', '\u{1abd}'),
+ ('\u{1abf}', '\u{1ace}'),
+ ('\u{1b00}', 'ᭌ'),
+ ('᭐', '᭙'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '᯳'),
+ ('ᰀ', '\u{1c37}'),
+ ('᱀', '᱉'),
+ ('ᱍ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', 'ᳺ'),
+ ('ᴀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('‿', '⁀'),
+ ('⁔', '⁔'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('\u{20d0}', '\u{20dc}'),
+ ('\u{20e1}', '\u{20e1}'),
+ ('\u{20e5}', '\u{20f0}'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('℘', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('\u{2d7f}', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('々', '〇'),
+ ('〡', '\u{302f}'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('\u{3099}', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘫ'),
+ ('Ꙁ', '\u{a66f}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('ꙿ', '\u{a6f1}'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢀ', '\u{a8c5}'),
+ ('꣐', '꣙'),
+ ('\u{a8e0}', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', '\u{a92d}'),
+ ('ꤰ', '꥓'),
+ ('ꥠ', 'ꥼ'),
+ ('\u{a980}', '꧀'),
+ ('ꧏ', '꧙'),
+ ('ꧠ', 'ꧾ'),
+ ('ꨀ', '\u{aa36}'),
+ ('ꩀ', 'ꩍ'),
+ ('꩐', '꩙'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫯ'),
+ ('ꫲ', '\u{aaf6}'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('꯰', '꯹'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('︳', '︴'),
+ ('﹍', '﹏'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '\u{1037a}'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒠', '𐒩'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '\u{10ae6}'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '\u{10d27}'),
+ ('𐴰', '𐴹'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('𐺰', '𐺱'),
+ ('\u{10efd}', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '\u{10f50}'),
+ ('𐽰', '\u{10f85}'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀀', '\u{11046}'),
+ ('𑁦', '𑁵'),
+ ('\u{1107f}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('𑃐', '𑃨'),
+ ('𑃰', '𑃹'),
+ ('\u{11100}', '\u{11134}'),
+ ('𑄶', '𑄿'),
+ ('𑅄', '𑅇'),
+ ('𑅐', '\u{11173}'),
+ ('𑅶', '𑅶'),
+ ('\u{11180}', '𑇄'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '\u{11237}'),
+ ('\u{1123e}', '\u{11241}'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '\u{112ea}'),
+ ('𑋰', '𑋹'),
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133b}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐀', '𑑊'),
+ ('𑑐', '𑑙'),
+ ('\u{1145e}', '𑑡'),
+ ('𑒀', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑓐', '𑓙'),
+ ('𑖀', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('𑗘', '\u{115dd}'),
+ ('𑘀', '\u{11640}'),
+ ('𑙄', '𑙄'),
+ ('𑙐', '𑙙'),
+ ('𑚀', '𑚸'),
+ ('𑛀', '𑛉'),
+ ('𑜀', '𑜚'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑜰', '𑜹'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '\u{1183a}'),
+ ('𑢠', '𑣩'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{11943}'),
+ ('𑥐', '𑥙'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧡'),
+ ('𑧣', '𑧤'),
+ ('𑨀', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('𑩐', '\u{11a99}'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '\u{11c36}'),
+ ('\u{11c38}', '𑱀'),
+ ('𑱐', '𑱙'),
+ ('𑱲', '𑲏'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+ ('𑻠', '𑻶'),
+ ('\u{11f00}', '𑼐'),
+ ('𑼒', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('𑽐', '𑽙'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('\u{13440}', '\u{13455}'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩠', '𖩩'),
+ ('𖩰', '𖪾'),
+ ('𖫀', '𖫉'),
+ ('𖫐', '𖫭'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('𖬀', '\u{16b36}'),
+ ('𖭀', '𖭃'),
+ ('𖭐', '𖭙'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('\u{16f4f}', '𖾇'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('𞄀', '𞄬'),
+ ('\u{1e130}', '𞄽'),
+ ('𞅀', '𞅉'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '\u{1e2ae}'),
+ ('𞋀', '𞋹'),
+ ('𞓐', '𞓹'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('𞤀', '𞥋'),
+ ('𞥐', '𞥙'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('🯰', '🯹'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const ID_START: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('Ͱ', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('ؠ', 'ي'),
+ ('ٮ', 'ٯ'),
+ ('ٱ', 'ۓ'),
+ ('ە', 'ە'),
+ ('ۥ', 'ۦ'),
+ ('ۮ', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠀ', 'ࠕ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('ऄ', 'ह'),
+ ('ऽ', 'ऽ'),
+ ('ॐ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('ॱ', 'ঀ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('ੲ', 'ੴ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ઽ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('ૹ', 'ૹ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('ୱ', 'ୱ'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ௐ', 'ௐ'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('ಀ', 'ಀ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಽ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('ೱ', 'ೲ'),
+ ('ഄ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', 'ൖ'),
+ ('ൟ', 'ൡ'),
+ ('ൺ', 'ൿ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ก', 'ะ'),
+ ('า', 'ำ'),
+ ('เ', 'ๆ'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ະ'),
+ ('າ', 'ຳ'),
+ ('ຽ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ྈ', 'ྌ'),
+ ('က', 'ဪ'),
+ ('ဿ', 'ဿ'),
+ ('ၐ', 'ၕ'),
+ ('ၚ', 'ၝ'),
+ ('ၡ', 'ၡ'),
+ ('ၥ', 'ၦ'),
+ ('ၮ', 'ၰ'),
+ ('ၵ', 'ႁ'),
+ ('ႎ', 'ႎ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('ᜟ', 'ᜱ'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ក', 'ឳ'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', 'ៜ'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᥐ', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('ᨀ', 'ᨖ'),
+ ('ᨠ', 'ᩔ'),
+ ('ᪧ', 'ᪧ'),
+ ('ᬅ', 'ᬳ'),
+ ('ᭅ', 'ᭌ'),
+ ('ᮃ', 'ᮠ'),
+ ('ᮮ', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᰀ', 'ᰣ'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ᴀ', 'ᶿ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('℘', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('々', '〇'),
+ ('〡', '〩'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('゛', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('Ꙁ', 'ꙮ'),
+ ('ꙿ', 'ꚝ'),
+ ('ꚠ', 'ꛯ'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠢ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢂ', 'ꢳ'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', 'ꣾ'),
+ ('ꤊ', 'ꤥ'),
+ ('ꤰ', 'ꥆ'),
+ ('ꥠ', 'ꥼ'),
+ ('ꦄ', 'ꦲ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧠ', 'ꧤ'),
+ ('ꧦ', 'ꧯ'),
+ ('ꧺ', 'ꧾ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꩺ'),
+ ('ꩾ', 'ꪯ'),
+ ('ꪱ', 'ꪱ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪽ'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫪ'),
+ ('ꫲ', 'ꫴ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯢ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '𐴣'),
+ ('𐺀', '𐺩'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀃', '𑀷'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂃', '𑂯'),
+ ('𑃐', '𑃨'),
+ ('𑄃', '𑄦'),
+ ('𑅄', '𑅄'),
+ ('𑅇', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('𑆃', '𑆲'),
+ ('𑇁', '𑇄'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈫'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '𑋞'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍡'),
+ ('𑐀', '𑐴'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '𑖮'),
+ ('𑗘', '𑗛'),
+ ('𑘀', '𑘯'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '𑚪'),
+ ('𑚸', '𑚸'),
+ ('𑜀', '𑜚'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '𑠫'),
+ ('𑢠', '𑣟'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧐'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧣'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨺', '𑨺'),
+ ('𑩐', '𑩐'),
+ ('𑩜', '𑪉'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰮'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶉'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻲'),
+ ('𑼂', '𑼂'),
+ ('𑼄', '𑼐'),
+ ('𑼒', '𑼳'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭀', '𖭃'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖽐'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓫'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞤀', '𞥃'),
+ ('𞥋', '𞥋'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const IDEOGRAPHIC: &'static [(char, char)] = &[
+ ('〆', '〇'),
+ ('〡', '〩'),
+ ('〸', '〺'),
+ ('㐀', '䶿'),
+ ('一', '鿿'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𛅰', '𛋻'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const JOIN_CONTROL: &'static [(char, char)] = &[('\u{200c}', '\u{200d}')];
+
+pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[
+ ('เ', 'ไ'),
+ ('ເ', 'ໄ'),
+ ('ᦵ', 'ᦷ'),
+ ('ᦺ', 'ᦺ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪹ'),
+ ('ꪻ', 'ꪼ'),
+];
+
+pub const LOWERCASE: &'static [(char, char)] = &[
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('ß', 'ö'),
+ ('ø', 'ÿ'),
+ ('ā', 'ā'),
+ ('ă', 'ă'),
+ ('ą', 'ą'),
+ ('ć', 'ć'),
+ ('ĉ', 'ĉ'),
+ ('ċ', 'ċ'),
+ ('č', 'č'),
+ ('ď', 'ď'),
+ ('đ', 'đ'),
+ ('ē', 'ē'),
+ ('ĕ', 'ĕ'),
+ ('ė', 'ė'),
+ ('ę', 'ę'),
+ ('ě', 'ě'),
+ ('ĝ', 'ĝ'),
+ ('ğ', 'ğ'),
+ ('ġ', 'ġ'),
+ ('ģ', 'ģ'),
+ ('ĥ', 'ĥ'),
+ ('ħ', 'ħ'),
+ ('ĩ', 'ĩ'),
+ ('ī', 'ī'),
+ ('ĭ', 'ĭ'),
+ ('į', 'į'),
+ ('ı', 'ı'),
+ ('ij', 'ij'),
+ ('ĵ', 'ĵ'),
+ ('ķ', 'ĸ'),
+ ('ĺ', 'ĺ'),
+ ('ļ', 'ļ'),
+ ('ľ', 'ľ'),
+ ('ŀ', 'ŀ'),
+ ('ł', 'ł'),
+ ('ń', 'ń'),
+ ('ņ', 'ņ'),
+ ('ň', 'ʼn'),
+ ('ŋ', 'ŋ'),
+ ('ō', 'ō'),
+ ('ŏ', 'ŏ'),
+ ('ő', 'ő'),
+ ('œ', 'œ'),
+ ('ŕ', 'ŕ'),
+ ('ŗ', 'ŗ'),
+ ('ř', 'ř'),
+ ('ś', 'ś'),
+ ('ŝ', 'ŝ'),
+ ('ş', 'ş'),
+ ('š', 'š'),
+ ('ţ', 'ţ'),
+ ('ť', 'ť'),
+ ('ŧ', 'ŧ'),
+ ('ũ', 'ũ'),
+ ('ū', 'ū'),
+ ('ŭ', 'ŭ'),
+ ('ů', 'ů'),
+ ('ű', 'ű'),
+ ('ų', 'ų'),
+ ('ŵ', 'ŵ'),
+ ('ŷ', 'ŷ'),
+ ('ź', 'ź'),
+ ('ż', 'ż'),
+ ('ž', 'ƀ'),
+ ('ƃ', 'ƃ'),
+ ('ƅ', 'ƅ'),
+ ('ƈ', 'ƈ'),
+ ('ƌ', 'ƍ'),
+ ('ƒ', 'ƒ'),
+ ('ƕ', 'ƕ'),
+ ('ƙ', 'ƛ'),
+ ('ƞ', 'ƞ'),
+ ('ơ', 'ơ'),
+ ('ƣ', 'ƣ'),
+ ('ƥ', 'ƥ'),
+ ('ƨ', 'ƨ'),
+ ('ƪ', 'ƫ'),
+ ('ƭ', 'ƭ'),
+ ('ư', 'ư'),
+ ('ƴ', 'ƴ'),
+ ('ƶ', 'ƶ'),
+ ('ƹ', 'ƺ'),
+ ('ƽ', 'ƿ'),
+ ('dž', 'dž'),
+ ('lj', 'lj'),
+ ('nj', 'nj'),
+ ('ǎ', 'ǎ'),
+ ('ǐ', 'ǐ'),
+ ('ǒ', 'ǒ'),
+ ('ǔ', 'ǔ'),
+ ('ǖ', 'ǖ'),
+ ('ǘ', 'ǘ'),
+ ('ǚ', 'ǚ'),
+ ('ǜ', 'ǝ'),
+ ('ǟ', 'ǟ'),
+ ('ǡ', 'ǡ'),
+ ('ǣ', 'ǣ'),
+ ('ǥ', 'ǥ'),
+ ('ǧ', 'ǧ'),
+ ('ǩ', 'ǩ'),
+ ('ǫ', 'ǫ'),
+ ('ǭ', 'ǭ'),
+ ('ǯ', 'ǰ'),
+ ('dz', 'dz'),
+ ('ǵ', 'ǵ'),
+ ('ǹ', 'ǹ'),
+ ('ǻ', 'ǻ'),
+ ('ǽ', 'ǽ'),
+ ('ǿ', 'ǿ'),
+ ('ȁ', 'ȁ'),
+ ('ȃ', 'ȃ'),
+ ('ȅ', 'ȅ'),
+ ('ȇ', 'ȇ'),
+ ('ȉ', 'ȉ'),
+ ('ȋ', 'ȋ'),
+ ('ȍ', 'ȍ'),
+ ('ȏ', 'ȏ'),
+ ('ȑ', 'ȑ'),
+ ('ȓ', 'ȓ'),
+ ('ȕ', 'ȕ'),
+ ('ȗ', 'ȗ'),
+ ('ș', 'ș'),
+ ('ț', 'ț'),
+ ('ȝ', 'ȝ'),
+ ('ȟ', 'ȟ'),
+ ('ȡ', 'ȡ'),
+ ('ȣ', 'ȣ'),
+ ('ȥ', 'ȥ'),
+ ('ȧ', 'ȧ'),
+ ('ȩ', 'ȩ'),
+ ('ȫ', 'ȫ'),
+ ('ȭ', 'ȭ'),
+ ('ȯ', 'ȯ'),
+ ('ȱ', 'ȱ'),
+ ('ȳ', 'ȹ'),
+ ('ȼ', 'ȼ'),
+ ('ȿ', 'ɀ'),
+ ('ɂ', 'ɂ'),
+ ('ɇ', 'ɇ'),
+ ('ɉ', 'ɉ'),
+ ('ɋ', 'ɋ'),
+ ('ɍ', 'ɍ'),
+ ('ɏ', 'ʓ'),
+ ('ʕ', 'ʸ'),
+ ('ˀ', 'ˁ'),
+ ('ˠ', 'ˤ'),
+ ('\u{345}', '\u{345}'),
+ ('ͱ', 'ͱ'),
+ ('ͳ', 'ͳ'),
+ ('ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('ΐ', 'ΐ'),
+ ('ά', 'ώ'),
+ ('ϐ', 'ϑ'),
+ ('ϕ', 'ϗ'),
+ ('ϙ', 'ϙ'),
+ ('ϛ', 'ϛ'),
+ ('ϝ', 'ϝ'),
+ ('ϟ', 'ϟ'),
+ ('ϡ', 'ϡ'),
+ ('ϣ', 'ϣ'),
+ ('ϥ', 'ϥ'),
+ ('ϧ', 'ϧ'),
+ ('ϩ', 'ϩ'),
+ ('ϫ', 'ϫ'),
+ ('ϭ', 'ϭ'),
+ ('ϯ', 'ϳ'),
+ ('ϵ', 'ϵ'),
+ ('ϸ', 'ϸ'),
+ ('ϻ', 'ϼ'),
+ ('а', 'џ'),
+ ('ѡ', 'ѡ'),
+ ('ѣ', 'ѣ'),
+ ('ѥ', 'ѥ'),
+ ('ѧ', 'ѧ'),
+ ('ѩ', 'ѩ'),
+ ('ѫ', 'ѫ'),
+ ('ѭ', 'ѭ'),
+ ('ѯ', 'ѯ'),
+ ('ѱ', 'ѱ'),
+ ('ѳ', 'ѳ'),
+ ('ѵ', 'ѵ'),
+ ('ѷ', 'ѷ'),
+ ('ѹ', 'ѹ'),
+ ('ѻ', 'ѻ'),
+ ('ѽ', 'ѽ'),
+ ('ѿ', 'ѿ'),
+ ('ҁ', 'ҁ'),
+ ('ҋ', 'ҋ'),
+ ('ҍ', 'ҍ'),
+ ('ҏ', 'ҏ'),
+ ('ґ', 'ґ'),
+ ('ғ', 'ғ'),
+ ('ҕ', 'ҕ'),
+ ('җ', 'җ'),
+ ('ҙ', 'ҙ'),
+ ('қ', 'қ'),
+ ('ҝ', 'ҝ'),
+ ('ҟ', 'ҟ'),
+ ('ҡ', 'ҡ'),
+ ('ң', 'ң'),
+ ('ҥ', 'ҥ'),
+ ('ҧ', 'ҧ'),
+ ('ҩ', 'ҩ'),
+ ('ҫ', 'ҫ'),
+ ('ҭ', 'ҭ'),
+ ('ү', 'ү'),
+ ('ұ', 'ұ'),
+ ('ҳ', 'ҳ'),
+ ('ҵ', 'ҵ'),
+ ('ҷ', 'ҷ'),
+ ('ҹ', 'ҹ'),
+ ('һ', 'һ'),
+ ('ҽ', 'ҽ'),
+ ('ҿ', 'ҿ'),
+ ('ӂ', 'ӂ'),
+ ('ӄ', 'ӄ'),
+ ('ӆ', 'ӆ'),
+ ('ӈ', 'ӈ'),
+ ('ӊ', 'ӊ'),
+ ('ӌ', 'ӌ'),
+ ('ӎ', 'ӏ'),
+ ('ӑ', 'ӑ'),
+ ('ӓ', 'ӓ'),
+ ('ӕ', 'ӕ'),
+ ('ӗ', 'ӗ'),
+ ('ә', 'ә'),
+ ('ӛ', 'ӛ'),
+ ('ӝ', 'ӝ'),
+ ('ӟ', 'ӟ'),
+ ('ӡ', 'ӡ'),
+ ('ӣ', 'ӣ'),
+ ('ӥ', 'ӥ'),
+ ('ӧ', 'ӧ'),
+ ('ө', 'ө'),
+ ('ӫ', 'ӫ'),
+ ('ӭ', 'ӭ'),
+ ('ӯ', 'ӯ'),
+ ('ӱ', 'ӱ'),
+ ('ӳ', 'ӳ'),
+ ('ӵ', 'ӵ'),
+ ('ӷ', 'ӷ'),
+ ('ӹ', 'ӹ'),
+ ('ӻ', 'ӻ'),
+ ('ӽ', 'ӽ'),
+ ('ӿ', 'ӿ'),
+ ('ԁ', 'ԁ'),
+ ('ԃ', 'ԃ'),
+ ('ԅ', 'ԅ'),
+ ('ԇ', 'ԇ'),
+ ('ԉ', 'ԉ'),
+ ('ԋ', 'ԋ'),
+ ('ԍ', 'ԍ'),
+ ('ԏ', 'ԏ'),
+ ('ԑ', 'ԑ'),
+ ('ԓ', 'ԓ'),
+ ('ԕ', 'ԕ'),
+ ('ԗ', 'ԗ'),
+ ('ԙ', 'ԙ'),
+ ('ԛ', 'ԛ'),
+ ('ԝ', 'ԝ'),
+ ('ԟ', 'ԟ'),
+ ('ԡ', 'ԡ'),
+ ('ԣ', 'ԣ'),
+ ('ԥ', 'ԥ'),
+ ('ԧ', 'ԧ'),
+ ('ԩ', 'ԩ'),
+ ('ԫ', 'ԫ'),
+ ('ԭ', 'ԭ'),
+ ('ԯ', 'ԯ'),
+ ('ՠ', 'ֈ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ჿ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᴀ', 'ᶿ'),
+ ('ḁ', 'ḁ'),
+ ('ḃ', 'ḃ'),
+ ('ḅ', 'ḅ'),
+ ('ḇ', 'ḇ'),
+ ('ḉ', 'ḉ'),
+ ('ḋ', 'ḋ'),
+ ('ḍ', 'ḍ'),
+ ('ḏ', 'ḏ'),
+ ('ḑ', 'ḑ'),
+ ('ḓ', 'ḓ'),
+ ('ḕ', 'ḕ'),
+ ('ḗ', 'ḗ'),
+ ('ḙ', 'ḙ'),
+ ('ḛ', 'ḛ'),
+ ('ḝ', 'ḝ'),
+ ('ḟ', 'ḟ'),
+ ('ḡ', 'ḡ'),
+ ('ḣ', 'ḣ'),
+ ('ḥ', 'ḥ'),
+ ('ḧ', 'ḧ'),
+ ('ḩ', 'ḩ'),
+ ('ḫ', 'ḫ'),
+ ('ḭ', 'ḭ'),
+ ('ḯ', 'ḯ'),
+ ('ḱ', 'ḱ'),
+ ('ḳ', 'ḳ'),
+ ('ḵ', 'ḵ'),
+ ('ḷ', 'ḷ'),
+ ('ḹ', 'ḹ'),
+ ('ḻ', 'ḻ'),
+ ('ḽ', 'ḽ'),
+ ('ḿ', 'ḿ'),
+ ('ṁ', 'ṁ'),
+ ('ṃ', 'ṃ'),
+ ('ṅ', 'ṅ'),
+ ('ṇ', 'ṇ'),
+ ('ṉ', 'ṉ'),
+ ('ṋ', 'ṋ'),
+ ('ṍ', 'ṍ'),
+ ('ṏ', 'ṏ'),
+ ('ṑ', 'ṑ'),
+ ('ṓ', 'ṓ'),
+ ('ṕ', 'ṕ'),
+ ('ṗ', 'ṗ'),
+ ('ṙ', 'ṙ'),
+ ('ṛ', 'ṛ'),
+ ('ṝ', 'ṝ'),
+ ('ṟ', 'ṟ'),
+ ('ṡ', 'ṡ'),
+ ('ṣ', 'ṣ'),
+ ('ṥ', 'ṥ'),
+ ('ṧ', 'ṧ'),
+ ('ṩ', 'ṩ'),
+ ('ṫ', 'ṫ'),
+ ('ṭ', 'ṭ'),
+ ('ṯ', 'ṯ'),
+ ('ṱ', 'ṱ'),
+ ('ṳ', 'ṳ'),
+ ('ṵ', 'ṵ'),
+ ('ṷ', 'ṷ'),
+ ('ṹ', 'ṹ'),
+ ('ṻ', 'ṻ'),
+ ('ṽ', 'ṽ'),
+ ('ṿ', 'ṿ'),
+ ('ẁ', 'ẁ'),
+ ('ẃ', 'ẃ'),
+ ('ẅ', 'ẅ'),
+ ('ẇ', 'ẇ'),
+ ('ẉ', 'ẉ'),
+ ('ẋ', 'ẋ'),
+ ('ẍ', 'ẍ'),
+ ('ẏ', 'ẏ'),
+ ('ẑ', 'ẑ'),
+ ('ẓ', 'ẓ'),
+ ('ẕ', 'ẝ'),
+ ('ẟ', 'ẟ'),
+ ('ạ', 'ạ'),
+ ('ả', 'ả'),
+ ('ấ', 'ấ'),
+ ('ầ', 'ầ'),
+ ('ẩ', 'ẩ'),
+ ('ẫ', 'ẫ'),
+ ('ậ', 'ậ'),
+ ('ắ', 'ắ'),
+ ('ằ', 'ằ'),
+ ('ẳ', 'ẳ'),
+ ('ẵ', 'ẵ'),
+ ('ặ', 'ặ'),
+ ('ẹ', 'ẹ'),
+ ('ẻ', 'ẻ'),
+ ('ẽ', 'ẽ'),
+ ('ế', 'ế'),
+ ('ề', 'ề'),
+ ('ể', 'ể'),
+ ('ễ', 'ễ'),
+ ('ệ', 'ệ'),
+ ('ỉ', 'ỉ'),
+ ('ị', 'ị'),
+ ('ọ', 'ọ'),
+ ('ỏ', 'ỏ'),
+ ('ố', 'ố'),
+ ('ồ', 'ồ'),
+ ('ổ', 'ổ'),
+ ('ỗ', 'ỗ'),
+ ('ộ', 'ộ'),
+ ('ớ', 'ớ'),
+ ('ờ', 'ờ'),
+ ('ở', 'ở'),
+ ('ỡ', 'ỡ'),
+ ('ợ', 'ợ'),
+ ('ụ', 'ụ'),
+ ('ủ', 'ủ'),
+ ('ứ', 'ứ'),
+ ('ừ', 'ừ'),
+ ('ử', 'ử'),
+ ('ữ', 'ữ'),
+ ('ự', 'ự'),
+ ('ỳ', 'ỳ'),
+ ('ỵ', 'ỵ'),
+ ('ỷ', 'ỷ'),
+ ('ỹ', 'ỹ'),
+ ('ỻ', 'ỻ'),
+ ('ỽ', 'ỽ'),
+ ('ỿ', 'ἇ'),
+ ('ἐ', 'ἕ'),
+ ('ἠ', 'ἧ'),
+ ('ἰ', 'ἷ'),
+ ('ὀ', 'ὅ'),
+ ('ὐ', 'ὗ'),
+ ('ὠ', 'ὧ'),
+ ('ὰ', 'ώ'),
+ ('ᾀ', 'ᾇ'),
+ ('ᾐ', 'ᾗ'),
+ ('ᾠ', 'ᾧ'),
+ ('ᾰ', 'ᾴ'),
+ ('ᾶ', 'ᾷ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῇ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'ῗ'),
+ ('ῠ', 'ῧ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῷ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℊ', 'ℊ'),
+ ('ℎ', 'ℏ'),
+ ('ℓ', 'ℓ'),
+ ('ℯ', 'ℯ'),
+ ('ℴ', 'ℴ'),
+ ('ℹ', 'ℹ'),
+ ('ℼ', 'ℽ'),
+ ('ⅆ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('ⅰ', 'ⅿ'),
+ ('ↄ', 'ↄ'),
+ ('ⓐ', 'ⓩ'),
+ ('ⰰ', 'ⱟ'),
+ ('ⱡ', 'ⱡ'),
+ ('ⱥ', 'ⱦ'),
+ ('ⱨ', 'ⱨ'),
+ ('ⱪ', 'ⱪ'),
+ ('ⱬ', 'ⱬ'),
+ ('ⱱ', 'ⱱ'),
+ ('ⱳ', 'ⱴ'),
+ ('ⱶ', 'ⱽ'),
+ ('ⲁ', 'ⲁ'),
+ ('ⲃ', 'ⲃ'),
+ ('ⲅ', 'ⲅ'),
+ ('ⲇ', 'ⲇ'),
+ ('ⲉ', 'ⲉ'),
+ ('ⲋ', 'ⲋ'),
+ ('ⲍ', 'ⲍ'),
+ ('ⲏ', 'ⲏ'),
+ ('ⲑ', 'ⲑ'),
+ ('ⲓ', 'ⲓ'),
+ ('ⲕ', 'ⲕ'),
+ ('ⲗ', 'ⲗ'),
+ ('ⲙ', 'ⲙ'),
+ ('ⲛ', 'ⲛ'),
+ ('ⲝ', 'ⲝ'),
+ ('ⲟ', 'ⲟ'),
+ ('ⲡ', 'ⲡ'),
+ ('ⲣ', 'ⲣ'),
+ ('ⲥ', 'ⲥ'),
+ ('ⲧ', 'ⲧ'),
+ ('ⲩ', 'ⲩ'),
+ ('ⲫ', 'ⲫ'),
+ ('ⲭ', 'ⲭ'),
+ ('ⲯ', 'ⲯ'),
+ ('ⲱ', 'ⲱ'),
+ ('ⲳ', 'ⲳ'),
+ ('ⲵ', 'ⲵ'),
+ ('ⲷ', 'ⲷ'),
+ ('ⲹ', 'ⲹ'),
+ ('ⲻ', 'ⲻ'),
+ ('ⲽ', 'ⲽ'),
+ ('ⲿ', 'ⲿ'),
+ ('ⳁ', 'ⳁ'),
+ ('ⳃ', 'ⳃ'),
+ ('ⳅ', 'ⳅ'),
+ ('ⳇ', 'ⳇ'),
+ ('ⳉ', 'ⳉ'),
+ ('ⳋ', 'ⳋ'),
+ ('ⳍ', 'ⳍ'),
+ ('ⳏ', 'ⳏ'),
+ ('ⳑ', 'ⳑ'),
+ ('ⳓ', 'ⳓ'),
+ ('ⳕ', 'ⳕ'),
+ ('ⳗ', 'ⳗ'),
+ ('ⳙ', 'ⳙ'),
+ ('ⳛ', 'ⳛ'),
+ ('ⳝ', 'ⳝ'),
+ ('ⳟ', 'ⳟ'),
+ ('ⳡ', 'ⳡ'),
+ ('ⳣ', 'ⳤ'),
+ ('ⳬ', 'ⳬ'),
+ ('ⳮ', 'ⳮ'),
+ ('ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ꙁ', 'ꙁ'),
+ ('ꙃ', 'ꙃ'),
+ ('ꙅ', 'ꙅ'),
+ ('ꙇ', 'ꙇ'),
+ ('ꙉ', 'ꙉ'),
+ ('ꙋ', 'ꙋ'),
+ ('ꙍ', 'ꙍ'),
+ ('ꙏ', 'ꙏ'),
+ ('ꙑ', 'ꙑ'),
+ ('ꙓ', 'ꙓ'),
+ ('ꙕ', 'ꙕ'),
+ ('ꙗ', 'ꙗ'),
+ ('ꙙ', 'ꙙ'),
+ ('ꙛ', 'ꙛ'),
+ ('ꙝ', 'ꙝ'),
+ ('ꙟ', 'ꙟ'),
+ ('ꙡ', 'ꙡ'),
+ ('ꙣ', 'ꙣ'),
+ ('ꙥ', 'ꙥ'),
+ ('ꙧ', 'ꙧ'),
+ ('ꙩ', 'ꙩ'),
+ ('ꙫ', 'ꙫ'),
+ ('ꙭ', 'ꙭ'),
+ ('ꚁ', 'ꚁ'),
+ ('ꚃ', 'ꚃ'),
+ ('ꚅ', 'ꚅ'),
+ ('ꚇ', 'ꚇ'),
+ ('ꚉ', 'ꚉ'),
+ ('ꚋ', 'ꚋ'),
+ ('ꚍ', 'ꚍ'),
+ ('ꚏ', 'ꚏ'),
+ ('ꚑ', 'ꚑ'),
+ ('ꚓ', 'ꚓ'),
+ ('ꚕ', 'ꚕ'),
+ ('ꚗ', 'ꚗ'),
+ ('ꚙ', 'ꚙ'),
+ ('ꚛ', 'ꚝ'),
+ ('ꜣ', 'ꜣ'),
+ ('ꜥ', 'ꜥ'),
+ ('ꜧ', 'ꜧ'),
+ ('ꜩ', 'ꜩ'),
+ ('ꜫ', 'ꜫ'),
+ ('ꜭ', 'ꜭ'),
+ ('ꜯ', 'ꜱ'),
+ ('ꜳ', 'ꜳ'),
+ ('ꜵ', 'ꜵ'),
+ ('ꜷ', 'ꜷ'),
+ ('ꜹ', 'ꜹ'),
+ ('ꜻ', 'ꜻ'),
+ ('ꜽ', 'ꜽ'),
+ ('ꜿ', 'ꜿ'),
+ ('ꝁ', 'ꝁ'),
+ ('ꝃ', 'ꝃ'),
+ ('ꝅ', 'ꝅ'),
+ ('ꝇ', 'ꝇ'),
+ ('ꝉ', 'ꝉ'),
+ ('ꝋ', 'ꝋ'),
+ ('ꝍ', 'ꝍ'),
+ ('ꝏ', 'ꝏ'),
+ ('ꝑ', 'ꝑ'),
+ ('ꝓ', 'ꝓ'),
+ ('ꝕ', 'ꝕ'),
+ ('ꝗ', 'ꝗ'),
+ ('ꝙ', 'ꝙ'),
+ ('ꝛ', 'ꝛ'),
+ ('ꝝ', 'ꝝ'),
+ ('ꝟ', 'ꝟ'),
+ ('ꝡ', 'ꝡ'),
+ ('ꝣ', 'ꝣ'),
+ ('ꝥ', 'ꝥ'),
+ ('ꝧ', 'ꝧ'),
+ ('ꝩ', 'ꝩ'),
+ ('ꝫ', 'ꝫ'),
+ ('ꝭ', 'ꝭ'),
+ ('ꝯ', 'ꝸ'),
+ ('ꝺ', 'ꝺ'),
+ ('ꝼ', 'ꝼ'),
+ ('ꝿ', 'ꝿ'),
+ ('ꞁ', 'ꞁ'),
+ ('ꞃ', 'ꞃ'),
+ ('ꞅ', 'ꞅ'),
+ ('ꞇ', 'ꞇ'),
+ ('ꞌ', 'ꞌ'),
+ ('ꞎ', 'ꞎ'),
+ ('ꞑ', 'ꞑ'),
+ ('ꞓ', 'ꞕ'),
+ ('ꞗ', 'ꞗ'),
+ ('ꞙ', 'ꞙ'),
+ ('ꞛ', 'ꞛ'),
+ ('ꞝ', 'ꞝ'),
+ ('ꞟ', 'ꞟ'),
+ ('ꞡ', 'ꞡ'),
+ ('ꞣ', 'ꞣ'),
+ ('ꞥ', 'ꞥ'),
+ ('ꞧ', 'ꞧ'),
+ ('ꞩ', 'ꞩ'),
+ ('ꞯ', 'ꞯ'),
+ ('ꞵ', 'ꞵ'),
+ ('ꞷ', 'ꞷ'),
+ ('ꞹ', 'ꞹ'),
+ ('ꞻ', 'ꞻ'),
+ ('ꞽ', 'ꞽ'),
+ ('ꞿ', 'ꞿ'),
+ ('ꟁ', 'ꟁ'),
+ ('ꟃ', 'ꟃ'),
+ ('ꟈ', 'ꟈ'),
+ ('ꟊ', 'ꟊ'),
+ ('ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟕ'),
+ ('ꟗ', 'ꟗ'),
+ ('ꟙ', 'ꟙ'),
+ ('ꟲ', 'ꟴ'),
+ ('ꟶ', 'ꟶ'),
+ ('ꟸ', 'ꟺ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('a', 'z'),
+ ('𐐨', '𐑏'),
+ ('𐓘', '𐓻'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐞀', '𐞀'),
+ ('𐞃', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐳀', '𐳲'),
+ ('𑣀', '𑣟'),
+ ('𖹠', '𖹿'),
+ ('𝐚', '𝐳'),
+ ('𝑎', '𝑔'),
+ ('𝑖', '𝑧'),
+ ('𝒂', '𝒛'),
+ ('𝒶', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝓏'),
+ ('𝓪', '𝔃'),
+ ('𝔞', '𝔷'),
+ ('𝕒', '𝕫'),
+ ('𝖆', '𝖟'),
+ ('𝖺', '𝗓'),
+ ('𝗮', '𝘇'),
+ ('𝘢', '𝘻'),
+ ('𝙖', '𝙯'),
+ ('𝚊', '𝚥'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛡'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜛'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝕'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞏'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟉'),
+ ('𝟋', '𝟋'),
+ ('𝼀', '𝼉'),
+ ('𝼋', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞤢', '𞥃'),
+];
+
+pub const MATH: &'static [(char, char)] = &[
+ ('+', '+'),
+ ('<', '>'),
+ ('^', '^'),
+ ('|', '|'),
+ ('~', '~'),
+ ('¬', '¬'),
+ ('±', '±'),
+ ('×', '×'),
+ ('÷', '÷'),
+ ('ϐ', 'ϒ'),
+ ('ϕ', 'ϕ'),
+ ('ϰ', 'ϱ'),
+ ('ϴ', '϶'),
+ ('؆', '؈'),
+ ('‖', '‖'),
+ ('′', '‴'),
+ ('⁀', '⁀'),
+ ('⁄', '⁄'),
+ ('⁒', '⁒'),
+ ('\u{2061}', '\u{2064}'),
+ ('⁺', '⁾'),
+ ('₊', '₎'),
+ ('\u{20d0}', '\u{20dc}'),
+ ('\u{20e1}', '\u{20e1}'),
+ ('\u{20e5}', '\u{20e6}'),
+ ('\u{20eb}', '\u{20ef}'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('℘', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('ℨ', '℩'),
+ ('ℬ', 'ℭ'),
+ ('ℯ', 'ℱ'),
+ ('ℳ', 'ℸ'),
+ ('ℼ', 'ⅉ'),
+ ('⅋', '⅋'),
+ ('←', '↧'),
+ ('↩', '↮'),
+ ('↰', '↱'),
+ ('↶', '↷'),
+ ('↼', '⇛'),
+ ('⇝', '⇝'),
+ ('⇤', '⇥'),
+ ('⇴', '⋿'),
+ ('⌈', '⌋'),
+ ('⌠', '⌡'),
+ ('⍼', '⍼'),
+ ('⎛', '⎵'),
+ ('⎷', '⎷'),
+ ('⏐', '⏐'),
+ ('⏜', '⏢'),
+ ('■', '□'),
+ ('▮', '▷'),
+ ('▼', '◁'),
+ ('◆', '◇'),
+ ('◊', '○'),
+ ('●', '◓'),
+ ('◢', '◢'),
+ ('◤', '◤'),
+ ('◧', '◬'),
+ ('◸', '◿'),
+ ('★', '☆'),
+ ('♀', '♀'),
+ ('♂', '♂'),
+ ('♠', '♣'),
+ ('♭', '♯'),
+ ('⟀', '⟿'),
+ ('⤀', '⫿'),
+ ('⬰', '⭄'),
+ ('⭇', '⭌'),
+ ('﬩', '﬩'),
+ ('﹡', '﹦'),
+ ('﹨', '﹨'),
+ ('+', '+'),
+ ('<', '>'),
+ ('\', '\'),
+ ('^', '^'),
+ ('|', '|'),
+ ('~', '~'),
+ ('¬', '¬'),
+ ('←', '↓'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𞻰', '𞻱'),
+];
+
+pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[
+ ('\u{fdd0}', '\u{fdef}'),
+ ('\u{fffe}', '\u{ffff}'),
+ ('\u{1fffe}', '\u{1ffff}'),
+ ('\u{2fffe}', '\u{2ffff}'),
+ ('\u{3fffe}', '\u{3ffff}'),
+ ('\u{4fffe}', '\u{4ffff}'),
+ ('\u{5fffe}', '\u{5ffff}'),
+ ('\u{6fffe}', '\u{6ffff}'),
+ ('\u{7fffe}', '\u{7ffff}'),
+ ('\u{8fffe}', '\u{8ffff}'),
+ ('\u{9fffe}', '\u{9ffff}'),
+ ('\u{afffe}', '\u{affff}'),
+ ('\u{bfffe}', '\u{bffff}'),
+ ('\u{cfffe}', '\u{cffff}'),
+ ('\u{dfffe}', '\u{dffff}'),
+ ('\u{efffe}', '\u{effff}'),
+ ('\u{ffffe}', '\u{fffff}'),
+ ('\u{10fffe}', '\u{10ffff}'),
+];
+
+pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[
+ ('\u{345}', '\u{345}'),
+ ('\u{5b0}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{657}'),
+ ('\u{659}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6e1}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ed}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{73f}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{816}', '\u{817}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82c}'),
+ ('\u{8d4}', '\u{8df}'),
+ ('\u{8e3}', '\u{8e9}'),
+ ('\u{8f0}', 'ः'),
+ ('\u{93a}', 'ऻ'),
+ ('ा', 'ौ'),
+ ('ॎ', 'ॏ'),
+ ('\u{955}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', 'ঃ'),
+ ('\u{9be}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৌ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{a01}', 'ਃ'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4c}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('ા', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', 'ૌ'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{afc}'),
+ ('\u{b01}', 'ଃ'),
+ ('\u{b3e}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', 'ୌ'),
+ ('\u{b56}', '\u{b57}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', 'ௌ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', '\u{c04}'),
+ ('\u{c3e}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4c}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', 'ಃ'),
+ ('ಾ', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccc}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('ೳ', 'ೳ'),
+ ('\u{d00}', 'ഃ'),
+ ('\u{d3e}', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൌ'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', 'ඃ'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', 'ෳ'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e4d}', '\u{e4d}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{eb9}'),
+ ('\u{ebb}', '\u{ebc}'),
+ ('\u{ecd}', '\u{ecd}'),
+ ('\u{f71}', '\u{f83}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('ါ', '\u{1036}'),
+ ('း', 'း'),
+ ('ျ', '\u{103e}'),
+ ('ၖ', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('ၢ', 'ၤ'),
+ ('ၧ', 'ၭ'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{108d}'),
+ ('ႏ', 'ႏ'),
+ ('ႚ', '\u{109d}'),
+ ('\u{1712}', '\u{1713}'),
+ ('\u{1732}', '\u{1733}'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('ា', 'ៈ'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', 'ᤸ'),
+ ('\u{1a17}', '\u{1a1b}'),
+ ('ᩕ', '\u{1a5e}'),
+ ('ᩡ', '\u{1a74}'),
+ ('\u{1abf}', '\u{1ac0}'),
+ ('\u{1acc}', '\u{1ace}'),
+ ('\u{1b00}', 'ᬄ'),
+ ('\u{1b35}', 'ᭃ'),
+ ('\u{1b80}', 'ᮂ'),
+ ('ᮡ', '\u{1ba9}'),
+ ('\u{1bac}', '\u{1bad}'),
+ ('ᯧ', '\u{1bf1}'),
+ ('ᰤ', '\u{1c36}'),
+ ('\u{1de7}', '\u{1df4}'),
+ ('Ⓐ', 'ⓩ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{a674}', '\u{a67b}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('ꠣ', 'ꠧ'),
+ ('ꢀ', 'ꢁ'),
+ ('ꢴ', 'ꣃ'),
+ ('\u{a8c5}', '\u{a8c5}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92a}'),
+ ('\u{a947}', 'ꥒ'),
+ ('\u{a980}', 'ꦃ'),
+ ('ꦴ', 'ꦿ'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', 'ꩍ'),
+ ('ꩻ', 'ꩽ'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabe}'),
+ ('ꫫ', 'ꫯ'),
+ ('ꫵ', 'ꫵ'),
+ ('ꯣ', 'ꯪ'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('𑀀', '𑀂'),
+ ('\u{11038}', '\u{11045}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{11080}', '𑂂'),
+ ('𑂰', '𑂸'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{11132}'),
+ ('𑅅', '𑅆'),
+ ('\u{11180}', '𑆂'),
+ ('𑆳', '𑆿'),
+ ('𑇎', '\u{111cf}'),
+ ('𑈬', '\u{11234}'),
+ ('\u{11237}', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112e8}'),
+ ('\u{11300}', '𑌃'),
+ ('\u{1133e}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍌'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍢', '𑍣'),
+ ('𑐵', '𑑁'),
+ ('\u{11443}', '𑑅'),
+ ('\u{114b0}', '𑓁'),
+ ('\u{115af}', '\u{115b5}'),
+ ('𑖸', '𑖾'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('𑘰', '𑘾'),
+ ('\u{11640}', '\u{11640}'),
+ ('\u{116ab}', '\u{116b5}'),
+ ('\u{1171d}', '\u{1172a}'),
+ ('𑠬', '𑠸'),
+ ('\u{11930}', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{1193c}'),
+ ('𑥀', '𑥀'),
+ ('𑥂', '𑥂'),
+ ('𑧑', '\u{119d7}'),
+ ('\u{119da}', '𑧟'),
+ ('𑧤', '𑧤'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a35}', '𑨹'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a51}', '\u{11a5b}'),
+ ('\u{11a8a}', '𑪗'),
+ ('𑰯', '\u{11c36}'),
+ ('\u{11c38}', '𑰾'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d41}'),
+ ('\u{11d43}', '\u{11d43}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('𑶊', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶖'),
+ ('\u{11ef3}', '𑻶'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('𑼃', '𑼃'),
+ ('𑼴', '\u{11f3a}'),
+ ('𑼾', '\u{11f40}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('𖽑', '𖾇'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('𖿰', '𖿱'),
+ ('\u{1bc9e}', '\u{1bc9e}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e947}', '\u{1e947}'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+];
+
+pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[
+ ('\u{34f}', '\u{34f}'),
+ ('ᅟ', 'ᅠ'),
+ ('\u{17b4}', '\u{17b5}'),
+ ('\u{2065}', '\u{2065}'),
+ ('ㅤ', 'ㅤ'),
+ ('ᅠ', 'ᅠ'),
+ ('\u{fff0}', '\u{fff8}'),
+ ('\u{e0000}', '\u{e0000}'),
+ ('\u{e0002}', '\u{e001f}'),
+ ('\u{e0080}', '\u{e00ff}'),
+ ('\u{e01f0}', '\u{e0fff}'),
+];
+
+pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[
+ ('\u{9be}', '\u{9be}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{b3e}', '\u{b3e}'),
+ ('\u{b57}', '\u{b57}'),
+ ('\u{bbe}', '\u{bbe}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{cc2}', '\u{cc2}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{d3e}', '\u{d3e}'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{dcf}', '\u{dcf}'),
+ ('\u{ddf}', '\u{ddf}'),
+ ('\u{1b35}', '\u{1b35}'),
+ ('\u{200c}', '\u{200c}'),
+ ('\u{302e}', '\u{302f}'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('\u{1133e}', '\u{1133e}'),
+ ('\u{11357}', '\u{11357}'),
+ ('\u{114b0}', '\u{114b0}'),
+ ('\u{114bd}', '\u{114bd}'),
+ ('\u{115af}', '\u{115af}'),
+ ('\u{11930}', '\u{11930}'),
+ ('\u{1d165}', '\u{1d165}'),
+ ('\u{1d16e}', '\u{1d172}'),
+ ('\u{e0020}', '\u{e007f}'),
+];
+
+pub const OTHER_ID_CONTINUE: &'static [(char, char)] =
+ &[('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚')];
+
+pub const OTHER_ID_START: &'static [(char, char)] =
+ &[('\u{1885}', '\u{1886}'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜')];
+
+pub const OTHER_LOWERCASE: &'static [(char, char)] = &[
+ ('ª', 'ª'),
+ ('º', 'º'),
+ ('ʰ', 'ʸ'),
+ ('ˀ', 'ˁ'),
+ ('ˠ', 'ˤ'),
+ ('\u{345}', '\u{345}'),
+ ('ͺ', 'ͺ'),
+ ('ჼ', 'ჼ'),
+ ('ᴬ', 'ᵪ'),
+ ('ᵸ', 'ᵸ'),
+ ('ᶛ', 'ᶿ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ⅰ', 'ⅿ'),
+ ('ⓐ', 'ⓩ'),
+ ('ⱼ', 'ⱽ'),
+ ('ꚜ', 'ꚝ'),
+ ('ꝰ', 'ꝰ'),
+ ('ꟲ', 'ꟴ'),
+ ('ꟸ', 'ꟹ'),
+ ('ꭜ', 'ꭟ'),
+ ('ꭩ', 'ꭩ'),
+ ('𐞀', '𐞀'),
+ ('𐞃', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𞀰', '𞁭'),
+];
+
+pub const OTHER_MATH: &'static [(char, char)] = &[
+ ('^', '^'),
+ ('ϐ', 'ϒ'),
+ ('ϕ', 'ϕ'),
+ ('ϰ', 'ϱ'),
+ ('ϴ', 'ϵ'),
+ ('‖', '‖'),
+ ('′', '‴'),
+ ('⁀', '⁀'),
+ ('\u{2061}', '\u{2064}'),
+ ('⁽', '⁾'),
+ ('₍', '₎'),
+ ('\u{20d0}', '\u{20dc}'),
+ ('\u{20e1}', '\u{20e1}'),
+ ('\u{20e5}', '\u{20e6}'),
+ ('\u{20eb}', '\u{20ef}'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('ℨ', '℩'),
+ ('ℬ', 'ℭ'),
+ ('ℯ', 'ℱ'),
+ ('ℳ', 'ℸ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('↕', '↙'),
+ ('↜', '↟'),
+ ('↡', '↢'),
+ ('↤', '↥'),
+ ('↧', '↧'),
+ ('↩', '↭'),
+ ('↰', '↱'),
+ ('↶', '↷'),
+ ('↼', '⇍'),
+ ('⇐', '⇑'),
+ ('⇓', '⇓'),
+ ('⇕', '⇛'),
+ ('⇝', '⇝'),
+ ('⇤', '⇥'),
+ ('⌈', '⌋'),
+ ('⎴', '⎵'),
+ ('⎷', '⎷'),
+ ('⏐', '⏐'),
+ ('⏢', '⏢'),
+ ('■', '□'),
+ ('▮', '▶'),
+ ('▼', '◀'),
+ ('◆', '◇'),
+ ('◊', '○'),
+ ('●', '◓'),
+ ('◢', '◢'),
+ ('◤', '◤'),
+ ('◧', '◬'),
+ ('★', '☆'),
+ ('♀', '♀'),
+ ('♂', '♂'),
+ ('♠', '♣'),
+ ('♭', '♮'),
+ ('⟅', '⟆'),
+ ('⟦', '⟯'),
+ ('⦃', '⦘'),
+ ('⧘', '⧛'),
+ ('⧼', '⧽'),
+ ('﹡', '﹡'),
+ ('﹣', '﹣'),
+ ('﹨', '﹨'),
+ ('\', '\'),
+ ('^', '^'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+];
+
+pub const OTHER_UPPERCASE: &'static [(char, char)] =
+ &[('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉')];
+
+pub const PATTERN_SYNTAX: &'static [(char, char)] = &[
+ ('!', '/'),
+ (':', '@'),
+ ('[', '^'),
+ ('`', '`'),
+ ('{', '~'),
+ ('¡', '§'),
+ ('©', '©'),
+ ('«', '¬'),
+ ('®', '®'),
+ ('°', '±'),
+ ('¶', '¶'),
+ ('»', '»'),
+ ('¿', '¿'),
+ ('×', '×'),
+ ('÷', '÷'),
+ ('‐', '‧'),
+ ('‰', '‾'),
+ ('⁁', '⁓'),
+ ('⁕', '⁞'),
+ ('←', '\u{245f}'),
+ ('─', '❵'),
+ ('➔', '⯿'),
+ ('⸀', '\u{2e7f}'),
+ ('、', '〃'),
+ ('〈', '〠'),
+ ('〰', '〰'),
+ ('﴾', '﴿'),
+ ('﹅', '﹆'),
+];
+
+pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[
+ ('\t', '\r'),
+ (' ', ' '),
+ ('\u{85}', '\u{85}'),
+ ('\u{200e}', '\u{200f}'),
+ ('\u{2028}', '\u{2029}'),
+];
+
+pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[
+ ('\u{600}', '\u{605}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{70f}', '\u{70f}'),
+ ('\u{890}', '\u{891}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110cd}', '\u{110cd}'),
+];
+
+pub const QUOTATION_MARK: &'static [(char, char)] = &[
+ ('"', '"'),
+ ('\'', '\''),
+ ('«', '«'),
+ ('»', '»'),
+ ('‘', '‟'),
+ ('‹', '›'),
+ ('⹂', '⹂'),
+ ('「', '』'),
+ ('〝', '〟'),
+ ('﹁', '﹄'),
+ ('"', '"'),
+ (''', '''),
+ ('「', '」'),
+];
+
+pub const RADICAL: &'static [(char, char)] =
+ &[('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕')];
+
+pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')];
+
+pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[
+ ('!', '!'),
+ ('.', '.'),
+ ('?', '?'),
+ ('։', '։'),
+ ('؝', '؟'),
+ ('۔', '۔'),
+ ('܀', '܂'),
+ ('߹', '߹'),
+ ('࠷', '࠷'),
+ ('࠹', '࠹'),
+ ('࠽', '࠾'),
+ ('।', '॥'),
+ ('၊', '။'),
+ ('።', '።'),
+ ('፧', '፨'),
+ ('᙮', '᙮'),
+ ('᜵', '᜶'),
+ ('᠃', '᠃'),
+ ('᠉', '᠉'),
+ ('᥄', '᥅'),
+ ('᪨', '᪫'),
+ ('᭚', '᭛'),
+ ('᭞', '᭟'),
+ ('᭽', '᭾'),
+ ('᰻', '᰼'),
+ ('᱾', '᱿'),
+ ('‼', '‽'),
+ ('⁇', '⁉'),
+ ('⸮', '⸮'),
+ ('⸼', '⸼'),
+ ('⹓', '⹔'),
+ ('。', '。'),
+ ('꓿', '꓿'),
+ ('꘎', '꘏'),
+ ('꛳', '꛳'),
+ ('꛷', '꛷'),
+ ('꡶', '꡷'),
+ ('꣎', '꣏'),
+ ('꤯', '꤯'),
+ ('꧈', '꧉'),
+ ('꩝', '꩟'),
+ ('꫰', '꫱'),
+ ('꯫', '꯫'),
+ ('﹒', '﹒'),
+ ('﹖', '﹗'),
+ ('!', '!'),
+ ('.', '.'),
+ ('?', '?'),
+ ('。', '。'),
+ ('𐩖', '𐩗'),
+ ('𐽕', '𐽙'),
+ ('𐾆', '𐾉'),
+ ('𑁇', '𑁈'),
+ ('𑂾', '𑃁'),
+ ('𑅁', '𑅃'),
+ ('𑇅', '𑇆'),
+ ('𑇍', '𑇍'),
+ ('𑇞', '𑇟'),
+ ('𑈸', '𑈹'),
+ ('𑈻', '𑈼'),
+ ('𑊩', '𑊩'),
+ ('𑑋', '𑑌'),
+ ('𑗂', '𑗃'),
+ ('𑗉', '𑗗'),
+ ('𑙁', '𑙂'),
+ ('𑜼', '𑜾'),
+ ('𑥄', '𑥄'),
+ ('𑥆', '𑥆'),
+ ('𑩂', '𑩃'),
+ ('𑪛', '𑪜'),
+ ('𑱁', '𑱂'),
+ ('𑻷', '𑻸'),
+ ('𑽃', '𑽄'),
+ ('𖩮', '𖩯'),
+ ('𖫵', '𖫵'),
+ ('𖬷', '𖬸'),
+ ('𖭄', '𖭄'),
+ ('𖺘', '𖺘'),
+ ('𛲟', '𛲟'),
+ ('𝪈', '𝪈'),
+];
+
+pub const SOFT_DOTTED: &'static [(char, char)] = &[
+ ('i', 'j'),
+ ('į', 'į'),
+ ('ɉ', 'ɉ'),
+ ('ɨ', 'ɨ'),
+ ('ʝ', 'ʝ'),
+ ('ʲ', 'ʲ'),
+ ('ϳ', 'ϳ'),
+ ('і', 'і'),
+ ('ј', 'ј'),
+ ('ᵢ', 'ᵢ'),
+ ('ᶖ', 'ᶖ'),
+ ('ᶤ', 'ᶤ'),
+ ('ᶨ', 'ᶨ'),
+ ('ḭ', 'ḭ'),
+ ('ị', 'ị'),
+ ('ⁱ', 'ⁱ'),
+ ('ⅈ', 'ⅉ'),
+ ('ⱼ', 'ⱼ'),
+ ('𝐢', '𝐣'),
+ ('𝑖', '𝑗'),
+ ('𝒊', '𝒋'),
+ ('𝒾', '𝒿'),
+ ('𝓲', '𝓳'),
+ ('𝔦', '𝔧'),
+ ('𝕚', '𝕛'),
+ ('𝖎', '𝖏'),
+ ('𝗂', '𝗃'),
+ ('𝗶', '𝗷'),
+ ('𝘪', '𝘫'),
+ ('𝙞', '𝙟'),
+ ('𝚒', '𝚓'),
+ ('𝼚', '𝼚'),
+ ('𞁌', '𞁍'),
+ ('𞁨', '𞁨'),
+];
+
+pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[
+ ('!', '!'),
+ (',', ','),
+ ('.', '.'),
+ (':', ';'),
+ ('?', '?'),
+ (';', ';'),
+ ('·', '·'),
+ ('։', '։'),
+ ('׃', '׃'),
+ ('،', '،'),
+ ('؛', '؛'),
+ ('؝', '؟'),
+ ('۔', '۔'),
+ ('܀', '܊'),
+ ('܌', '܌'),
+ ('߸', '߹'),
+ ('࠰', '࠾'),
+ ('࡞', '࡞'),
+ ('।', '॥'),
+ ('๚', '๛'),
+ ('༈', '༈'),
+ ('།', '༒'),
+ ('၊', '။'),
+ ('፡', '፨'),
+ ('᙮', '᙮'),
+ ('᛫', '᛭'),
+ ('᜵', '᜶'),
+ ('។', '៖'),
+ ('៚', '៚'),
+ ('᠂', '᠅'),
+ ('᠈', '᠉'),
+ ('᥄', '᥅'),
+ ('᪨', '᪫'),
+ ('᭚', '᭛'),
+ ('᭝', '᭟'),
+ ('᭽', '᭾'),
+ ('᰻', '᰿'),
+ ('᱾', '᱿'),
+ ('‼', '‽'),
+ ('⁇', '⁉'),
+ ('⸮', '⸮'),
+ ('⸼', '⸼'),
+ ('⹁', '⹁'),
+ ('⹌', '⹌'),
+ ('⹎', '⹏'),
+ ('⹓', '⹔'),
+ ('、', '。'),
+ ('꓾', '꓿'),
+ ('꘍', '꘏'),
+ ('꛳', '꛷'),
+ ('꡶', '꡷'),
+ ('꣎', '꣏'),
+ ('꤯', '꤯'),
+ ('꧇', '꧉'),
+ ('꩝', '꩟'),
+ ('꫟', '꫟'),
+ ('꫰', '꫱'),
+ ('꯫', '꯫'),
+ ('﹐', '﹒'),
+ ('﹔', '﹗'),
+ ('!', '!'),
+ (',', ','),
+ ('.', '.'),
+ (':', ';'),
+ ('?', '?'),
+ ('。', '。'),
+ ('、', '、'),
+ ('𐎟', '𐎟'),
+ ('𐏐', '𐏐'),
+ ('𐡗', '𐡗'),
+ ('𐤟', '𐤟'),
+ ('𐩖', '𐩗'),
+ ('𐫰', '𐫵'),
+ ('𐬺', '𐬿'),
+ ('𐮙', '𐮜'),
+ ('𐽕', '𐽙'),
+ ('𐾆', '𐾉'),
+ ('𑁇', '𑁍'),
+ ('𑂾', '𑃁'),
+ ('𑅁', '𑅃'),
+ ('𑇅', '𑇆'),
+ ('𑇍', '𑇍'),
+ ('𑇞', '𑇟'),
+ ('𑈸', '𑈼'),
+ ('𑊩', '𑊩'),
+ ('𑑋', '𑑍'),
+ ('𑑚', '𑑛'),
+ ('𑗂', '𑗅'),
+ ('𑗉', '𑗗'),
+ ('𑙁', '𑙂'),
+ ('𑜼', '𑜾'),
+ ('𑥄', '𑥄'),
+ ('𑥆', '𑥆'),
+ ('𑩂', '𑩃'),
+ ('𑪛', '𑪜'),
+ ('𑪡', '𑪢'),
+ ('𑱁', '𑱃'),
+ ('𑱱', '𑱱'),
+ ('𑻷', '𑻸'),
+ ('𑽃', '𑽄'),
+ ('𒑰', '𒑴'),
+ ('𖩮', '𖩯'),
+ ('𖫵', '𖫵'),
+ ('𖬷', '𖬹'),
+ ('𖭄', '𖭄'),
+ ('𖺗', '𖺘'),
+ ('𛲟', '𛲟'),
+ ('𝪇', '𝪊'),
+];
+
+pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[
+ ('㐀', '䶿'),
+ ('一', '鿿'),
+ ('﨎', '﨏'),
+ ('﨑', '﨑'),
+ ('﨓', '﨔'),
+ ('﨟', '﨟'),
+ ('﨡', '﨡'),
+ ('﨣', '﨤'),
+ ('﨧', '﨩'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const UPPERCASE: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('À', 'Ö'),
+ ('Ø', 'Þ'),
+ ('Ā', 'Ā'),
+ ('Ă', 'Ă'),
+ ('Ą', 'Ą'),
+ ('Ć', 'Ć'),
+ ('Ĉ', 'Ĉ'),
+ ('Ċ', 'Ċ'),
+ ('Č', 'Č'),
+ ('Ď', 'Ď'),
+ ('Đ', 'Đ'),
+ ('Ē', 'Ē'),
+ ('Ĕ', 'Ĕ'),
+ ('Ė', 'Ė'),
+ ('Ę', 'Ę'),
+ ('Ě', 'Ě'),
+ ('Ĝ', 'Ĝ'),
+ ('Ğ', 'Ğ'),
+ ('Ġ', 'Ġ'),
+ ('Ģ', 'Ģ'),
+ ('Ĥ', 'Ĥ'),
+ ('Ħ', 'Ħ'),
+ ('Ĩ', 'Ĩ'),
+ ('Ī', 'Ī'),
+ ('Ĭ', 'Ĭ'),
+ ('Į', 'Į'),
+ ('İ', 'İ'),
+ ('IJ', 'IJ'),
+ ('Ĵ', 'Ĵ'),
+ ('Ķ', 'Ķ'),
+ ('Ĺ', 'Ĺ'),
+ ('Ļ', 'Ļ'),
+ ('Ľ', 'Ľ'),
+ ('Ŀ', 'Ŀ'),
+ ('Ł', 'Ł'),
+ ('Ń', 'Ń'),
+ ('Ņ', 'Ņ'),
+ ('Ň', 'Ň'),
+ ('Ŋ', 'Ŋ'),
+ ('Ō', 'Ō'),
+ ('Ŏ', 'Ŏ'),
+ ('Ő', 'Ő'),
+ ('Œ', 'Œ'),
+ ('Ŕ', 'Ŕ'),
+ ('Ŗ', 'Ŗ'),
+ ('Ř', 'Ř'),
+ ('Ś', 'Ś'),
+ ('Ŝ', 'Ŝ'),
+ ('Ş', 'Ş'),
+ ('Š', 'Š'),
+ ('Ţ', 'Ţ'),
+ ('Ť', 'Ť'),
+ ('Ŧ', 'Ŧ'),
+ ('Ũ', 'Ũ'),
+ ('Ū', 'Ū'),
+ ('Ŭ', 'Ŭ'),
+ ('Ů', 'Ů'),
+ ('Ű', 'Ű'),
+ ('Ų', 'Ų'),
+ ('Ŵ', 'Ŵ'),
+ ('Ŷ', 'Ŷ'),
+ ('Ÿ', 'Ź'),
+ ('Ż', 'Ż'),
+ ('Ž', 'Ž'),
+ ('Ɓ', 'Ƃ'),
+ ('Ƅ', 'Ƅ'),
+ ('Ɔ', 'Ƈ'),
+ ('Ɖ', 'Ƌ'),
+ ('Ǝ', 'Ƒ'),
+ ('Ɠ', 'Ɣ'),
+ ('Ɩ', 'Ƙ'),
+ ('Ɯ', 'Ɲ'),
+ ('Ɵ', 'Ơ'),
+ ('Ƣ', 'Ƣ'),
+ ('Ƥ', 'Ƥ'),
+ ('Ʀ', 'Ƨ'),
+ ('Ʃ', 'Ʃ'),
+ ('Ƭ', 'Ƭ'),
+ ('Ʈ', 'Ư'),
+ ('Ʊ', 'Ƴ'),
+ ('Ƶ', 'Ƶ'),
+ ('Ʒ', 'Ƹ'),
+ ('Ƽ', 'Ƽ'),
+ ('DŽ', 'DŽ'),
+ ('LJ', 'LJ'),
+ ('NJ', 'NJ'),
+ ('Ǎ', 'Ǎ'),
+ ('Ǐ', 'Ǐ'),
+ ('Ǒ', 'Ǒ'),
+ ('Ǔ', 'Ǔ'),
+ ('Ǖ', 'Ǖ'),
+ ('Ǘ', 'Ǘ'),
+ ('Ǚ', 'Ǚ'),
+ ('Ǜ', 'Ǜ'),
+ ('Ǟ', 'Ǟ'),
+ ('Ǡ', 'Ǡ'),
+ ('Ǣ', 'Ǣ'),
+ ('Ǥ', 'Ǥ'),
+ ('Ǧ', 'Ǧ'),
+ ('Ǩ', 'Ǩ'),
+ ('Ǫ', 'Ǫ'),
+ ('Ǭ', 'Ǭ'),
+ ('Ǯ', 'Ǯ'),
+ ('DZ', 'DZ'),
+ ('Ǵ', 'Ǵ'),
+ ('Ƕ', 'Ǹ'),
+ ('Ǻ', 'Ǻ'),
+ ('Ǽ', 'Ǽ'),
+ ('Ǿ', 'Ǿ'),
+ ('Ȁ', 'Ȁ'),
+ ('Ȃ', 'Ȃ'),
+ ('Ȅ', 'Ȅ'),
+ ('Ȇ', 'Ȇ'),
+ ('Ȉ', 'Ȉ'),
+ ('Ȋ', 'Ȋ'),
+ ('Ȍ', 'Ȍ'),
+ ('Ȏ', 'Ȏ'),
+ ('Ȑ', 'Ȑ'),
+ ('Ȓ', 'Ȓ'),
+ ('Ȕ', 'Ȕ'),
+ ('Ȗ', 'Ȗ'),
+ ('Ș', 'Ș'),
+ ('Ț', 'Ț'),
+ ('Ȝ', 'Ȝ'),
+ ('Ȟ', 'Ȟ'),
+ ('Ƞ', 'Ƞ'),
+ ('Ȣ', 'Ȣ'),
+ ('Ȥ', 'Ȥ'),
+ ('Ȧ', 'Ȧ'),
+ ('Ȩ', 'Ȩ'),
+ ('Ȫ', 'Ȫ'),
+ ('Ȭ', 'Ȭ'),
+ ('Ȯ', 'Ȯ'),
+ ('Ȱ', 'Ȱ'),
+ ('Ȳ', 'Ȳ'),
+ ('Ⱥ', 'Ȼ'),
+ ('Ƚ', 'Ⱦ'),
+ ('Ɂ', 'Ɂ'),
+ ('Ƀ', 'Ɇ'),
+ ('Ɉ', 'Ɉ'),
+ ('Ɋ', 'Ɋ'),
+ ('Ɍ', 'Ɍ'),
+ ('Ɏ', 'Ɏ'),
+ ('Ͱ', 'Ͱ'),
+ ('Ͳ', 'Ͳ'),
+ ('Ͷ', 'Ͷ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ώ'),
+ ('Α', 'Ρ'),
+ ('Σ', 'Ϋ'),
+ ('Ϗ', 'Ϗ'),
+ ('ϒ', 'ϔ'),
+ ('Ϙ', 'Ϙ'),
+ ('Ϛ', 'Ϛ'),
+ ('Ϝ', 'Ϝ'),
+ ('Ϟ', 'Ϟ'),
+ ('Ϡ', 'Ϡ'),
+ ('Ϣ', 'Ϣ'),
+ ('Ϥ', 'Ϥ'),
+ ('Ϧ', 'Ϧ'),
+ ('Ϩ', 'Ϩ'),
+ ('Ϫ', 'Ϫ'),
+ ('Ϭ', 'Ϭ'),
+ ('Ϯ', 'Ϯ'),
+ ('ϴ', 'ϴ'),
+ ('Ϸ', 'Ϸ'),
+ ('Ϲ', 'Ϻ'),
+ ('Ͻ', 'Я'),
+ ('Ѡ', 'Ѡ'),
+ ('Ѣ', 'Ѣ'),
+ ('Ѥ', 'Ѥ'),
+ ('Ѧ', 'Ѧ'),
+ ('Ѩ', 'Ѩ'),
+ ('Ѫ', 'Ѫ'),
+ ('Ѭ', 'Ѭ'),
+ ('Ѯ', 'Ѯ'),
+ ('Ѱ', 'Ѱ'),
+ ('Ѳ', 'Ѳ'),
+ ('Ѵ', 'Ѵ'),
+ ('Ѷ', 'Ѷ'),
+ ('Ѹ', 'Ѹ'),
+ ('Ѻ', 'Ѻ'),
+ ('Ѽ', 'Ѽ'),
+ ('Ѿ', 'Ѿ'),
+ ('Ҁ', 'Ҁ'),
+ ('Ҋ', 'Ҋ'),
+ ('Ҍ', 'Ҍ'),
+ ('Ҏ', 'Ҏ'),
+ ('Ґ', 'Ґ'),
+ ('Ғ', 'Ғ'),
+ ('Ҕ', 'Ҕ'),
+ ('Җ', 'Җ'),
+ ('Ҙ', 'Ҙ'),
+ ('Қ', 'Қ'),
+ ('Ҝ', 'Ҝ'),
+ ('Ҟ', 'Ҟ'),
+ ('Ҡ', 'Ҡ'),
+ ('Ң', 'Ң'),
+ ('Ҥ', 'Ҥ'),
+ ('Ҧ', 'Ҧ'),
+ ('Ҩ', 'Ҩ'),
+ ('Ҫ', 'Ҫ'),
+ ('Ҭ', 'Ҭ'),
+ ('Ү', 'Ү'),
+ ('Ұ', 'Ұ'),
+ ('Ҳ', 'Ҳ'),
+ ('Ҵ', 'Ҵ'),
+ ('Ҷ', 'Ҷ'),
+ ('Ҹ', 'Ҹ'),
+ ('Һ', 'Һ'),
+ ('Ҽ', 'Ҽ'),
+ ('Ҿ', 'Ҿ'),
+ ('Ӏ', 'Ӂ'),
+ ('Ӄ', 'Ӄ'),
+ ('Ӆ', 'Ӆ'),
+ ('Ӈ', 'Ӈ'),
+ ('Ӊ', 'Ӊ'),
+ ('Ӌ', 'Ӌ'),
+ ('Ӎ', 'Ӎ'),
+ ('Ӑ', 'Ӑ'),
+ ('Ӓ', 'Ӓ'),
+ ('Ӕ', 'Ӕ'),
+ ('Ӗ', 'Ӗ'),
+ ('Ә', 'Ә'),
+ ('Ӛ', 'Ӛ'),
+ ('Ӝ', 'Ӝ'),
+ ('Ӟ', 'Ӟ'),
+ ('Ӡ', 'Ӡ'),
+ ('Ӣ', 'Ӣ'),
+ ('Ӥ', 'Ӥ'),
+ ('Ӧ', 'Ӧ'),
+ ('Ө', 'Ө'),
+ ('Ӫ', 'Ӫ'),
+ ('Ӭ', 'Ӭ'),
+ ('Ӯ', 'Ӯ'),
+ ('Ӱ', 'Ӱ'),
+ ('Ӳ', 'Ӳ'),
+ ('Ӵ', 'Ӵ'),
+ ('Ӷ', 'Ӷ'),
+ ('Ӹ', 'Ӹ'),
+ ('Ӻ', 'Ӻ'),
+ ('Ӽ', 'Ӽ'),
+ ('Ӿ', 'Ӿ'),
+ ('Ԁ', 'Ԁ'),
+ ('Ԃ', 'Ԃ'),
+ ('Ԅ', 'Ԅ'),
+ ('Ԇ', 'Ԇ'),
+ ('Ԉ', 'Ԉ'),
+ ('Ԋ', 'Ԋ'),
+ ('Ԍ', 'Ԍ'),
+ ('Ԏ', 'Ԏ'),
+ ('Ԑ', 'Ԑ'),
+ ('Ԓ', 'Ԓ'),
+ ('Ԕ', 'Ԕ'),
+ ('Ԗ', 'Ԗ'),
+ ('Ԙ', 'Ԙ'),
+ ('Ԛ', 'Ԛ'),
+ ('Ԝ', 'Ԝ'),
+ ('Ԟ', 'Ԟ'),
+ ('Ԡ', 'Ԡ'),
+ ('Ԣ', 'Ԣ'),
+ ('Ԥ', 'Ԥ'),
+ ('Ԧ', 'Ԧ'),
+ ('Ԩ', 'Ԩ'),
+ ('Ԫ', 'Ԫ'),
+ ('Ԭ', 'Ԭ'),
+ ('Ԯ', 'Ԯ'),
+ ('Ա', 'Ֆ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('Ḁ', 'Ḁ'),
+ ('Ḃ', 'Ḃ'),
+ ('Ḅ', 'Ḅ'),
+ ('Ḇ', 'Ḇ'),
+ ('Ḉ', 'Ḉ'),
+ ('Ḋ', 'Ḋ'),
+ ('Ḍ', 'Ḍ'),
+ ('Ḏ', 'Ḏ'),
+ ('Ḑ', 'Ḑ'),
+ ('Ḓ', 'Ḓ'),
+ ('Ḕ', 'Ḕ'),
+ ('Ḗ', 'Ḗ'),
+ ('Ḙ', 'Ḙ'),
+ ('Ḛ', 'Ḛ'),
+ ('Ḝ', 'Ḝ'),
+ ('Ḟ', 'Ḟ'),
+ ('Ḡ', 'Ḡ'),
+ ('Ḣ', 'Ḣ'),
+ ('Ḥ', 'Ḥ'),
+ ('Ḧ', 'Ḧ'),
+ ('Ḩ', 'Ḩ'),
+ ('Ḫ', 'Ḫ'),
+ ('Ḭ', 'Ḭ'),
+ ('Ḯ', 'Ḯ'),
+ ('Ḱ', 'Ḱ'),
+ ('Ḳ', 'Ḳ'),
+ ('Ḵ', 'Ḵ'),
+ ('Ḷ', 'Ḷ'),
+ ('Ḹ', 'Ḹ'),
+ ('Ḻ', 'Ḻ'),
+ ('Ḽ', 'Ḽ'),
+ ('Ḿ', 'Ḿ'),
+ ('Ṁ', 'Ṁ'),
+ ('Ṃ', 'Ṃ'),
+ ('Ṅ', 'Ṅ'),
+ ('Ṇ', 'Ṇ'),
+ ('Ṉ', 'Ṉ'),
+ ('Ṋ', 'Ṋ'),
+ ('Ṍ', 'Ṍ'),
+ ('Ṏ', 'Ṏ'),
+ ('Ṑ', 'Ṑ'),
+ ('Ṓ', 'Ṓ'),
+ ('Ṕ', 'Ṕ'),
+ ('Ṗ', 'Ṗ'),
+ ('Ṙ', 'Ṙ'),
+ ('Ṛ', 'Ṛ'),
+ ('Ṝ', 'Ṝ'),
+ ('Ṟ', 'Ṟ'),
+ ('Ṡ', 'Ṡ'),
+ ('Ṣ', 'Ṣ'),
+ ('Ṥ', 'Ṥ'),
+ ('Ṧ', 'Ṧ'),
+ ('Ṩ', 'Ṩ'),
+ ('Ṫ', 'Ṫ'),
+ ('Ṭ', 'Ṭ'),
+ ('Ṯ', 'Ṯ'),
+ ('Ṱ', 'Ṱ'),
+ ('Ṳ', 'Ṳ'),
+ ('Ṵ', 'Ṵ'),
+ ('Ṷ', 'Ṷ'),
+ ('Ṹ', 'Ṹ'),
+ ('Ṻ', 'Ṻ'),
+ ('Ṽ', 'Ṽ'),
+ ('Ṿ', 'Ṿ'),
+ ('Ẁ', 'Ẁ'),
+ ('Ẃ', 'Ẃ'),
+ ('Ẅ', 'Ẅ'),
+ ('Ẇ', 'Ẇ'),
+ ('Ẉ', 'Ẉ'),
+ ('Ẋ', 'Ẋ'),
+ ('Ẍ', 'Ẍ'),
+ ('Ẏ', 'Ẏ'),
+ ('Ẑ', 'Ẑ'),
+ ('Ẓ', 'Ẓ'),
+ ('Ẕ', 'Ẕ'),
+ ('ẞ', 'ẞ'),
+ ('Ạ', 'Ạ'),
+ ('Ả', 'Ả'),
+ ('Ấ', 'Ấ'),
+ ('Ầ', 'Ầ'),
+ ('Ẩ', 'Ẩ'),
+ ('Ẫ', 'Ẫ'),
+ ('Ậ', 'Ậ'),
+ ('Ắ', 'Ắ'),
+ ('Ằ', 'Ằ'),
+ ('Ẳ', 'Ẳ'),
+ ('Ẵ', 'Ẵ'),
+ ('Ặ', 'Ặ'),
+ ('Ẹ', 'Ẹ'),
+ ('Ẻ', 'Ẻ'),
+ ('Ẽ', 'Ẽ'),
+ ('Ế', 'Ế'),
+ ('Ề', 'Ề'),
+ ('Ể', 'Ể'),
+ ('Ễ', 'Ễ'),
+ ('Ệ', 'Ệ'),
+ ('Ỉ', 'Ỉ'),
+ ('Ị', 'Ị'),
+ ('Ọ', 'Ọ'),
+ ('Ỏ', 'Ỏ'),
+ ('Ố', 'Ố'),
+ ('Ồ', 'Ồ'),
+ ('Ổ', 'Ổ'),
+ ('Ỗ', 'Ỗ'),
+ ('Ộ', 'Ộ'),
+ ('Ớ', 'Ớ'),
+ ('Ờ', 'Ờ'),
+ ('Ở', 'Ở'),
+ ('Ỡ', 'Ỡ'),
+ ('Ợ', 'Ợ'),
+ ('Ụ', 'Ụ'),
+ ('Ủ', 'Ủ'),
+ ('Ứ', 'Ứ'),
+ ('Ừ', 'Ừ'),
+ ('Ử', 'Ử'),
+ ('Ữ', 'Ữ'),
+ ('Ự', 'Ự'),
+ ('Ỳ', 'Ỳ'),
+ ('Ỵ', 'Ỵ'),
+ ('Ỷ', 'Ỷ'),
+ ('Ỹ', 'Ỹ'),
+ ('Ỻ', 'Ỻ'),
+ ('Ỽ', 'Ỽ'),
+ ('Ỿ', 'Ỿ'),
+ ('Ἀ', 'Ἇ'),
+ ('Ἐ', 'Ἕ'),
+ ('Ἠ', 'Ἧ'),
+ ('Ἰ', 'Ἷ'),
+ ('Ὀ', 'Ὅ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'Ὗ'),
+ ('Ὠ', 'Ὧ'),
+ ('Ᾰ', 'Ά'),
+ ('Ὲ', 'Ή'),
+ ('Ῐ', 'Ί'),
+ ('Ῠ', 'Ῥ'),
+ ('Ὸ', 'Ώ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℋ', 'ℍ'),
+ ('ℐ', 'ℒ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℰ', 'ℳ'),
+ ('ℾ', 'ℿ'),
+ ('ⅅ', 'ⅅ'),
+ ('Ⅰ', 'Ⅿ'),
+ ('Ↄ', 'Ↄ'),
+ ('Ⓐ', 'Ⓩ'),
+ ('Ⰰ', 'Ⱟ'),
+ ('Ⱡ', 'Ⱡ'),
+ ('Ɫ', 'Ɽ'),
+ ('Ⱨ', 'Ⱨ'),
+ ('Ⱪ', 'Ⱪ'),
+ ('Ⱬ', 'Ⱬ'),
+ ('Ɑ', 'Ɒ'),
+ ('Ⱳ', 'Ⱳ'),
+ ('Ⱶ', 'Ⱶ'),
+ ('Ȿ', 'Ⲁ'),
+ ('Ⲃ', 'Ⲃ'),
+ ('Ⲅ', 'Ⲅ'),
+ ('Ⲇ', 'Ⲇ'),
+ ('Ⲉ', 'Ⲉ'),
+ ('Ⲋ', 'Ⲋ'),
+ ('Ⲍ', 'Ⲍ'),
+ ('Ⲏ', 'Ⲏ'),
+ ('Ⲑ', 'Ⲑ'),
+ ('Ⲓ', 'Ⲓ'),
+ ('Ⲕ', 'Ⲕ'),
+ ('Ⲗ', 'Ⲗ'),
+ ('Ⲙ', 'Ⲙ'),
+ ('Ⲛ', 'Ⲛ'),
+ ('Ⲝ', 'Ⲝ'),
+ ('Ⲟ', 'Ⲟ'),
+ ('Ⲡ', 'Ⲡ'),
+ ('Ⲣ', 'Ⲣ'),
+ ('Ⲥ', 'Ⲥ'),
+ ('Ⲧ', 'Ⲧ'),
+ ('Ⲩ', 'Ⲩ'),
+ ('Ⲫ', 'Ⲫ'),
+ ('Ⲭ', 'Ⲭ'),
+ ('Ⲯ', 'Ⲯ'),
+ ('Ⲱ', 'Ⲱ'),
+ ('Ⲳ', 'Ⲳ'),
+ ('Ⲵ', 'Ⲵ'),
+ ('Ⲷ', 'Ⲷ'),
+ ('Ⲹ', 'Ⲹ'),
+ ('Ⲻ', 'Ⲻ'),
+ ('Ⲽ', 'Ⲽ'),
+ ('Ⲿ', 'Ⲿ'),
+ ('Ⳁ', 'Ⳁ'),
+ ('Ⳃ', 'Ⳃ'),
+ ('Ⳅ', 'Ⳅ'),
+ ('Ⳇ', 'Ⳇ'),
+ ('Ⳉ', 'Ⳉ'),
+ ('Ⳋ', 'Ⳋ'),
+ ('Ⳍ', 'Ⳍ'),
+ ('Ⳏ', 'Ⳏ'),
+ ('Ⳑ', 'Ⳑ'),
+ ('Ⳓ', 'Ⳓ'),
+ ('Ⳕ', 'Ⳕ'),
+ ('Ⳗ', 'Ⳗ'),
+ ('Ⳙ', 'Ⳙ'),
+ ('Ⳛ', 'Ⳛ'),
+ ('Ⳝ', 'Ⳝ'),
+ ('Ⳟ', 'Ⳟ'),
+ ('Ⳡ', 'Ⳡ'),
+ ('Ⳣ', 'Ⳣ'),
+ ('Ⳬ', 'Ⳬ'),
+ ('Ⳮ', 'Ⳮ'),
+ ('Ⳳ', 'Ⳳ'),
+ ('Ꙁ', 'Ꙁ'),
+ ('Ꙃ', 'Ꙃ'),
+ ('Ꙅ', 'Ꙅ'),
+ ('Ꙇ', 'Ꙇ'),
+ ('Ꙉ', 'Ꙉ'),
+ ('Ꙋ', 'Ꙋ'),
+ ('Ꙍ', 'Ꙍ'),
+ ('Ꙏ', 'Ꙏ'),
+ ('Ꙑ', 'Ꙑ'),
+ ('Ꙓ', 'Ꙓ'),
+ ('Ꙕ', 'Ꙕ'),
+ ('Ꙗ', 'Ꙗ'),
+ ('Ꙙ', 'Ꙙ'),
+ ('Ꙛ', 'Ꙛ'),
+ ('Ꙝ', 'Ꙝ'),
+ ('Ꙟ', 'Ꙟ'),
+ ('Ꙡ', 'Ꙡ'),
+ ('Ꙣ', 'Ꙣ'),
+ ('Ꙥ', 'Ꙥ'),
+ ('Ꙧ', 'Ꙧ'),
+ ('Ꙩ', 'Ꙩ'),
+ ('Ꙫ', 'Ꙫ'),
+ ('Ꙭ', 'Ꙭ'),
+ ('Ꚁ', 'Ꚁ'),
+ ('Ꚃ', 'Ꚃ'),
+ ('Ꚅ', 'Ꚅ'),
+ ('Ꚇ', 'Ꚇ'),
+ ('Ꚉ', 'Ꚉ'),
+ ('Ꚋ', 'Ꚋ'),
+ ('Ꚍ', 'Ꚍ'),
+ ('Ꚏ', 'Ꚏ'),
+ ('Ꚑ', 'Ꚑ'),
+ ('Ꚓ', 'Ꚓ'),
+ ('Ꚕ', 'Ꚕ'),
+ ('Ꚗ', 'Ꚗ'),
+ ('Ꚙ', 'Ꚙ'),
+ ('Ꚛ', 'Ꚛ'),
+ ('Ꜣ', 'Ꜣ'),
+ ('Ꜥ', 'Ꜥ'),
+ ('Ꜧ', 'Ꜧ'),
+ ('Ꜩ', 'Ꜩ'),
+ ('Ꜫ', 'Ꜫ'),
+ ('Ꜭ', 'Ꜭ'),
+ ('Ꜯ', 'Ꜯ'),
+ ('Ꜳ', 'Ꜳ'),
+ ('Ꜵ', 'Ꜵ'),
+ ('Ꜷ', 'Ꜷ'),
+ ('Ꜹ', 'Ꜹ'),
+ ('Ꜻ', 'Ꜻ'),
+ ('Ꜽ', 'Ꜽ'),
+ ('Ꜿ', 'Ꜿ'),
+ ('Ꝁ', 'Ꝁ'),
+ ('Ꝃ', 'Ꝃ'),
+ ('Ꝅ', 'Ꝅ'),
+ ('Ꝇ', 'Ꝇ'),
+ ('Ꝉ', 'Ꝉ'),
+ ('Ꝋ', 'Ꝋ'),
+ ('Ꝍ', 'Ꝍ'),
+ ('Ꝏ', 'Ꝏ'),
+ ('Ꝑ', 'Ꝑ'),
+ ('Ꝓ', 'Ꝓ'),
+ ('Ꝕ', 'Ꝕ'),
+ ('Ꝗ', 'Ꝗ'),
+ ('Ꝙ', 'Ꝙ'),
+ ('Ꝛ', 'Ꝛ'),
+ ('Ꝝ', 'Ꝝ'),
+ ('Ꝟ', 'Ꝟ'),
+ ('Ꝡ', 'Ꝡ'),
+ ('Ꝣ', 'Ꝣ'),
+ ('Ꝥ', 'Ꝥ'),
+ ('Ꝧ', 'Ꝧ'),
+ ('Ꝩ', 'Ꝩ'),
+ ('Ꝫ', 'Ꝫ'),
+ ('Ꝭ', 'Ꝭ'),
+ ('Ꝯ', 'Ꝯ'),
+ ('Ꝺ', 'Ꝺ'),
+ ('Ꝼ', 'Ꝼ'),
+ ('Ᵹ', 'Ꝿ'),
+ ('Ꞁ', 'Ꞁ'),
+ ('Ꞃ', 'Ꞃ'),
+ ('Ꞅ', 'Ꞅ'),
+ ('Ꞇ', 'Ꞇ'),
+ ('Ꞌ', 'Ꞌ'),
+ ('Ɥ', 'Ɥ'),
+ ('Ꞑ', 'Ꞑ'),
+ ('Ꞓ', 'Ꞓ'),
+ ('Ꞗ', 'Ꞗ'),
+ ('Ꞙ', 'Ꞙ'),
+ ('Ꞛ', 'Ꞛ'),
+ ('Ꞝ', 'Ꞝ'),
+ ('Ꞟ', 'Ꞟ'),
+ ('Ꞡ', 'Ꞡ'),
+ ('Ꞣ', 'Ꞣ'),
+ ('Ꞥ', 'Ꞥ'),
+ ('Ꞧ', 'Ꞧ'),
+ ('Ꞩ', 'Ꞩ'),
+ ('Ɦ', 'Ɪ'),
+ ('Ʞ', 'Ꞵ'),
+ ('Ꞷ', 'Ꞷ'),
+ ('Ꞹ', 'Ꞹ'),
+ ('Ꞻ', 'Ꞻ'),
+ ('Ꞽ', 'Ꞽ'),
+ ('Ꞿ', 'Ꞿ'),
+ ('Ꟁ', 'Ꟁ'),
+ ('Ꟃ', 'Ꟃ'),
+ ('Ꞔ', 'Ꟈ'),
+ ('Ꟊ', 'Ꟊ'),
+ ('Ꟑ', 'Ꟑ'),
+ ('Ꟗ', 'Ꟗ'),
+ ('Ꟙ', 'Ꟙ'),
+ ('Ꟶ', 'Ꟶ'),
+ ('A', 'Z'),
+ ('𐐀', '𐐧'),
+ ('𐒰', '𐓓'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐲀', '𐲲'),
+ ('𑢠', '𑢿'),
+ ('𖹀', '𖹟'),
+ ('𝐀', '𝐙'),
+ ('𝐴', '𝑍'),
+ ('𝑨', '𝒁'),
+ ('𝒜', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒵'),
+ ('𝓐', '𝓩'),
+ ('𝔄', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔸', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕬', '𝖅'),
+ ('𝖠', '𝖹'),
+ ('𝗔', '𝗭'),
+ ('𝘈', '𝘡'),
+ ('𝘼', '𝙕'),
+ ('𝙰', '𝚉'),
+ ('𝚨', '𝛀'),
+ ('𝛢', '𝛺'),
+ ('𝜜', '𝜴'),
+ ('𝝖', '𝝮'),
+ ('𝞐', '𝞨'),
+ ('𝟊', '𝟊'),
+ ('𞤀', '𞤡'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+];
+
+pub const VARIATION_SELECTOR: &'static [(char, char)] = &[
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const WHITE_SPACE: &'static [(char, char)] = &[
+ ('\t', '\r'),
+ (' ', ' '),
+ ('\u{85}', '\u{85}'),
+ ('\u{a0}', '\u{a0}'),
+ ('\u{1680}', '\u{1680}'),
+ ('\u{2000}', '\u{200a}'),
+ ('\u{2028}', '\u{2029}'),
+ ('\u{202f}', '\u{202f}'),
+ ('\u{205f}', '\u{205f}'),
+ ('\u{3000}', '\u{3000}'),
+];
+
+pub const XID_CONTINUE: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('·', '·'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('\u{300}', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('\u{483}', '\u{487}'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('\u{610}', '\u{61a}'),
+ ('ؠ', '٩'),
+ ('ٮ', 'ۓ'),
+ ('ە', '\u{6dc}'),
+ ('\u{6df}', '\u{6e8}'),
+ ('\u{6ea}', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', '\u{74a}'),
+ ('ݍ', 'ޱ'),
+ ('߀', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('ࠀ', '\u{82d}'),
+ ('ࡀ', '\u{85b}'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('\u{898}', '\u{8e1}'),
+ ('\u{8e3}', '\u{963}'),
+ ('०', '९'),
+ ('ॱ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('૦', '૯'),
+ ('ૹ', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('\u{b3c}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('୦', '୯'),
+ ('ୱ', 'ୱ'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௦', '௯'),
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('\u{c3c}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('౦', '౯'),
+ ('ಀ', 'ಃ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('\u{cbc}', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൎ'),
+ ('ൔ', '\u{d57}'),
+ ('ൟ', '\u{d63}'),
+ ('൦', '൯'),
+ ('ൺ', 'ൿ'),
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('෦', '෯'),
+ ('ෲ', 'ෳ'),
+ ('ก', '\u{e3a}'),
+ ('เ', '\u{e4e}'),
+ ('๐', '๙'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('\u{f18}', '\u{f19}'),
+ ('༠', '༩'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('က', '၉'),
+ ('ၐ', '\u{109d}'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('\u{135d}', '\u{135f}'),
+ ('፩', '፱'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', '᜕'),
+ ('ᜟ', '᜴'),
+ ('ᝀ', '\u{1753}'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('\u{1772}', '\u{1773}'),
+ ('ក', '\u{17d3}'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', '\u{17dd}'),
+ ('០', '៩'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '᠙'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥆', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('᧐', '᧚'),
+ ('ᨀ', '\u{1a1b}'),
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1ab0}', '\u{1abd}'),
+ ('\u{1abf}', '\u{1ace}'),
+ ('\u{1b00}', 'ᭌ'),
+ ('᭐', '᭙'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '᯳'),
+ ('ᰀ', '\u{1c37}'),
+ ('᱀', '᱉'),
+ ('ᱍ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', 'ᳺ'),
+ ('ᴀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('‿', '⁀'),
+ ('⁔', '⁔'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('\u{20d0}', '\u{20dc}'),
+ ('\u{20e1}', '\u{20e1}'),
+ ('\u{20e5}', '\u{20f0}'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('℘', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('\u{2d7f}', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('々', '〇'),
+ ('〡', '\u{302f}'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('\u{3099}', '\u{309a}'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘫ'),
+ ('Ꙁ', '\u{a66f}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('ꙿ', '\u{a6f1}'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢀ', '\u{a8c5}'),
+ ('꣐', '꣙'),
+ ('\u{a8e0}', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', '\u{a92d}'),
+ ('ꤰ', '꥓'),
+ ('ꥠ', 'ꥼ'),
+ ('\u{a980}', '꧀'),
+ ('ꧏ', '꧙'),
+ ('ꧠ', 'ꧾ'),
+ ('ꨀ', '\u{aa36}'),
+ ('ꩀ', 'ꩍ'),
+ ('꩐', '꩙'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫯ'),
+ ('ꫲ', '\u{aaf6}'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('꯰', '꯹'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﱝ'),
+ ('ﱤ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷹ'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('︳', '︴'),
+ ('﹍', '﹏'),
+ ('ﹱ', 'ﹱ'),
+ ('ﹳ', 'ﹳ'),
+ ('ﹷ', 'ﹷ'),
+ ('ﹹ', 'ﹹ'),
+ ('ﹻ', 'ﹻ'),
+ ('ﹽ', 'ﹽ'),
+ ('ﹿ', 'ﻼ'),
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '\u{1037a}'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒠', '𐒩'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '\u{10ae6}'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '\u{10d27}'),
+ ('𐴰', '𐴹'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('𐺰', '𐺱'),
+ ('\u{10efd}', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '\u{10f50}'),
+ ('𐽰', '\u{10f85}'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀀', '\u{11046}'),
+ ('𑁦', '𑁵'),
+ ('\u{1107f}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('𑃐', '𑃨'),
+ ('𑃰', '𑃹'),
+ ('\u{11100}', '\u{11134}'),
+ ('𑄶', '𑄿'),
+ ('𑅄', '𑅇'),
+ ('𑅐', '\u{11173}'),
+ ('𑅶', '𑅶'),
+ ('\u{11180}', '𑇄'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '\u{11237}'),
+ ('\u{1123e}', '\u{11241}'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '\u{112ea}'),
+ ('𑋰', '𑋹'),
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133b}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐀', '𑑊'),
+ ('𑑐', '𑑙'),
+ ('\u{1145e}', '𑑡'),
+ ('𑒀', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑓐', '𑓙'),
+ ('𑖀', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('𑗘', '\u{115dd}'),
+ ('𑘀', '\u{11640}'),
+ ('𑙄', '𑙄'),
+ ('𑙐', '𑙙'),
+ ('𑚀', '𑚸'),
+ ('𑛀', '𑛉'),
+ ('𑜀', '𑜚'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑜰', '𑜹'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '\u{1183a}'),
+ ('𑢠', '𑣩'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{11943}'),
+ ('𑥐', '𑥙'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧡'),
+ ('𑧣', '𑧤'),
+ ('𑨀', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('𑩐', '\u{11a99}'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '\u{11c36}'),
+ ('\u{11c38}', '𑱀'),
+ ('𑱐', '𑱙'),
+ ('𑱲', '𑲏'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+ ('𑻠', '𑻶'),
+ ('\u{11f00}', '𑼐'),
+ ('𑼒', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('𑽐', '𑽙'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('\u{13440}', '\u{13455}'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩠', '𖩩'),
+ ('𖩰', '𖪾'),
+ ('𖫀', '𖫉'),
+ ('𖫐', '𖫭'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('𖬀', '\u{16b36}'),
+ ('𖭀', '𖭃'),
+ ('𖭐', '𖭙'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('\u{16f4f}', '𖾇'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('𞄀', '𞄬'),
+ ('\u{1e130}', '𞄽'),
+ ('𞅀', '𞅉'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '\u{1e2ae}'),
+ ('𞋀', '𞋹'),
+ ('𞓐', '𞓹'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('𞤀', '𞥋'),
+ ('𞥐', '𞥙'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('🯰', '🯹'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const XID_START: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('Ͱ', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͻ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('ؠ', 'ي'),
+ ('ٮ', 'ٯ'),
+ ('ٱ', 'ۓ'),
+ ('ە', 'ە'),
+ ('ۥ', 'ۦ'),
+ ('ۮ', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠀ', 'ࠕ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('ऄ', 'ह'),
+ ('ऽ', 'ऽ'),
+ ('ॐ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('ॱ', 'ঀ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('ੲ', 'ੴ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ઽ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('ૹ', 'ૹ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('ୱ', 'ୱ'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ௐ', 'ௐ'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('ಀ', 'ಀ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಽ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('ೱ', 'ೲ'),
+ ('ഄ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', 'ൖ'),
+ ('ൟ', 'ൡ'),
+ ('ൺ', 'ൿ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ก', 'ะ'),
+ ('า', 'า'),
+ ('เ', 'ๆ'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ະ'),
+ ('າ', 'າ'),
+ ('ຽ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ྈ', 'ྌ'),
+ ('က', 'ဪ'),
+ ('ဿ', 'ဿ'),
+ ('ၐ', 'ၕ'),
+ ('ၚ', 'ၝ'),
+ ('ၡ', 'ၡ'),
+ ('ၥ', 'ၦ'),
+ ('ၮ', 'ၰ'),
+ ('ၵ', 'ႁ'),
+ ('ႎ', 'ႎ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('ᜟ', 'ᜱ'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ក', 'ឳ'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', 'ៜ'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᥐ', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('ᨀ', 'ᨖ'),
+ ('ᨠ', 'ᩔ'),
+ ('ᪧ', 'ᪧ'),
+ ('ᬅ', 'ᬳ'),
+ ('ᭅ', 'ᭌ'),
+ ('ᮃ', 'ᮠ'),
+ ('ᮮ', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᰀ', 'ᰣ'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ᴀ', 'ᶿ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('℘', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('々', '〇'),
+ ('〡', '〩'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('Ꙁ', 'ꙮ'),
+ ('ꙿ', 'ꚝ'),
+ ('ꚠ', 'ꛯ'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠢ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢂ', 'ꢳ'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', 'ꣾ'),
+ ('ꤊ', 'ꤥ'),
+ ('ꤰ', 'ꥆ'),
+ ('ꥠ', 'ꥼ'),
+ ('ꦄ', 'ꦲ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧠ', 'ꧤ'),
+ ('ꧦ', 'ꧯ'),
+ ('ꧺ', 'ꧾ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꩺ'),
+ ('ꩾ', 'ꪯ'),
+ ('ꪱ', 'ꪱ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪽ'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫪ'),
+ ('ꫲ', 'ꫴ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯢ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﱝ'),
+ ('ﱤ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷹ'),
+ ('ﹱ', 'ﹱ'),
+ ('ﹳ', 'ﹳ'),
+ ('ﹷ', 'ﹷ'),
+ ('ﹹ', 'ﹹ'),
+ ('ﹻ', 'ﹻ'),
+ ('ﹽ', 'ﹽ'),
+ ('ﹿ', 'ﻼ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ヲ', 'ン'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '𐴣'),
+ ('𐺀', '𐺩'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀃', '𑀷'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂃', '𑂯'),
+ ('𑃐', '𑃨'),
+ ('𑄃', '𑄦'),
+ ('𑅄', '𑅄'),
+ ('𑅇', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('𑆃', '𑆲'),
+ ('𑇁', '𑇄'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈫'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '𑋞'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍡'),
+ ('𑐀', '𑐴'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '𑖮'),
+ ('𑗘', '𑗛'),
+ ('𑘀', '𑘯'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '𑚪'),
+ ('𑚸', '𑚸'),
+ ('𑜀', '𑜚'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '𑠫'),
+ ('𑢠', '𑣟'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧐'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧣'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨺', '𑨺'),
+ ('𑩐', '𑩐'),
+ ('𑩜', '𑪉'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰮'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶉'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻲'),
+ ('𑼂', '𑼂'),
+ ('𑼄', '𑼐'),
+ ('𑼒', '𑼳'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭀', '𖭃'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖽐'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓫'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞤀', '𞥃'),
+ ('𞥋', '𞥋'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/property_names.rs b/third_party/rust/regex-syntax/src/unicode_tables/property_names.rs
new file mode 100644
index 0000000000..599a123ae5
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/property_names.rs
@@ -0,0 +1,264 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate property-names ucd-15.0.0
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
+ ("age", "Age"),
+ ("ahex", "ASCII_Hex_Digit"),
+ ("alpha", "Alphabetic"),
+ ("alphabetic", "Alphabetic"),
+ ("asciihexdigit", "ASCII_Hex_Digit"),
+ ("bc", "Bidi_Class"),
+ ("bidic", "Bidi_Control"),
+ ("bidiclass", "Bidi_Class"),
+ ("bidicontrol", "Bidi_Control"),
+ ("bidim", "Bidi_Mirrored"),
+ ("bidimirrored", "Bidi_Mirrored"),
+ ("bidimirroringglyph", "Bidi_Mirroring_Glyph"),
+ ("bidipairedbracket", "Bidi_Paired_Bracket"),
+ ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"),
+ ("blk", "Block"),
+ ("block", "Block"),
+ ("bmg", "Bidi_Mirroring_Glyph"),
+ ("bpb", "Bidi_Paired_Bracket"),
+ ("bpt", "Bidi_Paired_Bracket_Type"),
+ ("canonicalcombiningclass", "Canonical_Combining_Class"),
+ ("cased", "Cased"),
+ ("casefolding", "Case_Folding"),
+ ("caseignorable", "Case_Ignorable"),
+ ("ccc", "Canonical_Combining_Class"),
+ ("ce", "Composition_Exclusion"),
+ ("cf", "Case_Folding"),
+ ("changeswhencasefolded", "Changes_When_Casefolded"),
+ ("changeswhencasemapped", "Changes_When_Casemapped"),
+ ("changeswhenlowercased", "Changes_When_Lowercased"),
+ ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"),
+ ("changeswhentitlecased", "Changes_When_Titlecased"),
+ ("changeswhenuppercased", "Changes_When_Uppercased"),
+ ("ci", "Case_Ignorable"),
+ ("cjkaccountingnumeric", "kAccountingNumeric"),
+ ("cjkcompatibilityvariant", "kCompatibilityVariant"),
+ ("cjkiicore", "kIICore"),
+ ("cjkirggsource", "kIRG_GSource"),
+ ("cjkirghsource", "kIRG_HSource"),
+ ("cjkirgjsource", "kIRG_JSource"),
+ ("cjkirgkpsource", "kIRG_KPSource"),
+ ("cjkirgksource", "kIRG_KSource"),
+ ("cjkirgmsource", "kIRG_MSource"),
+ ("cjkirgssource", "kIRG_SSource"),
+ ("cjkirgtsource", "kIRG_TSource"),
+ ("cjkirguksource", "kIRG_UKSource"),
+ ("cjkirgusource", "kIRG_USource"),
+ ("cjkirgvsource", "kIRG_VSource"),
+ ("cjkothernumeric", "kOtherNumeric"),
+ ("cjkprimarynumeric", "kPrimaryNumeric"),
+ ("cjkrsunicode", "kRSUnicode"),
+ ("compex", "Full_Composition_Exclusion"),
+ ("compositionexclusion", "Composition_Exclusion"),
+ ("cwcf", "Changes_When_Casefolded"),
+ ("cwcm", "Changes_When_Casemapped"),
+ ("cwkcf", "Changes_When_NFKC_Casefolded"),
+ ("cwl", "Changes_When_Lowercased"),
+ ("cwt", "Changes_When_Titlecased"),
+ ("cwu", "Changes_When_Uppercased"),
+ ("dash", "Dash"),
+ ("decompositionmapping", "Decomposition_Mapping"),
+ ("decompositiontype", "Decomposition_Type"),
+ ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"),
+ ("dep", "Deprecated"),
+ ("deprecated", "Deprecated"),
+ ("di", "Default_Ignorable_Code_Point"),
+ ("dia", "Diacritic"),
+ ("diacritic", "Diacritic"),
+ ("dm", "Decomposition_Mapping"),
+ ("dt", "Decomposition_Type"),
+ ("ea", "East_Asian_Width"),
+ ("eastasianwidth", "East_Asian_Width"),
+ ("ebase", "Emoji_Modifier_Base"),
+ ("ecomp", "Emoji_Component"),
+ ("emod", "Emoji_Modifier"),
+ ("emoji", "Emoji"),
+ ("emojicomponent", "Emoji_Component"),
+ ("emojimodifier", "Emoji_Modifier"),
+ ("emojimodifierbase", "Emoji_Modifier_Base"),
+ ("emojipresentation", "Emoji_Presentation"),
+ ("epres", "Emoji_Presentation"),
+ ("equideo", "Equivalent_Unified_Ideograph"),
+ ("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
+ ("expandsonnfc", "Expands_On_NFC"),
+ ("expandsonnfd", "Expands_On_NFD"),
+ ("expandsonnfkc", "Expands_On_NFKC"),
+ ("expandsonnfkd", "Expands_On_NFKD"),
+ ("ext", "Extender"),
+ ("extendedpictographic", "Extended_Pictographic"),
+ ("extender", "Extender"),
+ ("extpict", "Extended_Pictographic"),
+ ("fcnfkc", "FC_NFKC_Closure"),
+ ("fcnfkcclosure", "FC_NFKC_Closure"),
+ ("fullcompositionexclusion", "Full_Composition_Exclusion"),
+ ("gc", "General_Category"),
+ ("gcb", "Grapheme_Cluster_Break"),
+ ("generalcategory", "General_Category"),
+ ("graphemebase", "Grapheme_Base"),
+ ("graphemeclusterbreak", "Grapheme_Cluster_Break"),
+ ("graphemeextend", "Grapheme_Extend"),
+ ("graphemelink", "Grapheme_Link"),
+ ("grbase", "Grapheme_Base"),
+ ("grext", "Grapheme_Extend"),
+ ("grlink", "Grapheme_Link"),
+ ("hangulsyllabletype", "Hangul_Syllable_Type"),
+ ("hex", "Hex_Digit"),
+ ("hexdigit", "Hex_Digit"),
+ ("hst", "Hangul_Syllable_Type"),
+ ("hyphen", "Hyphen"),
+ ("idc", "ID_Continue"),
+ ("idcontinue", "ID_Continue"),
+ ("ideo", "Ideographic"),
+ ("ideographic", "Ideographic"),
+ ("ids", "ID_Start"),
+ ("idsb", "IDS_Binary_Operator"),
+ ("idsbinaryoperator", "IDS_Binary_Operator"),
+ ("idst", "IDS_Trinary_Operator"),
+ ("idstart", "ID_Start"),
+ ("idstrinaryoperator", "IDS_Trinary_Operator"),
+ ("indicpositionalcategory", "Indic_Positional_Category"),
+ ("indicsyllabiccategory", "Indic_Syllabic_Category"),
+ ("inpc", "Indic_Positional_Category"),
+ ("insc", "Indic_Syllabic_Category"),
+ ("isc", "ISO_Comment"),
+ ("jamoshortname", "Jamo_Short_Name"),
+ ("jg", "Joining_Group"),
+ ("joinc", "Join_Control"),
+ ("joincontrol", "Join_Control"),
+ ("joininggroup", "Joining_Group"),
+ ("joiningtype", "Joining_Type"),
+ ("jsn", "Jamo_Short_Name"),
+ ("jt", "Joining_Type"),
+ ("kaccountingnumeric", "kAccountingNumeric"),
+ ("kcompatibilityvariant", "kCompatibilityVariant"),
+ ("kiicore", "kIICore"),
+ ("kirggsource", "kIRG_GSource"),
+ ("kirghsource", "kIRG_HSource"),
+ ("kirgjsource", "kIRG_JSource"),
+ ("kirgkpsource", "kIRG_KPSource"),
+ ("kirgksource", "kIRG_KSource"),
+ ("kirgmsource", "kIRG_MSource"),
+ ("kirgssource", "kIRG_SSource"),
+ ("kirgtsource", "kIRG_TSource"),
+ ("kirguksource", "kIRG_UKSource"),
+ ("kirgusource", "kIRG_USource"),
+ ("kirgvsource", "kIRG_VSource"),
+ ("kothernumeric", "kOtherNumeric"),
+ ("kprimarynumeric", "kPrimaryNumeric"),
+ ("krsunicode", "kRSUnicode"),
+ ("lb", "Line_Break"),
+ ("lc", "Lowercase_Mapping"),
+ ("linebreak", "Line_Break"),
+ ("loe", "Logical_Order_Exception"),
+ ("logicalorderexception", "Logical_Order_Exception"),
+ ("lower", "Lowercase"),
+ ("lowercase", "Lowercase"),
+ ("lowercasemapping", "Lowercase_Mapping"),
+ ("math", "Math"),
+ ("na", "Name"),
+ ("na1", "Unicode_1_Name"),
+ ("name", "Name"),
+ ("namealias", "Name_Alias"),
+ ("nchar", "Noncharacter_Code_Point"),
+ ("nfcqc", "NFC_Quick_Check"),
+ ("nfcquickcheck", "NFC_Quick_Check"),
+ ("nfdqc", "NFD_Quick_Check"),
+ ("nfdquickcheck", "NFD_Quick_Check"),
+ ("nfkccasefold", "NFKC_Casefold"),
+ ("nfkccf", "NFKC_Casefold"),
+ ("nfkcqc", "NFKC_Quick_Check"),
+ ("nfkcquickcheck", "NFKC_Quick_Check"),
+ ("nfkdqc", "NFKD_Quick_Check"),
+ ("nfkdquickcheck", "NFKD_Quick_Check"),
+ ("noncharactercodepoint", "Noncharacter_Code_Point"),
+ ("nt", "Numeric_Type"),
+ ("numerictype", "Numeric_Type"),
+ ("numericvalue", "Numeric_Value"),
+ ("nv", "Numeric_Value"),
+ ("oalpha", "Other_Alphabetic"),
+ ("ocomment", "ISO_Comment"),
+ ("odi", "Other_Default_Ignorable_Code_Point"),
+ ("ogrext", "Other_Grapheme_Extend"),
+ ("oidc", "Other_ID_Continue"),
+ ("oids", "Other_ID_Start"),
+ ("olower", "Other_Lowercase"),
+ ("omath", "Other_Math"),
+ ("otheralphabetic", "Other_Alphabetic"),
+ ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"),
+ ("othergraphemeextend", "Other_Grapheme_Extend"),
+ ("otheridcontinue", "Other_ID_Continue"),
+ ("otheridstart", "Other_ID_Start"),
+ ("otherlowercase", "Other_Lowercase"),
+ ("othermath", "Other_Math"),
+ ("otheruppercase", "Other_Uppercase"),
+ ("oupper", "Other_Uppercase"),
+ ("patsyn", "Pattern_Syntax"),
+ ("patternsyntax", "Pattern_Syntax"),
+ ("patternwhitespace", "Pattern_White_Space"),
+ ("patws", "Pattern_White_Space"),
+ ("pcm", "Prepended_Concatenation_Mark"),
+ ("prependedconcatenationmark", "Prepended_Concatenation_Mark"),
+ ("qmark", "Quotation_Mark"),
+ ("quotationmark", "Quotation_Mark"),
+ ("radical", "Radical"),
+ ("regionalindicator", "Regional_Indicator"),
+ ("ri", "Regional_Indicator"),
+ ("sb", "Sentence_Break"),
+ ("sc", "Script"),
+ ("scf", "Simple_Case_Folding"),
+ ("script", "Script"),
+ ("scriptextensions", "Script_Extensions"),
+ ("scx", "Script_Extensions"),
+ ("sd", "Soft_Dotted"),
+ ("sentencebreak", "Sentence_Break"),
+ ("sentenceterminal", "Sentence_Terminal"),
+ ("sfc", "Simple_Case_Folding"),
+ ("simplecasefolding", "Simple_Case_Folding"),
+ ("simplelowercasemapping", "Simple_Lowercase_Mapping"),
+ ("simpletitlecasemapping", "Simple_Titlecase_Mapping"),
+ ("simpleuppercasemapping", "Simple_Uppercase_Mapping"),
+ ("slc", "Simple_Lowercase_Mapping"),
+ ("softdotted", "Soft_Dotted"),
+ ("space", "White_Space"),
+ ("stc", "Simple_Titlecase_Mapping"),
+ ("sterm", "Sentence_Terminal"),
+ ("suc", "Simple_Uppercase_Mapping"),
+ ("tc", "Titlecase_Mapping"),
+ ("term", "Terminal_Punctuation"),
+ ("terminalpunctuation", "Terminal_Punctuation"),
+ ("titlecasemapping", "Titlecase_Mapping"),
+ ("uc", "Uppercase_Mapping"),
+ ("uideo", "Unified_Ideograph"),
+ ("unicode1name", "Unicode_1_Name"),
+ ("unicoderadicalstroke", "kRSUnicode"),
+ ("unifiedideograph", "Unified_Ideograph"),
+ ("upper", "Uppercase"),
+ ("uppercase", "Uppercase"),
+ ("uppercasemapping", "Uppercase_Mapping"),
+ ("urs", "kRSUnicode"),
+ ("variationselector", "Variation_Selector"),
+ ("verticalorientation", "Vertical_Orientation"),
+ ("vo", "Vertical_Orientation"),
+ ("vs", "Variation_Selector"),
+ ("wb", "Word_Break"),
+ ("whitespace", "White_Space"),
+ ("wordbreak", "Word_Break"),
+ ("wspace", "White_Space"),
+ ("xidc", "XID_Continue"),
+ ("xidcontinue", "XID_Continue"),
+ ("xids", "XID_Start"),
+ ("xidstart", "XID_Start"),
+ ("xonfc", "Expands_On_NFC"),
+ ("xonfd", "Expands_On_NFD"),
+ ("xonfkc", "Expands_On_NFKC"),
+ ("xonfkd", "Expands_On_NFKD"),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/property_values.rs b/third_party/rust/regex-syntax/src/unicode_tables/property_values.rs
new file mode 100644
index 0000000000..cb2d32fb70
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/property_values.rs
@@ -0,0 +1,924 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate property-values ucd-15.0.0 --include gc,script,scx,age,gcb,wb,sb
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const PROPERTY_VALUES: &'static [(
+ &'static str,
+ &'static [(&'static str, &'static str)],
+)] = &[
+ (
+ "Age",
+ &[
+ ("1.1", "V1_1"),
+ ("10.0", "V10_0"),
+ ("11.0", "V11_0"),
+ ("12.0", "V12_0"),
+ ("12.1", "V12_1"),
+ ("13.0", "V13_0"),
+ ("14.0", "V14_0"),
+ ("15.0", "V15_0"),
+ ("2.0", "V2_0"),
+ ("2.1", "V2_1"),
+ ("3.0", "V3_0"),
+ ("3.1", "V3_1"),
+ ("3.2", "V3_2"),
+ ("4.0", "V4_0"),
+ ("4.1", "V4_1"),
+ ("5.0", "V5_0"),
+ ("5.1", "V5_1"),
+ ("5.2", "V5_2"),
+ ("6.0", "V6_0"),
+ ("6.1", "V6_1"),
+ ("6.2", "V6_2"),
+ ("6.3", "V6_3"),
+ ("7.0", "V7_0"),
+ ("8.0", "V8_0"),
+ ("9.0", "V9_0"),
+ ("na", "Unassigned"),
+ ("unassigned", "Unassigned"),
+ ("v100", "V10_0"),
+ ("v11", "V1_1"),
+ ("v110", "V11_0"),
+ ("v120", "V12_0"),
+ ("v121", "V12_1"),
+ ("v130", "V13_0"),
+ ("v140", "V14_0"),
+ ("v150", "V15_0"),
+ ("v20", "V2_0"),
+ ("v21", "V2_1"),
+ ("v30", "V3_0"),
+ ("v31", "V3_1"),
+ ("v32", "V3_2"),
+ ("v40", "V4_0"),
+ ("v41", "V4_1"),
+ ("v50", "V5_0"),
+ ("v51", "V5_1"),
+ ("v52", "V5_2"),
+ ("v60", "V6_0"),
+ ("v61", "V6_1"),
+ ("v62", "V6_2"),
+ ("v63", "V6_3"),
+ ("v70", "V7_0"),
+ ("v80", "V8_0"),
+ ("v90", "V9_0"),
+ ],
+ ),
+ (
+ "General_Category",
+ &[
+ ("c", "Other"),
+ ("casedletter", "Cased_Letter"),
+ ("cc", "Control"),
+ ("cf", "Format"),
+ ("closepunctuation", "Close_Punctuation"),
+ ("cn", "Unassigned"),
+ ("cntrl", "Control"),
+ ("co", "Private_Use"),
+ ("combiningmark", "Mark"),
+ ("connectorpunctuation", "Connector_Punctuation"),
+ ("control", "Control"),
+ ("cs", "Surrogate"),
+ ("currencysymbol", "Currency_Symbol"),
+ ("dashpunctuation", "Dash_Punctuation"),
+ ("decimalnumber", "Decimal_Number"),
+ ("digit", "Decimal_Number"),
+ ("enclosingmark", "Enclosing_Mark"),
+ ("finalpunctuation", "Final_Punctuation"),
+ ("format", "Format"),
+ ("initialpunctuation", "Initial_Punctuation"),
+ ("l", "Letter"),
+ ("lc", "Cased_Letter"),
+ ("letter", "Letter"),
+ ("letternumber", "Letter_Number"),
+ ("lineseparator", "Line_Separator"),
+ ("ll", "Lowercase_Letter"),
+ ("lm", "Modifier_Letter"),
+ ("lo", "Other_Letter"),
+ ("lowercaseletter", "Lowercase_Letter"),
+ ("lt", "Titlecase_Letter"),
+ ("lu", "Uppercase_Letter"),
+ ("m", "Mark"),
+ ("mark", "Mark"),
+ ("mathsymbol", "Math_Symbol"),
+ ("mc", "Spacing_Mark"),
+ ("me", "Enclosing_Mark"),
+ ("mn", "Nonspacing_Mark"),
+ ("modifierletter", "Modifier_Letter"),
+ ("modifiersymbol", "Modifier_Symbol"),
+ ("n", "Number"),
+ ("nd", "Decimal_Number"),
+ ("nl", "Letter_Number"),
+ ("no", "Other_Number"),
+ ("nonspacingmark", "Nonspacing_Mark"),
+ ("number", "Number"),
+ ("openpunctuation", "Open_Punctuation"),
+ ("other", "Other"),
+ ("otherletter", "Other_Letter"),
+ ("othernumber", "Other_Number"),
+ ("otherpunctuation", "Other_Punctuation"),
+ ("othersymbol", "Other_Symbol"),
+ ("p", "Punctuation"),
+ ("paragraphseparator", "Paragraph_Separator"),
+ ("pc", "Connector_Punctuation"),
+ ("pd", "Dash_Punctuation"),
+ ("pe", "Close_Punctuation"),
+ ("pf", "Final_Punctuation"),
+ ("pi", "Initial_Punctuation"),
+ ("po", "Other_Punctuation"),
+ ("privateuse", "Private_Use"),
+ ("ps", "Open_Punctuation"),
+ ("punct", "Punctuation"),
+ ("punctuation", "Punctuation"),
+ ("s", "Symbol"),
+ ("sc", "Currency_Symbol"),
+ ("separator", "Separator"),
+ ("sk", "Modifier_Symbol"),
+ ("sm", "Math_Symbol"),
+ ("so", "Other_Symbol"),
+ ("spaceseparator", "Space_Separator"),
+ ("spacingmark", "Spacing_Mark"),
+ ("surrogate", "Surrogate"),
+ ("symbol", "Symbol"),
+ ("titlecaseletter", "Titlecase_Letter"),
+ ("unassigned", "Unassigned"),
+ ("uppercaseletter", "Uppercase_Letter"),
+ ("z", "Separator"),
+ ("zl", "Line_Separator"),
+ ("zp", "Paragraph_Separator"),
+ ("zs", "Space_Separator"),
+ ],
+ ),
+ (
+ "Grapheme_Cluster_Break",
+ &[
+ ("cn", "Control"),
+ ("control", "Control"),
+ ("cr", "CR"),
+ ("eb", "E_Base"),
+ ("ebase", "E_Base"),
+ ("ebasegaz", "E_Base_GAZ"),
+ ("ebg", "E_Base_GAZ"),
+ ("em", "E_Modifier"),
+ ("emodifier", "E_Modifier"),
+ ("ex", "Extend"),
+ ("extend", "Extend"),
+ ("gaz", "Glue_After_Zwj"),
+ ("glueafterzwj", "Glue_After_Zwj"),
+ ("l", "L"),
+ ("lf", "LF"),
+ ("lv", "LV"),
+ ("lvt", "LVT"),
+ ("other", "Other"),
+ ("pp", "Prepend"),
+ ("prepend", "Prepend"),
+ ("regionalindicator", "Regional_Indicator"),
+ ("ri", "Regional_Indicator"),
+ ("sm", "SpacingMark"),
+ ("spacingmark", "SpacingMark"),
+ ("t", "T"),
+ ("v", "V"),
+ ("xx", "Other"),
+ ("zwj", "ZWJ"),
+ ],
+ ),
+ (
+ "Script",
+ &[
+ ("adlam", "Adlam"),
+ ("adlm", "Adlam"),
+ ("aghb", "Caucasian_Albanian"),
+ ("ahom", "Ahom"),
+ ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
+ ("arab", "Arabic"),
+ ("arabic", "Arabic"),
+ ("armenian", "Armenian"),
+ ("armi", "Imperial_Aramaic"),
+ ("armn", "Armenian"),
+ ("avestan", "Avestan"),
+ ("avst", "Avestan"),
+ ("bali", "Balinese"),
+ ("balinese", "Balinese"),
+ ("bamu", "Bamum"),
+ ("bamum", "Bamum"),
+ ("bass", "Bassa_Vah"),
+ ("bassavah", "Bassa_Vah"),
+ ("batak", "Batak"),
+ ("batk", "Batak"),
+ ("beng", "Bengali"),
+ ("bengali", "Bengali"),
+ ("bhaiksuki", "Bhaiksuki"),
+ ("bhks", "Bhaiksuki"),
+ ("bopo", "Bopomofo"),
+ ("bopomofo", "Bopomofo"),
+ ("brah", "Brahmi"),
+ ("brahmi", "Brahmi"),
+ ("brai", "Braille"),
+ ("braille", "Braille"),
+ ("bugi", "Buginese"),
+ ("buginese", "Buginese"),
+ ("buhd", "Buhid"),
+ ("buhid", "Buhid"),
+ ("cakm", "Chakma"),
+ ("canadianaboriginal", "Canadian_Aboriginal"),
+ ("cans", "Canadian_Aboriginal"),
+ ("cari", "Carian"),
+ ("carian", "Carian"),
+ ("caucasianalbanian", "Caucasian_Albanian"),
+ ("chakma", "Chakma"),
+ ("cham", "Cham"),
+ ("cher", "Cherokee"),
+ ("cherokee", "Cherokee"),
+ ("chorasmian", "Chorasmian"),
+ ("chrs", "Chorasmian"),
+ ("common", "Common"),
+ ("copt", "Coptic"),
+ ("coptic", "Coptic"),
+ ("cpmn", "Cypro_Minoan"),
+ ("cprt", "Cypriot"),
+ ("cuneiform", "Cuneiform"),
+ ("cypriot", "Cypriot"),
+ ("cyprominoan", "Cypro_Minoan"),
+ ("cyrillic", "Cyrillic"),
+ ("cyrl", "Cyrillic"),
+ ("deseret", "Deseret"),
+ ("deva", "Devanagari"),
+ ("devanagari", "Devanagari"),
+ ("diak", "Dives_Akuru"),
+ ("divesakuru", "Dives_Akuru"),
+ ("dogr", "Dogra"),
+ ("dogra", "Dogra"),
+ ("dsrt", "Deseret"),
+ ("dupl", "Duployan"),
+ ("duployan", "Duployan"),
+ ("egyp", "Egyptian_Hieroglyphs"),
+ ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
+ ("elba", "Elbasan"),
+ ("elbasan", "Elbasan"),
+ ("elym", "Elymaic"),
+ ("elymaic", "Elymaic"),
+ ("ethi", "Ethiopic"),
+ ("ethiopic", "Ethiopic"),
+ ("geor", "Georgian"),
+ ("georgian", "Georgian"),
+ ("glag", "Glagolitic"),
+ ("glagolitic", "Glagolitic"),
+ ("gong", "Gunjala_Gondi"),
+ ("gonm", "Masaram_Gondi"),
+ ("goth", "Gothic"),
+ ("gothic", "Gothic"),
+ ("gran", "Grantha"),
+ ("grantha", "Grantha"),
+ ("greek", "Greek"),
+ ("grek", "Greek"),
+ ("gujarati", "Gujarati"),
+ ("gujr", "Gujarati"),
+ ("gunjalagondi", "Gunjala_Gondi"),
+ ("gurmukhi", "Gurmukhi"),
+ ("guru", "Gurmukhi"),
+ ("han", "Han"),
+ ("hang", "Hangul"),
+ ("hangul", "Hangul"),
+ ("hani", "Han"),
+ ("hanifirohingya", "Hanifi_Rohingya"),
+ ("hano", "Hanunoo"),
+ ("hanunoo", "Hanunoo"),
+ ("hatr", "Hatran"),
+ ("hatran", "Hatran"),
+ ("hebr", "Hebrew"),
+ ("hebrew", "Hebrew"),
+ ("hira", "Hiragana"),
+ ("hiragana", "Hiragana"),
+ ("hluw", "Anatolian_Hieroglyphs"),
+ ("hmng", "Pahawh_Hmong"),
+ ("hmnp", "Nyiakeng_Puachue_Hmong"),
+ ("hrkt", "Katakana_Or_Hiragana"),
+ ("hung", "Old_Hungarian"),
+ ("imperialaramaic", "Imperial_Aramaic"),
+ ("inherited", "Inherited"),
+ ("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
+ ("inscriptionalparthian", "Inscriptional_Parthian"),
+ ("ital", "Old_Italic"),
+ ("java", "Javanese"),
+ ("javanese", "Javanese"),
+ ("kaithi", "Kaithi"),
+ ("kali", "Kayah_Li"),
+ ("kana", "Katakana"),
+ ("kannada", "Kannada"),
+ ("katakana", "Katakana"),
+ ("katakanaorhiragana", "Katakana_Or_Hiragana"),
+ ("kawi", "Kawi"),
+ ("kayahli", "Kayah_Li"),
+ ("khar", "Kharoshthi"),
+ ("kharoshthi", "Kharoshthi"),
+ ("khitansmallscript", "Khitan_Small_Script"),
+ ("khmer", "Khmer"),
+ ("khmr", "Khmer"),
+ ("khoj", "Khojki"),
+ ("khojki", "Khojki"),
+ ("khudawadi", "Khudawadi"),
+ ("kits", "Khitan_Small_Script"),
+ ("knda", "Kannada"),
+ ("kthi", "Kaithi"),
+ ("lana", "Tai_Tham"),
+ ("lao", "Lao"),
+ ("laoo", "Lao"),
+ ("latin", "Latin"),
+ ("latn", "Latin"),
+ ("lepc", "Lepcha"),
+ ("lepcha", "Lepcha"),
+ ("limb", "Limbu"),
+ ("limbu", "Limbu"),
+ ("lina", "Linear_A"),
+ ("linb", "Linear_B"),
+ ("lineara", "Linear_A"),
+ ("linearb", "Linear_B"),
+ ("lisu", "Lisu"),
+ ("lyci", "Lycian"),
+ ("lycian", "Lycian"),
+ ("lydi", "Lydian"),
+ ("lydian", "Lydian"),
+ ("mahajani", "Mahajani"),
+ ("mahj", "Mahajani"),
+ ("maka", "Makasar"),
+ ("makasar", "Makasar"),
+ ("malayalam", "Malayalam"),
+ ("mand", "Mandaic"),
+ ("mandaic", "Mandaic"),
+ ("mani", "Manichaean"),
+ ("manichaean", "Manichaean"),
+ ("marc", "Marchen"),
+ ("marchen", "Marchen"),
+ ("masaramgondi", "Masaram_Gondi"),
+ ("medefaidrin", "Medefaidrin"),
+ ("medf", "Medefaidrin"),
+ ("meeteimayek", "Meetei_Mayek"),
+ ("mend", "Mende_Kikakui"),
+ ("mendekikakui", "Mende_Kikakui"),
+ ("merc", "Meroitic_Cursive"),
+ ("mero", "Meroitic_Hieroglyphs"),
+ ("meroiticcursive", "Meroitic_Cursive"),
+ ("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
+ ("miao", "Miao"),
+ ("mlym", "Malayalam"),
+ ("modi", "Modi"),
+ ("mong", "Mongolian"),
+ ("mongolian", "Mongolian"),
+ ("mro", "Mro"),
+ ("mroo", "Mro"),
+ ("mtei", "Meetei_Mayek"),
+ ("mult", "Multani"),
+ ("multani", "Multani"),
+ ("myanmar", "Myanmar"),
+ ("mymr", "Myanmar"),
+ ("nabataean", "Nabataean"),
+ ("nagm", "Nag_Mundari"),
+ ("nagmundari", "Nag_Mundari"),
+ ("nand", "Nandinagari"),
+ ("nandinagari", "Nandinagari"),
+ ("narb", "Old_North_Arabian"),
+ ("nbat", "Nabataean"),
+ ("newa", "Newa"),
+ ("newtailue", "New_Tai_Lue"),
+ ("nko", "Nko"),
+ ("nkoo", "Nko"),
+ ("nshu", "Nushu"),
+ ("nushu", "Nushu"),
+ ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
+ ("ogam", "Ogham"),
+ ("ogham", "Ogham"),
+ ("olchiki", "Ol_Chiki"),
+ ("olck", "Ol_Chiki"),
+ ("oldhungarian", "Old_Hungarian"),
+ ("olditalic", "Old_Italic"),
+ ("oldnortharabian", "Old_North_Arabian"),
+ ("oldpermic", "Old_Permic"),
+ ("oldpersian", "Old_Persian"),
+ ("oldsogdian", "Old_Sogdian"),
+ ("oldsoutharabian", "Old_South_Arabian"),
+ ("oldturkic", "Old_Turkic"),
+ ("olduyghur", "Old_Uyghur"),
+ ("oriya", "Oriya"),
+ ("orkh", "Old_Turkic"),
+ ("orya", "Oriya"),
+ ("osage", "Osage"),
+ ("osge", "Osage"),
+ ("osma", "Osmanya"),
+ ("osmanya", "Osmanya"),
+ ("ougr", "Old_Uyghur"),
+ ("pahawhhmong", "Pahawh_Hmong"),
+ ("palm", "Palmyrene"),
+ ("palmyrene", "Palmyrene"),
+ ("pauc", "Pau_Cin_Hau"),
+ ("paucinhau", "Pau_Cin_Hau"),
+ ("perm", "Old_Permic"),
+ ("phag", "Phags_Pa"),
+ ("phagspa", "Phags_Pa"),
+ ("phli", "Inscriptional_Pahlavi"),
+ ("phlp", "Psalter_Pahlavi"),
+ ("phnx", "Phoenician"),
+ ("phoenician", "Phoenician"),
+ ("plrd", "Miao"),
+ ("prti", "Inscriptional_Parthian"),
+ ("psalterpahlavi", "Psalter_Pahlavi"),
+ ("qaac", "Coptic"),
+ ("qaai", "Inherited"),
+ ("rejang", "Rejang"),
+ ("rjng", "Rejang"),
+ ("rohg", "Hanifi_Rohingya"),
+ ("runic", "Runic"),
+ ("runr", "Runic"),
+ ("samaritan", "Samaritan"),
+ ("samr", "Samaritan"),
+ ("sarb", "Old_South_Arabian"),
+ ("saur", "Saurashtra"),
+ ("saurashtra", "Saurashtra"),
+ ("sgnw", "SignWriting"),
+ ("sharada", "Sharada"),
+ ("shavian", "Shavian"),
+ ("shaw", "Shavian"),
+ ("shrd", "Sharada"),
+ ("sidd", "Siddham"),
+ ("siddham", "Siddham"),
+ ("signwriting", "SignWriting"),
+ ("sind", "Khudawadi"),
+ ("sinh", "Sinhala"),
+ ("sinhala", "Sinhala"),
+ ("sogd", "Sogdian"),
+ ("sogdian", "Sogdian"),
+ ("sogo", "Old_Sogdian"),
+ ("sora", "Sora_Sompeng"),
+ ("sorasompeng", "Sora_Sompeng"),
+ ("soyo", "Soyombo"),
+ ("soyombo", "Soyombo"),
+ ("sund", "Sundanese"),
+ ("sundanese", "Sundanese"),
+ ("sylo", "Syloti_Nagri"),
+ ("sylotinagri", "Syloti_Nagri"),
+ ("syrc", "Syriac"),
+ ("syriac", "Syriac"),
+ ("tagalog", "Tagalog"),
+ ("tagb", "Tagbanwa"),
+ ("tagbanwa", "Tagbanwa"),
+ ("taile", "Tai_Le"),
+ ("taitham", "Tai_Tham"),
+ ("taiviet", "Tai_Viet"),
+ ("takr", "Takri"),
+ ("takri", "Takri"),
+ ("tale", "Tai_Le"),
+ ("talu", "New_Tai_Lue"),
+ ("tamil", "Tamil"),
+ ("taml", "Tamil"),
+ ("tang", "Tangut"),
+ ("tangsa", "Tangsa"),
+ ("tangut", "Tangut"),
+ ("tavt", "Tai_Viet"),
+ ("telu", "Telugu"),
+ ("telugu", "Telugu"),
+ ("tfng", "Tifinagh"),
+ ("tglg", "Tagalog"),
+ ("thaa", "Thaana"),
+ ("thaana", "Thaana"),
+ ("thai", "Thai"),
+ ("tibetan", "Tibetan"),
+ ("tibt", "Tibetan"),
+ ("tifinagh", "Tifinagh"),
+ ("tirh", "Tirhuta"),
+ ("tirhuta", "Tirhuta"),
+ ("tnsa", "Tangsa"),
+ ("toto", "Toto"),
+ ("ugar", "Ugaritic"),
+ ("ugaritic", "Ugaritic"),
+ ("unknown", "Unknown"),
+ ("vai", "Vai"),
+ ("vaii", "Vai"),
+ ("vith", "Vithkuqi"),
+ ("vithkuqi", "Vithkuqi"),
+ ("wancho", "Wancho"),
+ ("wara", "Warang_Citi"),
+ ("warangciti", "Warang_Citi"),
+ ("wcho", "Wancho"),
+ ("xpeo", "Old_Persian"),
+ ("xsux", "Cuneiform"),
+ ("yezi", "Yezidi"),
+ ("yezidi", "Yezidi"),
+ ("yi", "Yi"),
+ ("yiii", "Yi"),
+ ("zanabazarsquare", "Zanabazar_Square"),
+ ("zanb", "Zanabazar_Square"),
+ ("zinh", "Inherited"),
+ ("zyyy", "Common"),
+ ("zzzz", "Unknown"),
+ ],
+ ),
+ (
+ "Script_Extensions",
+ &[
+ ("adlam", "Adlam"),
+ ("adlm", "Adlam"),
+ ("aghb", "Caucasian_Albanian"),
+ ("ahom", "Ahom"),
+ ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
+ ("arab", "Arabic"),
+ ("arabic", "Arabic"),
+ ("armenian", "Armenian"),
+ ("armi", "Imperial_Aramaic"),
+ ("armn", "Armenian"),
+ ("avestan", "Avestan"),
+ ("avst", "Avestan"),
+ ("bali", "Balinese"),
+ ("balinese", "Balinese"),
+ ("bamu", "Bamum"),
+ ("bamum", "Bamum"),
+ ("bass", "Bassa_Vah"),
+ ("bassavah", "Bassa_Vah"),
+ ("batak", "Batak"),
+ ("batk", "Batak"),
+ ("beng", "Bengali"),
+ ("bengali", "Bengali"),
+ ("bhaiksuki", "Bhaiksuki"),
+ ("bhks", "Bhaiksuki"),
+ ("bopo", "Bopomofo"),
+ ("bopomofo", "Bopomofo"),
+ ("brah", "Brahmi"),
+ ("brahmi", "Brahmi"),
+ ("brai", "Braille"),
+ ("braille", "Braille"),
+ ("bugi", "Buginese"),
+ ("buginese", "Buginese"),
+ ("buhd", "Buhid"),
+ ("buhid", "Buhid"),
+ ("cakm", "Chakma"),
+ ("canadianaboriginal", "Canadian_Aboriginal"),
+ ("cans", "Canadian_Aboriginal"),
+ ("cari", "Carian"),
+ ("carian", "Carian"),
+ ("caucasianalbanian", "Caucasian_Albanian"),
+ ("chakma", "Chakma"),
+ ("cham", "Cham"),
+ ("cher", "Cherokee"),
+ ("cherokee", "Cherokee"),
+ ("chorasmian", "Chorasmian"),
+ ("chrs", "Chorasmian"),
+ ("common", "Common"),
+ ("copt", "Coptic"),
+ ("coptic", "Coptic"),
+ ("cpmn", "Cypro_Minoan"),
+ ("cprt", "Cypriot"),
+ ("cuneiform", "Cuneiform"),
+ ("cypriot", "Cypriot"),
+ ("cyprominoan", "Cypro_Minoan"),
+ ("cyrillic", "Cyrillic"),
+ ("cyrl", "Cyrillic"),
+ ("deseret", "Deseret"),
+ ("deva", "Devanagari"),
+ ("devanagari", "Devanagari"),
+ ("diak", "Dives_Akuru"),
+ ("divesakuru", "Dives_Akuru"),
+ ("dogr", "Dogra"),
+ ("dogra", "Dogra"),
+ ("dsrt", "Deseret"),
+ ("dupl", "Duployan"),
+ ("duployan", "Duployan"),
+ ("egyp", "Egyptian_Hieroglyphs"),
+ ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
+ ("elba", "Elbasan"),
+ ("elbasan", "Elbasan"),
+ ("elym", "Elymaic"),
+ ("elymaic", "Elymaic"),
+ ("ethi", "Ethiopic"),
+ ("ethiopic", "Ethiopic"),
+ ("geor", "Georgian"),
+ ("georgian", "Georgian"),
+ ("glag", "Glagolitic"),
+ ("glagolitic", "Glagolitic"),
+ ("gong", "Gunjala_Gondi"),
+ ("gonm", "Masaram_Gondi"),
+ ("goth", "Gothic"),
+ ("gothic", "Gothic"),
+ ("gran", "Grantha"),
+ ("grantha", "Grantha"),
+ ("greek", "Greek"),
+ ("grek", "Greek"),
+ ("gujarati", "Gujarati"),
+ ("gujr", "Gujarati"),
+ ("gunjalagondi", "Gunjala_Gondi"),
+ ("gurmukhi", "Gurmukhi"),
+ ("guru", "Gurmukhi"),
+ ("han", "Han"),
+ ("hang", "Hangul"),
+ ("hangul", "Hangul"),
+ ("hani", "Han"),
+ ("hanifirohingya", "Hanifi_Rohingya"),
+ ("hano", "Hanunoo"),
+ ("hanunoo", "Hanunoo"),
+ ("hatr", "Hatran"),
+ ("hatran", "Hatran"),
+ ("hebr", "Hebrew"),
+ ("hebrew", "Hebrew"),
+ ("hira", "Hiragana"),
+ ("hiragana", "Hiragana"),
+ ("hluw", "Anatolian_Hieroglyphs"),
+ ("hmng", "Pahawh_Hmong"),
+ ("hmnp", "Nyiakeng_Puachue_Hmong"),
+ ("hrkt", "Katakana_Or_Hiragana"),
+ ("hung", "Old_Hungarian"),
+ ("imperialaramaic", "Imperial_Aramaic"),
+ ("inherited", "Inherited"),
+ ("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
+ ("inscriptionalparthian", "Inscriptional_Parthian"),
+ ("ital", "Old_Italic"),
+ ("java", "Javanese"),
+ ("javanese", "Javanese"),
+ ("kaithi", "Kaithi"),
+ ("kali", "Kayah_Li"),
+ ("kana", "Katakana"),
+ ("kannada", "Kannada"),
+ ("katakana", "Katakana"),
+ ("katakanaorhiragana", "Katakana_Or_Hiragana"),
+ ("kawi", "Kawi"),
+ ("kayahli", "Kayah_Li"),
+ ("khar", "Kharoshthi"),
+ ("kharoshthi", "Kharoshthi"),
+ ("khitansmallscript", "Khitan_Small_Script"),
+ ("khmer", "Khmer"),
+ ("khmr", "Khmer"),
+ ("khoj", "Khojki"),
+ ("khojki", "Khojki"),
+ ("khudawadi", "Khudawadi"),
+ ("kits", "Khitan_Small_Script"),
+ ("knda", "Kannada"),
+ ("kthi", "Kaithi"),
+ ("lana", "Tai_Tham"),
+ ("lao", "Lao"),
+ ("laoo", "Lao"),
+ ("latin", "Latin"),
+ ("latn", "Latin"),
+ ("lepc", "Lepcha"),
+ ("lepcha", "Lepcha"),
+ ("limb", "Limbu"),
+ ("limbu", "Limbu"),
+ ("lina", "Linear_A"),
+ ("linb", "Linear_B"),
+ ("lineara", "Linear_A"),
+ ("linearb", "Linear_B"),
+ ("lisu", "Lisu"),
+ ("lyci", "Lycian"),
+ ("lycian", "Lycian"),
+ ("lydi", "Lydian"),
+ ("lydian", "Lydian"),
+ ("mahajani", "Mahajani"),
+ ("mahj", "Mahajani"),
+ ("maka", "Makasar"),
+ ("makasar", "Makasar"),
+ ("malayalam", "Malayalam"),
+ ("mand", "Mandaic"),
+ ("mandaic", "Mandaic"),
+ ("mani", "Manichaean"),
+ ("manichaean", "Manichaean"),
+ ("marc", "Marchen"),
+ ("marchen", "Marchen"),
+ ("masaramgondi", "Masaram_Gondi"),
+ ("medefaidrin", "Medefaidrin"),
+ ("medf", "Medefaidrin"),
+ ("meeteimayek", "Meetei_Mayek"),
+ ("mend", "Mende_Kikakui"),
+ ("mendekikakui", "Mende_Kikakui"),
+ ("merc", "Meroitic_Cursive"),
+ ("mero", "Meroitic_Hieroglyphs"),
+ ("meroiticcursive", "Meroitic_Cursive"),
+ ("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
+ ("miao", "Miao"),
+ ("mlym", "Malayalam"),
+ ("modi", "Modi"),
+ ("mong", "Mongolian"),
+ ("mongolian", "Mongolian"),
+ ("mro", "Mro"),
+ ("mroo", "Mro"),
+ ("mtei", "Meetei_Mayek"),
+ ("mult", "Multani"),
+ ("multani", "Multani"),
+ ("myanmar", "Myanmar"),
+ ("mymr", "Myanmar"),
+ ("nabataean", "Nabataean"),
+ ("nagm", "Nag_Mundari"),
+ ("nagmundari", "Nag_Mundari"),
+ ("nand", "Nandinagari"),
+ ("nandinagari", "Nandinagari"),
+ ("narb", "Old_North_Arabian"),
+ ("nbat", "Nabataean"),
+ ("newa", "Newa"),
+ ("newtailue", "New_Tai_Lue"),
+ ("nko", "Nko"),
+ ("nkoo", "Nko"),
+ ("nshu", "Nushu"),
+ ("nushu", "Nushu"),
+ ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
+ ("ogam", "Ogham"),
+ ("ogham", "Ogham"),
+ ("olchiki", "Ol_Chiki"),
+ ("olck", "Ol_Chiki"),
+ ("oldhungarian", "Old_Hungarian"),
+ ("olditalic", "Old_Italic"),
+ ("oldnortharabian", "Old_North_Arabian"),
+ ("oldpermic", "Old_Permic"),
+ ("oldpersian", "Old_Persian"),
+ ("oldsogdian", "Old_Sogdian"),
+ ("oldsoutharabian", "Old_South_Arabian"),
+ ("oldturkic", "Old_Turkic"),
+ ("olduyghur", "Old_Uyghur"),
+ ("oriya", "Oriya"),
+ ("orkh", "Old_Turkic"),
+ ("orya", "Oriya"),
+ ("osage", "Osage"),
+ ("osge", "Osage"),
+ ("osma", "Osmanya"),
+ ("osmanya", "Osmanya"),
+ ("ougr", "Old_Uyghur"),
+ ("pahawhhmong", "Pahawh_Hmong"),
+ ("palm", "Palmyrene"),
+ ("palmyrene", "Palmyrene"),
+ ("pauc", "Pau_Cin_Hau"),
+ ("paucinhau", "Pau_Cin_Hau"),
+ ("perm", "Old_Permic"),
+ ("phag", "Phags_Pa"),
+ ("phagspa", "Phags_Pa"),
+ ("phli", "Inscriptional_Pahlavi"),
+ ("phlp", "Psalter_Pahlavi"),
+ ("phnx", "Phoenician"),
+ ("phoenician", "Phoenician"),
+ ("plrd", "Miao"),
+ ("prti", "Inscriptional_Parthian"),
+ ("psalterpahlavi", "Psalter_Pahlavi"),
+ ("qaac", "Coptic"),
+ ("qaai", "Inherited"),
+ ("rejang", "Rejang"),
+ ("rjng", "Rejang"),
+ ("rohg", "Hanifi_Rohingya"),
+ ("runic", "Runic"),
+ ("runr", "Runic"),
+ ("samaritan", "Samaritan"),
+ ("samr", "Samaritan"),
+ ("sarb", "Old_South_Arabian"),
+ ("saur", "Saurashtra"),
+ ("saurashtra", "Saurashtra"),
+ ("sgnw", "SignWriting"),
+ ("sharada", "Sharada"),
+ ("shavian", "Shavian"),
+ ("shaw", "Shavian"),
+ ("shrd", "Sharada"),
+ ("sidd", "Siddham"),
+ ("siddham", "Siddham"),
+ ("signwriting", "SignWriting"),
+ ("sind", "Khudawadi"),
+ ("sinh", "Sinhala"),
+ ("sinhala", "Sinhala"),
+ ("sogd", "Sogdian"),
+ ("sogdian", "Sogdian"),
+ ("sogo", "Old_Sogdian"),
+ ("sora", "Sora_Sompeng"),
+ ("sorasompeng", "Sora_Sompeng"),
+ ("soyo", "Soyombo"),
+ ("soyombo", "Soyombo"),
+ ("sund", "Sundanese"),
+ ("sundanese", "Sundanese"),
+ ("sylo", "Syloti_Nagri"),
+ ("sylotinagri", "Syloti_Nagri"),
+ ("syrc", "Syriac"),
+ ("syriac", "Syriac"),
+ ("tagalog", "Tagalog"),
+ ("tagb", "Tagbanwa"),
+ ("tagbanwa", "Tagbanwa"),
+ ("taile", "Tai_Le"),
+ ("taitham", "Tai_Tham"),
+ ("taiviet", "Tai_Viet"),
+ ("takr", "Takri"),
+ ("takri", "Takri"),
+ ("tale", "Tai_Le"),
+ ("talu", "New_Tai_Lue"),
+ ("tamil", "Tamil"),
+ ("taml", "Tamil"),
+ ("tang", "Tangut"),
+ ("tangsa", "Tangsa"),
+ ("tangut", "Tangut"),
+ ("tavt", "Tai_Viet"),
+ ("telu", "Telugu"),
+ ("telugu", "Telugu"),
+ ("tfng", "Tifinagh"),
+ ("tglg", "Tagalog"),
+ ("thaa", "Thaana"),
+ ("thaana", "Thaana"),
+ ("thai", "Thai"),
+ ("tibetan", "Tibetan"),
+ ("tibt", "Tibetan"),
+ ("tifinagh", "Tifinagh"),
+ ("tirh", "Tirhuta"),
+ ("tirhuta", "Tirhuta"),
+ ("tnsa", "Tangsa"),
+ ("toto", "Toto"),
+ ("ugar", "Ugaritic"),
+ ("ugaritic", "Ugaritic"),
+ ("unknown", "Unknown"),
+ ("vai", "Vai"),
+ ("vaii", "Vai"),
+ ("vith", "Vithkuqi"),
+ ("vithkuqi", "Vithkuqi"),
+ ("wancho", "Wancho"),
+ ("wara", "Warang_Citi"),
+ ("warangciti", "Warang_Citi"),
+ ("wcho", "Wancho"),
+ ("xpeo", "Old_Persian"),
+ ("xsux", "Cuneiform"),
+ ("yezi", "Yezidi"),
+ ("yezidi", "Yezidi"),
+ ("yi", "Yi"),
+ ("yiii", "Yi"),
+ ("zanabazarsquare", "Zanabazar_Square"),
+ ("zanb", "Zanabazar_Square"),
+ ("zinh", "Inherited"),
+ ("zyyy", "Common"),
+ ("zzzz", "Unknown"),
+ ],
+ ),
+ (
+ "Sentence_Break",
+ &[
+ ("at", "ATerm"),
+ ("aterm", "ATerm"),
+ ("cl", "Close"),
+ ("close", "Close"),
+ ("cr", "CR"),
+ ("ex", "Extend"),
+ ("extend", "Extend"),
+ ("fo", "Format"),
+ ("format", "Format"),
+ ("le", "OLetter"),
+ ("lf", "LF"),
+ ("lo", "Lower"),
+ ("lower", "Lower"),
+ ("nu", "Numeric"),
+ ("numeric", "Numeric"),
+ ("oletter", "OLetter"),
+ ("other", "Other"),
+ ("sc", "SContinue"),
+ ("scontinue", "SContinue"),
+ ("se", "Sep"),
+ ("sep", "Sep"),
+ ("sp", "Sp"),
+ ("st", "STerm"),
+ ("sterm", "STerm"),
+ ("up", "Upper"),
+ ("upper", "Upper"),
+ ("xx", "Other"),
+ ],
+ ),
+ (
+ "Word_Break",
+ &[
+ ("aletter", "ALetter"),
+ ("cr", "CR"),
+ ("doublequote", "Double_Quote"),
+ ("dq", "Double_Quote"),
+ ("eb", "E_Base"),
+ ("ebase", "E_Base"),
+ ("ebasegaz", "E_Base_GAZ"),
+ ("ebg", "E_Base_GAZ"),
+ ("em", "E_Modifier"),
+ ("emodifier", "E_Modifier"),
+ ("ex", "ExtendNumLet"),
+ ("extend", "Extend"),
+ ("extendnumlet", "ExtendNumLet"),
+ ("fo", "Format"),
+ ("format", "Format"),
+ ("gaz", "Glue_After_Zwj"),
+ ("glueafterzwj", "Glue_After_Zwj"),
+ ("hebrewletter", "Hebrew_Letter"),
+ ("hl", "Hebrew_Letter"),
+ ("ka", "Katakana"),
+ ("katakana", "Katakana"),
+ ("le", "ALetter"),
+ ("lf", "LF"),
+ ("mb", "MidNumLet"),
+ ("midletter", "MidLetter"),
+ ("midnum", "MidNum"),
+ ("midnumlet", "MidNumLet"),
+ ("ml", "MidLetter"),
+ ("mn", "MidNum"),
+ ("newline", "Newline"),
+ ("nl", "Newline"),
+ ("nu", "Numeric"),
+ ("numeric", "Numeric"),
+ ("other", "Other"),
+ ("regionalindicator", "Regional_Indicator"),
+ ("ri", "Regional_Indicator"),
+ ("singlequote", "Single_Quote"),
+ ("sq", "Single_Quote"),
+ ("wsegspace", "WSegSpace"),
+ ("xx", "Other"),
+ ("zwj", "ZWJ"),
+ ],
+ ),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/script.rs b/third_party/rust/regex-syntax/src/unicode_tables/script.rs
new file mode 100644
index 0000000000..cc5c400ddb
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/script.rs
@@ -0,0 +1,1263 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate script ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("Adlam", ADLAM),
+ ("Ahom", AHOM),
+ ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
+ ("Arabic", ARABIC),
+ ("Armenian", ARMENIAN),
+ ("Avestan", AVESTAN),
+ ("Balinese", BALINESE),
+ ("Bamum", BAMUM),
+ ("Bassa_Vah", BASSA_VAH),
+ ("Batak", BATAK),
+ ("Bengali", BENGALI),
+ ("Bhaiksuki", BHAIKSUKI),
+ ("Bopomofo", BOPOMOFO),
+ ("Brahmi", BRAHMI),
+ ("Braille", BRAILLE),
+ ("Buginese", BUGINESE),
+ ("Buhid", BUHID),
+ ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
+ ("Carian", CARIAN),
+ ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
+ ("Chakma", CHAKMA),
+ ("Cham", CHAM),
+ ("Cherokee", CHEROKEE),
+ ("Chorasmian", CHORASMIAN),
+ ("Common", COMMON),
+ ("Coptic", COPTIC),
+ ("Cuneiform", CUNEIFORM),
+ ("Cypriot", CYPRIOT),
+ ("Cypro_Minoan", CYPRO_MINOAN),
+ ("Cyrillic", CYRILLIC),
+ ("Deseret", DESERET),
+ ("Devanagari", DEVANAGARI),
+ ("Dives_Akuru", DIVES_AKURU),
+ ("Dogra", DOGRA),
+ ("Duployan", DUPLOYAN),
+ ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
+ ("Elbasan", ELBASAN),
+ ("Elymaic", ELYMAIC),
+ ("Ethiopic", ETHIOPIC),
+ ("Georgian", GEORGIAN),
+ ("Glagolitic", GLAGOLITIC),
+ ("Gothic", GOTHIC),
+ ("Grantha", GRANTHA),
+ ("Greek", GREEK),
+ ("Gujarati", GUJARATI),
+ ("Gunjala_Gondi", GUNJALA_GONDI),
+ ("Gurmukhi", GURMUKHI),
+ ("Han", HAN),
+ ("Hangul", HANGUL),
+ ("Hanifi_Rohingya", HANIFI_ROHINGYA),
+ ("Hanunoo", HANUNOO),
+ ("Hatran", HATRAN),
+ ("Hebrew", HEBREW),
+ ("Hiragana", HIRAGANA),
+ ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
+ ("Inherited", INHERITED),
+ ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
+ ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
+ ("Javanese", JAVANESE),
+ ("Kaithi", KAITHI),
+ ("Kannada", KANNADA),
+ ("Katakana", KATAKANA),
+ ("Kawi", KAWI),
+ ("Kayah_Li", KAYAH_LI),
+ ("Kharoshthi", KHAROSHTHI),
+ ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
+ ("Khmer", KHMER),
+ ("Khojki", KHOJKI),
+ ("Khudawadi", KHUDAWADI),
+ ("Lao", LAO),
+ ("Latin", LATIN),
+ ("Lepcha", LEPCHA),
+ ("Limbu", LIMBU),
+ ("Linear_A", LINEAR_A),
+ ("Linear_B", LINEAR_B),
+ ("Lisu", LISU),
+ ("Lycian", LYCIAN),
+ ("Lydian", LYDIAN),
+ ("Mahajani", MAHAJANI),
+ ("Makasar", MAKASAR),
+ ("Malayalam", MALAYALAM),
+ ("Mandaic", MANDAIC),
+ ("Manichaean", MANICHAEAN),
+ ("Marchen", MARCHEN),
+ ("Masaram_Gondi", MASARAM_GONDI),
+ ("Medefaidrin", MEDEFAIDRIN),
+ ("Meetei_Mayek", MEETEI_MAYEK),
+ ("Mende_Kikakui", MENDE_KIKAKUI),
+ ("Meroitic_Cursive", MEROITIC_CURSIVE),
+ ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
+ ("Miao", MIAO),
+ ("Modi", MODI),
+ ("Mongolian", MONGOLIAN),
+ ("Mro", MRO),
+ ("Multani", MULTANI),
+ ("Myanmar", MYANMAR),
+ ("Nabataean", NABATAEAN),
+ ("Nag_Mundari", NAG_MUNDARI),
+ ("Nandinagari", NANDINAGARI),
+ ("New_Tai_Lue", NEW_TAI_LUE),
+ ("Newa", NEWA),
+ ("Nko", NKO),
+ ("Nushu", NUSHU),
+ ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
+ ("Ogham", OGHAM),
+ ("Ol_Chiki", OL_CHIKI),
+ ("Old_Hungarian", OLD_HUNGARIAN),
+ ("Old_Italic", OLD_ITALIC),
+ ("Old_North_Arabian", OLD_NORTH_ARABIAN),
+ ("Old_Permic", OLD_PERMIC),
+ ("Old_Persian", OLD_PERSIAN),
+ ("Old_Sogdian", OLD_SOGDIAN),
+ ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
+ ("Old_Turkic", OLD_TURKIC),
+ ("Old_Uyghur", OLD_UYGHUR),
+ ("Oriya", ORIYA),
+ ("Osage", OSAGE),
+ ("Osmanya", OSMANYA),
+ ("Pahawh_Hmong", PAHAWH_HMONG),
+ ("Palmyrene", PALMYRENE),
+ ("Pau_Cin_Hau", PAU_CIN_HAU),
+ ("Phags_Pa", PHAGS_PA),
+ ("Phoenician", PHOENICIAN),
+ ("Psalter_Pahlavi", PSALTER_PAHLAVI),
+ ("Rejang", REJANG),
+ ("Runic", RUNIC),
+ ("Samaritan", SAMARITAN),
+ ("Saurashtra", SAURASHTRA),
+ ("Sharada", SHARADA),
+ ("Shavian", SHAVIAN),
+ ("Siddham", SIDDHAM),
+ ("SignWriting", SIGNWRITING),
+ ("Sinhala", SINHALA),
+ ("Sogdian", SOGDIAN),
+ ("Sora_Sompeng", SORA_SOMPENG),
+ ("Soyombo", SOYOMBO),
+ ("Sundanese", SUNDANESE),
+ ("Syloti_Nagri", SYLOTI_NAGRI),
+ ("Syriac", SYRIAC),
+ ("Tagalog", TAGALOG),
+ ("Tagbanwa", TAGBANWA),
+ ("Tai_Le", TAI_LE),
+ ("Tai_Tham", TAI_THAM),
+ ("Tai_Viet", TAI_VIET),
+ ("Takri", TAKRI),
+ ("Tamil", TAMIL),
+ ("Tangsa", TANGSA),
+ ("Tangut", TANGUT),
+ ("Telugu", TELUGU),
+ ("Thaana", THAANA),
+ ("Thai", THAI),
+ ("Tibetan", TIBETAN),
+ ("Tifinagh", TIFINAGH),
+ ("Tirhuta", TIRHUTA),
+ ("Toto", TOTO),
+ ("Ugaritic", UGARITIC),
+ ("Vai", VAI),
+ ("Vithkuqi", VITHKUQI),
+ ("Wancho", WANCHO),
+ ("Warang_Citi", WARANG_CITI),
+ ("Yezidi", YEZIDI),
+ ("Yi", YI),
+ ("Zanabazar_Square", ZANABAZAR_SQUARE),
+];
+
+pub const ADLAM: &'static [(char, char)] =
+ &[('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')];
+
+pub const AHOM: &'static [(char, char)] =
+ &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')];
+
+pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')];
+
+pub const ARABIC: &'static [(char, char)] = &[
+ ('\u{600}', '\u{604}'),
+ ('؆', '؋'),
+ ('؍', '\u{61a}'),
+ ('\u{61c}', '؞'),
+ ('ؠ', 'ؿ'),
+ ('ف', 'ي'),
+ ('\u{656}', 'ٯ'),
+ ('ٱ', '\u{6dc}'),
+ ('۞', 'ۿ'),
+ ('ݐ', 'ݿ'),
+ ('ࡰ', 'ࢎ'),
+ ('\u{890}', '\u{891}'),
+ ('\u{898}', '\u{8e1}'),
+ ('\u{8e3}', '\u{8ff}'),
+ ('ﭐ', '﯂'),
+ ('ﯓ', 'ﴽ'),
+ ('﵀', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('﷏', '﷏'),
+ ('ﷰ', '﷿'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('𐹠', '𐹾'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𞻰', '𞻱'),
+];
+
+pub const ARMENIAN: &'static [(char, char)] =
+ &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')];
+
+pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')];
+
+pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')];
+
+pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')];
+
+pub const BASSA_VAH: &'static [(char, char)] =
+ &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')];
+
+pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')];
+
+pub const BENGALI: &'static [(char, char)] = &[
+ ('ঀ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', '\u{9fe}'),
+];
+
+pub const BHAIKSUKI: &'static [(char, char)] =
+ &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')];
+
+pub const BOPOMOFO: &'static [(char, char)] =
+ &[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆿ')];
+
+pub const BRAHMI: &'static [(char, char)] =
+ &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')];
+
+pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')];
+
+pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')];
+
+pub const BUHID: &'static [(char, char)] = &[('ᝀ', '\u{1753}')];
+
+pub const CANADIAN_ABORIGINAL: &'static [(char, char)] =
+ &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')];
+
+pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')];
+
+pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] =
+ &[('𐔰', '𐕣'), ('𐕯', '𐕯')];
+
+pub const CHAKMA: &'static [(char, char)] =
+ &[('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')];
+
+pub const CHAM: &'static [(char, char)] =
+ &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')];
+
+pub const CHEROKEE: &'static [(char, char)] =
+ &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')];
+
+pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')];
+
+pub const COMMON: &'static [(char, char)] = &[
+ ('\0', '@'),
+ ('[', '`'),
+ ('{', '©'),
+ ('«', '¹'),
+ ('»', '¿'),
+ ('×', '×'),
+ ('÷', '÷'),
+ ('ʹ', '˟'),
+ ('˥', '˩'),
+ ('ˬ', '˿'),
+ ('ʹ', 'ʹ'),
+ (';', ';'),
+ ('΅', '΅'),
+ ('·', '·'),
+ ('\u{605}', '\u{605}'),
+ ('،', '،'),
+ ('؛', '؛'),
+ ('؟', '؟'),
+ ('ـ', 'ـ'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('।', '॥'),
+ ('฿', '฿'),
+ ('࿕', '࿘'),
+ ('჻', '჻'),
+ ('᛫', '᛭'),
+ ('᜵', '᜶'),
+ ('᠂', '᠃'),
+ ('᠅', '᠅'),
+ ('᳓', '᳓'),
+ ('᳡', '᳡'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', '᳷'),
+ ('ᳺ', 'ᳺ'),
+ ('\u{2000}', '\u{200b}'),
+ ('\u{200e}', '\u{2064}'),
+ ('\u{2066}', '⁰'),
+ ('⁴', '⁾'),
+ ('₀', '₎'),
+ ('₠', '⃀'),
+ ('℀', '℥'),
+ ('℧', '℩'),
+ ('ℬ', 'ℱ'),
+ ('ℳ', '⅍'),
+ ('⅏', '⅟'),
+ ('↉', '↋'),
+ ('←', '␦'),
+ ('⑀', '⑊'),
+ ('①', '⟿'),
+ ('⤀', '⭳'),
+ ('⭶', '⮕'),
+ ('⮗', '⯿'),
+ ('⸀', '⹝'),
+ ('⿰', '⿻'),
+ ('\u{3000}', '〄'),
+ ('〆', '〆'),
+ ('〈', '〠'),
+ ('〰', '〷'),
+ ('〼', '〿'),
+ ('゛', '゜'),
+ ('゠', '゠'),
+ ('・', 'ー'),
+ ('㆐', '㆟'),
+ ('㇀', '㇣'),
+ ('㈠', '㉟'),
+ ('㉿', '㋏'),
+ ('㋿', '㋿'),
+ ('㍘', '㏿'),
+ ('䷀', '䷿'),
+ ('꜀', '꜡'),
+ ('ꞈ', '꞊'),
+ ('꠰', '꠹'),
+ ('꤮', '꤮'),
+ ('ꧏ', 'ꧏ'),
+ ('꭛', '꭛'),
+ ('꭪', '꭫'),
+ ('﴾', '﴿'),
+ ('︐', '︙'),
+ ('︰', '﹒'),
+ ('﹔', '﹦'),
+ ('﹨', '﹫'),
+ ('\u{feff}', '\u{feff}'),
+ ('!', '@'),
+ ('[', '`'),
+ ('{', '・'),
+ ('ー', 'ー'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('¢', '₩'),
+ ('│', '○'),
+ ('\u{fff9}', '�'),
+ ('𐄀', '𐄂'),
+ ('𐄇', '𐄳'),
+ ('𐄷', '𐄿'),
+ ('𐆐', '𐆜'),
+ ('𐇐', '𐇼'),
+ ('𐋡', '𐋻'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('𜽐', '𜿃'),
+ ('𝀀', '𝃵'),
+ ('𝄀', '𝄦'),
+ ('𝄩', '𝅦'),
+ ('𝅪', '\u{1d17a}'),
+ ('𝆃', '𝆄'),
+ ('𝆌', '𝆩'),
+ ('𝆮', '𝇪'),
+ ('𝋀', '𝋓'),
+ ('𝋠', '𝋳'),
+ ('𝌀', '𝍖'),
+ ('𝍠', '𝍸'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('𞱱', '𞲴'),
+ ('𞴁', '𞴽'),
+ ('🀀', '🀫'),
+ ('🀰', '🂓'),
+ ('🂠', '🂮'),
+ ('🂱', '🂿'),
+ ('🃁', '🃏'),
+ ('🃑', '🃵'),
+ ('🄀', '🆭'),
+ ('🇦', '🇿'),
+ ('🈁', '🈂'),
+ ('🈐', '🈻'),
+ ('🉀', '🉈'),
+ ('🉐', '🉑'),
+ ('🉠', '🉥'),
+ ('🌀', '🛗'),
+ ('🛜', '🛬'),
+ ('🛰', '🛼'),
+ ('🜀', '🝶'),
+ ('🝻', '🟙'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🠀', '🠋'),
+ ('🠐', '🡇'),
+ ('🡐', '🡙'),
+ ('🡠', '🢇'),
+ ('🢐', '🢭'),
+ ('🢰', '🢱'),
+ ('🤀', '🩓'),
+ ('🩠', '🩭'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+ ('🬀', '🮒'),
+ ('🮔', '🯊'),
+ ('🯰', '🯹'),
+ ('\u{e0001}', '\u{e0001}'),
+ ('\u{e0020}', '\u{e007f}'),
+];
+
+pub const COPTIC: &'static [(char, char)] =
+ &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿')];
+
+pub const CUNEIFORM: &'static [(char, char)] =
+ &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')];
+
+pub const CYPRIOT: &'static [(char, char)] =
+ &[('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿')];
+
+pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𒾐', '𒿲')];
+
+pub const CYRILLIC: &'static [(char, char)] = &[
+ ('Ѐ', '\u{484}'),
+ ('\u{487}', 'ԯ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᴫ', 'ᴫ'),
+ ('ᵸ', 'ᵸ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('Ꙁ', '\u{a69f}'),
+ ('\u{fe2e}', '\u{fe2f}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+];
+
+pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')];
+
+pub const DEVANAGARI: &'static [(char, char)] = &[
+ ('\u{900}', 'ॐ'),
+ ('\u{955}', '\u{963}'),
+ ('०', 'ॿ'),
+ ('\u{a8e0}', '\u{a8ff}'),
+ ('𑬀', '𑬉'),
+];
+
+pub const DIVES_AKURU: &'static [(char, char)] = &[
+ ('𑤀', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '𑥆'),
+ ('𑥐', '𑥙'),
+];
+
+pub const DOGRA: &'static [(char, char)] = &[('𑠀', '𑠻')];
+
+pub const DUPLOYAN: &'static [(char, char)] =
+ &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲟')];
+
+pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] =
+ &[('𓀀', '\u{13455}')];
+
+pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')];
+
+pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')];
+
+pub const ETHIOPIC: &'static [(char, char)] = &[
+ ('ሀ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('\u{135d}', '፼'),
+ ('ᎀ', '᎙'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+];
+
+pub const GEORGIAN: &'static [(char, char)] = &[
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ჿ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+];
+
+pub const GLAGOLITIC: &'static [(char, char)] = &[
+ ('Ⰰ', 'ⱟ'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+];
+
+pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')];
+
+pub const GRANTHA: &'static [(char, char)] = &[
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133c}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+];
+
+pub const GREEK: &'static [(char, char)] = &[
+ ('Ͱ', 'ͳ'),
+ ('͵', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('΄', '΄'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϡ'),
+ ('ϰ', 'Ͽ'),
+ ('ᴦ', 'ᴪ'),
+ ('ᵝ', 'ᵡ'),
+ ('ᵦ', 'ᵪ'),
+ ('ᶿ', 'ᶿ'),
+ ('ἀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ῄ'),
+ ('ῆ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('῝', '`'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', '῾'),
+ ('Ω', 'Ω'),
+ ('ꭥ', 'ꭥ'),
+ ('𐅀', '𐆎'),
+ ('𐆠', '𐆠'),
+ ('𝈀', '𝉅'),
+];
+
+pub const GUJARATI: &'static [(char, char)] = &[
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('૦', '૱'),
+ ('ૹ', '\u{aff}'),
+];
+
+pub const GUNJALA_GONDI: &'static [(char, char)] = &[
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+];
+
+pub const GURMUKHI: &'static [(char, char)] = &[
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '੶'),
+];
+
+pub const HAN: &'static [(char, char)] = &[
+ ('⺀', '⺙'),
+ ('⺛', '⻳'),
+ ('⼀', '⿕'),
+ ('々', '々'),
+ ('〇', '〇'),
+ ('〡', '〩'),
+ ('〸', '〻'),
+ ('㐀', '䶿'),
+ ('一', '鿿'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('𖿢', '𖿣'),
+ ('𖿰', '𖿱'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const HANGUL: &'static [(char, char)] = &[
+ ('ᄀ', 'ᇿ'),
+ ('\u{302e}', '\u{302f}'),
+ ('ㄱ', 'ㆎ'),
+ ('㈀', '㈞'),
+ ('㉠', '㉾'),
+ ('ꥠ', 'ꥼ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+];
+
+pub const HANIFI_ROHINGYA: &'static [(char, char)] =
+ &[('𐴀', '\u{10d27}'), ('𐴰', '𐴹')];
+
+pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜴')];
+
+pub const HATRAN: &'static [(char, char)] =
+ &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')];
+
+pub const HEBREW: &'static [(char, char)] = &[
+ ('\u{591}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', '״'),
+ ('יִ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﭏ'),
+];
+
+pub const HIRAGANA: &'static [(char, char)] = &[
+ ('ぁ', 'ゖ'),
+ ('ゝ', 'ゟ'),
+ ('𛀁', '𛄟'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('🈀', '🈀'),
+];
+
+pub const IMPERIAL_ARAMAIC: &'static [(char, char)] =
+ &[('𐡀', '𐡕'), ('𐡗', '𐡟')];
+
+pub const INHERITED: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{485}', '\u{486}'),
+ ('\u{64b}', '\u{655}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{951}', '\u{954}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce0}'),
+ ('\u{1ce2}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{200c}', '\u{200d}'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('\u{302a}', '\u{302d}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2d}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{1133b}', '\u{1133b}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] =
+ &[('𐭠', '𐭲'), ('𐭸', '𐭿')];
+
+pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] =
+ &[('𐭀', '𐭕'), ('𐭘', '𐭟')];
+
+pub const JAVANESE: &'static [(char, char)] =
+ &[('\u{a980}', '꧍'), ('꧐', '꧙'), ('꧞', '꧟')];
+
+pub const KAITHI: &'static [(char, char)] =
+ &[('\u{11080}', '\u{110c2}'), ('\u{110cd}', '\u{110cd}')];
+
+pub const KANNADA: &'static [(char, char)] = &[
+ ('ಀ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('\u{cbc}', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+];
+
+pub const KATAKANA: &'static [(char, char)] = &[
+ ('ァ', 'ヺ'),
+ ('ヽ', 'ヿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㋐', '㋾'),
+ ('㌀', '㍗'),
+ ('ヲ', 'ッ'),
+ ('ア', 'ン'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛀀'),
+ ('𛄠', '𛄢'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+];
+
+pub const KAWI: &'static [(char, char)] =
+ &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')];
+
+pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '\u{a92d}'), ('꤯', '꤯')];
+
+pub const KHAROSHTHI: &'static [(char, char)] = &[
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '𐩈'),
+ ('𐩐', '𐩘'),
+];
+
+pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] =
+ &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')];
+
+pub const KHMER: &'static [(char, char)] =
+ &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')];
+
+pub const KHOJKI: &'static [(char, char)] = &[('𑈀', '𑈑'), ('𑈓', '\u{11241}')];
+
+pub const KHUDAWADI: &'static [(char, char)] =
+ &[('𑊰', '\u{112ea}'), ('𑋰', '𑋹')];
+
+pub const LAO: &'static [(char, char)] = &[
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+];
+
+pub const LATIN: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ʸ'),
+ ('ˠ', 'ˤ'),
+ ('ᴀ', 'ᴥ'),
+ ('ᴬ', 'ᵜ'),
+ ('ᵢ', 'ᵥ'),
+ ('ᵫ', 'ᵷ'),
+ ('ᵹ', 'ᶾ'),
+ ('Ḁ', 'ỿ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('K', 'Å'),
+ ('Ⅎ', 'Ⅎ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⱡ', 'Ɀ'),
+ ('Ꜣ', 'ꞇ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꟿ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭤ'),
+ ('ꭦ', 'ꭩ'),
+ ('ff', 'st'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+];
+
+pub const LEPCHA: &'static [(char, char)] =
+ &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')];
+
+pub const LIMBU: &'static [(char, char)] = &[
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥀', '᥀'),
+ ('᥄', '᥏'),
+];
+
+pub const LINEAR_A: &'static [(char, char)] =
+ &[('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')];
+
+pub const LINEAR_B: &'static [(char, char)] = &[
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+];
+
+pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')];
+
+pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')];
+
+pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')];
+
+pub const MAHAJANI: &'static [(char, char)] = &[('𑅐', '𑅶')];
+
+pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')];
+
+pub const MALAYALAM: &'static [(char, char)] = &[
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', '൏'),
+ ('ൔ', '\u{d63}'),
+ ('൦', 'ൿ'),
+];
+
+pub const MANDAIC: &'static [(char, char)] = &[('ࡀ', '\u{85b}'), ('࡞', '࡞')];
+
+pub const MANICHAEAN: &'static [(char, char)] =
+ &[('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')];
+
+pub const MARCHEN: &'static [(char, char)] =
+ &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')];
+
+pub const MASARAM_GONDI: &'static [(char, char)] = &[
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+];
+
+pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')];
+
+pub const MEETEI_MAYEK: &'static [(char, char)] =
+ &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')];
+
+pub const MENDE_KIKAKUI: &'static [(char, char)] =
+ &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')];
+
+pub const MEROITIC_CURSIVE: &'static [(char, char)] =
+ &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')];
+
+pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')];
+
+pub const MIAO: &'static [(char, char)] =
+ &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')];
+
+pub const MODI: &'static [(char, char)] = &[('𑘀', '𑙄'), ('𑙐', '𑙙')];
+
+pub const MONGOLIAN: &'static [(char, char)] =
+ &[('᠀', '᠁'), ('᠄', '᠄'), ('᠆', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('𑙠', '𑙬')];
+
+pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')];
+
+pub const MULTANI: &'static [(char, char)] =
+ &[('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')];
+
+pub const MYANMAR: &'static [(char, char)] =
+ &[('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')];
+
+pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')];
+
+pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')];
+
+pub const NANDINAGARI: &'static [(char, char)] =
+ &[('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤')];
+
+pub const NEW_TAI_LUE: &'static [(char, char)] =
+ &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')];
+
+pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')];
+
+pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')];
+
+pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')];
+
+pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] =
+ &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')];
+
+pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')];
+
+pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')];
+
+pub const OLD_HUNGARIAN: &'static [(char, char)] =
+ &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')];
+
+pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')];
+
+pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')];
+
+pub const OLD_PERMIC: &'static [(char, char)] = &[('𐍐', '\u{1037a}')];
+
+pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')];
+
+pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')];
+
+pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')];
+
+pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')];
+
+pub const OLD_UYGHUR: &'static [(char, char)] = &[('𐽰', '𐾉')];
+
+pub const ORIYA: &'static [(char, char)] = &[
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('\u{b3c}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('୦', '୷'),
+];
+
+pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')];
+
+pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')];
+
+pub const PAHAWH_HMONG: &'static [(char, char)] =
+ &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')];
+
+pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')];
+
+pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')];
+
+pub const PHAGS_PA: &'static [(char, char)] = &[('ꡀ', '꡷')];
+
+pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')];
+
+pub const PSALTER_PAHLAVI: &'static [(char, char)] =
+ &[('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')];
+
+pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')];
+
+pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')];
+
+pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')];
+
+pub const SAURASHTRA: &'static [(char, char)] =
+ &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')];
+
+pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', '𑇟')];
+
+pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')];
+
+pub const SIDDHAM: &'static [(char, char)] =
+ &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')];
+
+pub const SIGNWRITING: &'static [(char, char)] =
+ &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')];
+
+pub const SINHALA: &'static [(char, char)] = &[
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('෦', '෯'),
+ ('ෲ', '෴'),
+ ('𑇡', '𑇴'),
+];
+
+pub const SOGDIAN: &'static [(char, char)] = &[('𐼰', '𐽙')];
+
+pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')];
+
+pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')];
+
+pub const SUNDANESE: &'static [(char, char)] =
+ &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')];
+
+pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '\u{a82c}')];
+
+pub const SYRIAC: &'static [(char, char)] =
+ &[('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ')];
+
+pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ')];
+
+pub const TAGBANWA: &'static [(char, char)] =
+ &[('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')];
+
+pub const TAI_LE: &'static [(char, char)] = &[('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')];
+
+pub const TAI_THAM: &'static [(char, char)] = &[
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('᪠', '᪭'),
+];
+
+pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')];
+
+pub const TAKRI: &'static [(char, char)] = &[('𑚀', '𑚹'), ('𑛀', '𑛉')];
+
+pub const TAMIL: &'static [(char, char)] = &[
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௦', '௺'),
+ ('𑿀', '𑿱'),
+ ('𑿿', '𑿿'),
+];
+
+pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')];
+
+pub const TANGUT: &'static [(char, char)] =
+ &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')];
+
+pub const TELUGU: &'static [(char, char)] = &[
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('\u{c3c}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('౦', '౯'),
+ ('౷', '౿'),
+];
+
+pub const THAANA: &'static [(char, char)] = &[('ހ', 'ޱ')];
+
+pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')];
+
+pub const TIBETAN: &'static [(char, char)] = &[
+ ('ༀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('྾', '࿌'),
+ ('࿎', '࿔'),
+ ('࿙', '࿚'),
+];
+
+pub const TIFINAGH: &'static [(char, char)] =
+ &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')];
+
+pub const TIRHUTA: &'static [(char, char)] = &[('𑒀', '𑓇'), ('𑓐', '𑓙')];
+
+pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')];
+
+pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')];
+
+pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')];
+
+pub const VITHKUQI: &'static [(char, char)] = &[
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+];
+
+pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')];
+
+pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')];
+
+pub const YEZIDI: &'static [(char, char)] =
+ &[('𐺀', '𐺩'), ('\u{10eab}', '𐺭'), ('𐺰', '𐺱')];
+
+pub const YI: &'static [(char, char)] = &[('ꀀ', 'ꒌ'), ('꒐', '꓆')];
+
+pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/script_extension.rs b/third_party/rust/regex-syntax/src/unicode_tables/script_extension.rs
new file mode 100644
index 0000000000..42625e21b9
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/script_extension.rs
@@ -0,0 +1,1457 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate script-extension ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("Adlam", ADLAM),
+ ("Ahom", AHOM),
+ ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
+ ("Arabic", ARABIC),
+ ("Armenian", ARMENIAN),
+ ("Avestan", AVESTAN),
+ ("Balinese", BALINESE),
+ ("Bamum", BAMUM),
+ ("Bassa_Vah", BASSA_VAH),
+ ("Batak", BATAK),
+ ("Bengali", BENGALI),
+ ("Bhaiksuki", BHAIKSUKI),
+ ("Bopomofo", BOPOMOFO),
+ ("Brahmi", BRAHMI),
+ ("Braille", BRAILLE),
+ ("Buginese", BUGINESE),
+ ("Buhid", BUHID),
+ ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
+ ("Carian", CARIAN),
+ ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
+ ("Chakma", CHAKMA),
+ ("Cham", CHAM),
+ ("Cherokee", CHEROKEE),
+ ("Chorasmian", CHORASMIAN),
+ ("Common", COMMON),
+ ("Coptic", COPTIC),
+ ("Cuneiform", CUNEIFORM),
+ ("Cypriot", CYPRIOT),
+ ("Cypro_Minoan", CYPRO_MINOAN),
+ ("Cyrillic", CYRILLIC),
+ ("Deseret", DESERET),
+ ("Devanagari", DEVANAGARI),
+ ("Dives_Akuru", DIVES_AKURU),
+ ("Dogra", DOGRA),
+ ("Duployan", DUPLOYAN),
+ ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
+ ("Elbasan", ELBASAN),
+ ("Elymaic", ELYMAIC),
+ ("Ethiopic", ETHIOPIC),
+ ("Georgian", GEORGIAN),
+ ("Glagolitic", GLAGOLITIC),
+ ("Gothic", GOTHIC),
+ ("Grantha", GRANTHA),
+ ("Greek", GREEK),
+ ("Gujarati", GUJARATI),
+ ("Gunjala_Gondi", GUNJALA_GONDI),
+ ("Gurmukhi", GURMUKHI),
+ ("Han", HAN),
+ ("Hangul", HANGUL),
+ ("Hanifi_Rohingya", HANIFI_ROHINGYA),
+ ("Hanunoo", HANUNOO),
+ ("Hatran", HATRAN),
+ ("Hebrew", HEBREW),
+ ("Hiragana", HIRAGANA),
+ ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
+ ("Inherited", INHERITED),
+ ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
+ ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
+ ("Javanese", JAVANESE),
+ ("Kaithi", KAITHI),
+ ("Kannada", KANNADA),
+ ("Katakana", KATAKANA),
+ ("Kawi", KAWI),
+ ("Kayah_Li", KAYAH_LI),
+ ("Kharoshthi", KHAROSHTHI),
+ ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
+ ("Khmer", KHMER),
+ ("Khojki", KHOJKI),
+ ("Khudawadi", KHUDAWADI),
+ ("Lao", LAO),
+ ("Latin", LATIN),
+ ("Lepcha", LEPCHA),
+ ("Limbu", LIMBU),
+ ("Linear_A", LINEAR_A),
+ ("Linear_B", LINEAR_B),
+ ("Lisu", LISU),
+ ("Lycian", LYCIAN),
+ ("Lydian", LYDIAN),
+ ("Mahajani", MAHAJANI),
+ ("Makasar", MAKASAR),
+ ("Malayalam", MALAYALAM),
+ ("Mandaic", MANDAIC),
+ ("Manichaean", MANICHAEAN),
+ ("Marchen", MARCHEN),
+ ("Masaram_Gondi", MASARAM_GONDI),
+ ("Medefaidrin", MEDEFAIDRIN),
+ ("Meetei_Mayek", MEETEI_MAYEK),
+ ("Mende_Kikakui", MENDE_KIKAKUI),
+ ("Meroitic_Cursive", MEROITIC_CURSIVE),
+ ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
+ ("Miao", MIAO),
+ ("Modi", MODI),
+ ("Mongolian", MONGOLIAN),
+ ("Mro", MRO),
+ ("Multani", MULTANI),
+ ("Myanmar", MYANMAR),
+ ("Nabataean", NABATAEAN),
+ ("Nag_Mundari", NAG_MUNDARI),
+ ("Nandinagari", NANDINAGARI),
+ ("New_Tai_Lue", NEW_TAI_LUE),
+ ("Newa", NEWA),
+ ("Nko", NKO),
+ ("Nushu", NUSHU),
+ ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
+ ("Ogham", OGHAM),
+ ("Ol_Chiki", OL_CHIKI),
+ ("Old_Hungarian", OLD_HUNGARIAN),
+ ("Old_Italic", OLD_ITALIC),
+ ("Old_North_Arabian", OLD_NORTH_ARABIAN),
+ ("Old_Permic", OLD_PERMIC),
+ ("Old_Persian", OLD_PERSIAN),
+ ("Old_Sogdian", OLD_SOGDIAN),
+ ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
+ ("Old_Turkic", OLD_TURKIC),
+ ("Old_Uyghur", OLD_UYGHUR),
+ ("Oriya", ORIYA),
+ ("Osage", OSAGE),
+ ("Osmanya", OSMANYA),
+ ("Pahawh_Hmong", PAHAWH_HMONG),
+ ("Palmyrene", PALMYRENE),
+ ("Pau_Cin_Hau", PAU_CIN_HAU),
+ ("Phags_Pa", PHAGS_PA),
+ ("Phoenician", PHOENICIAN),
+ ("Psalter_Pahlavi", PSALTER_PAHLAVI),
+ ("Rejang", REJANG),
+ ("Runic", RUNIC),
+ ("Samaritan", SAMARITAN),
+ ("Saurashtra", SAURASHTRA),
+ ("Sharada", SHARADA),
+ ("Shavian", SHAVIAN),
+ ("Siddham", SIDDHAM),
+ ("SignWriting", SIGNWRITING),
+ ("Sinhala", SINHALA),
+ ("Sogdian", SOGDIAN),
+ ("Sora_Sompeng", SORA_SOMPENG),
+ ("Soyombo", SOYOMBO),
+ ("Sundanese", SUNDANESE),
+ ("Syloti_Nagri", SYLOTI_NAGRI),
+ ("Syriac", SYRIAC),
+ ("Tagalog", TAGALOG),
+ ("Tagbanwa", TAGBANWA),
+ ("Tai_Le", TAI_LE),
+ ("Tai_Tham", TAI_THAM),
+ ("Tai_Viet", TAI_VIET),
+ ("Takri", TAKRI),
+ ("Tamil", TAMIL),
+ ("Tangsa", TANGSA),
+ ("Tangut", TANGUT),
+ ("Telugu", TELUGU),
+ ("Thaana", THAANA),
+ ("Thai", THAI),
+ ("Tibetan", TIBETAN),
+ ("Tifinagh", TIFINAGH),
+ ("Tirhuta", TIRHUTA),
+ ("Toto", TOTO),
+ ("Ugaritic", UGARITIC),
+ ("Vai", VAI),
+ ("Vithkuqi", VITHKUQI),
+ ("Wancho", WANCHO),
+ ("Warang_Citi", WARANG_CITI),
+ ("Yezidi", YEZIDI),
+ ("Yi", YI),
+ ("Zanabazar_Square", ZANABAZAR_SQUARE),
+];
+
+pub const ADLAM: &'static [(char, char)] =
+ &[('؟', '؟'), ('ـ', 'ـ'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')];
+
+pub const AHOM: &'static [(char, char)] =
+ &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')];
+
+pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')];
+
+pub const ARABIC: &'static [(char, char)] = &[
+ ('\u{600}', '\u{604}'),
+ ('؆', '\u{6dc}'),
+ ('۞', 'ۿ'),
+ ('ݐ', 'ݿ'),
+ ('ࡰ', 'ࢎ'),
+ ('\u{890}', '\u{891}'),
+ ('\u{898}', '\u{8e1}'),
+ ('\u{8e3}', '\u{8ff}'),
+ ('ﭐ', '﯂'),
+ ('ﯓ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('﷏', '﷏'),
+ ('ﷰ', '﷿'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('\u{102e0}', '𐋻'),
+ ('𐹠', '𐹾'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𞻰', '𞻱'),
+];
+
+pub const ARMENIAN: &'static [(char, char)] =
+ &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')];
+
+pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')];
+
+pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')];
+
+pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')];
+
+pub const BASSA_VAH: &'static [(char, char)] =
+ &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')];
+
+pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')];
+
+pub const BENGALI: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('ঀ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', '\u{9fe}'),
+ ('\u{1cd0}', '\u{1cd0}'),
+ ('\u{1cd2}', '\u{1cd2}'),
+ ('\u{1cd5}', '\u{1cd6}'),
+ ('\u{1cd8}', '\u{1cd8}'),
+ ('᳡', '᳡'),
+ ('ᳪ', 'ᳪ'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('ᳲ', 'ᳲ'),
+ ('ᳵ', '᳷'),
+ ('\u{a8f1}', '\u{a8f1}'),
+];
+
+pub const BHAIKSUKI: &'static [(char, char)] =
+ &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')];
+
+pub const BOPOMOFO: &'static [(char, char)] = &[
+ ('˪', '˫'),
+ ('、', '〃'),
+ ('〈', '】'),
+ ('〓', '〟'),
+ ('\u{302a}', '\u{302d}'),
+ ('〰', '〰'),
+ ('〷', '〷'),
+ ('・', '・'),
+ ('ㄅ', 'ㄯ'),
+ ('ㆠ', 'ㆿ'),
+ ('﹅', '﹆'),
+ ('。', '・'),
+];
+
+pub const BRAHMI: &'static [(char, char)] =
+ &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')];
+
+pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')];
+
+pub const BUGINESE: &'static [(char, char)] =
+ &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ꧏ', 'ꧏ')];
+
+pub const BUHID: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝀ', '\u{1753}')];
+
+pub const CANADIAN_ABORIGINAL: &'static [(char, char)] =
+ &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')];
+
+pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')];
+
+pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] =
+ &[('𐔰', '𐕣'), ('𐕯', '𐕯')];
+
+pub const CHAKMA: &'static [(char, char)] =
+ &[('০', '৯'), ('၀', '၉'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')];
+
+pub const CHAM: &'static [(char, char)] =
+ &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')];
+
+pub const CHEROKEE: &'static [(char, char)] =
+ &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')];
+
+pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')];
+
+pub const COMMON: &'static [(char, char)] = &[
+ ('\0', '@'),
+ ('[', '`'),
+ ('{', '©'),
+ ('«', '¹'),
+ ('»', '¿'),
+ ('×', '×'),
+ ('÷', '÷'),
+ ('ʹ', '˟'),
+ ('˥', '˩'),
+ ('ˬ', '˿'),
+ ('ʹ', 'ʹ'),
+ (';', ';'),
+ ('΅', '΅'),
+ ('·', '·'),
+ ('\u{605}', '\u{605}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('฿', '฿'),
+ ('࿕', '࿘'),
+ ('᛫', '᛭'),
+ ('\u{2000}', '\u{200b}'),
+ ('\u{200e}', '\u{202e}'),
+ ('‰', '\u{2064}'),
+ ('\u{2066}', '⁰'),
+ ('⁴', '⁾'),
+ ('₀', '₎'),
+ ('₠', '⃀'),
+ ('℀', '℥'),
+ ('℧', '℩'),
+ ('ℬ', 'ℱ'),
+ ('ℳ', '⅍'),
+ ('⅏', '⅟'),
+ ('↉', '↋'),
+ ('←', '␦'),
+ ('⑀', '⑊'),
+ ('①', '⟿'),
+ ('⤀', '⭳'),
+ ('⭶', '⮕'),
+ ('⮗', '⯿'),
+ ('⸀', '⹂'),
+ ('⹄', '⹝'),
+ ('⿰', '⿻'),
+ ('\u{3000}', '\u{3000}'),
+ ('〄', '〄'),
+ ('〒', '〒'),
+ ('〠', '〠'),
+ ('〶', '〶'),
+ ('㉈', '㉟'),
+ ('㉿', '㉿'),
+ ('㊱', '㊿'),
+ ('㋌', '㋏'),
+ ('㍱', '㍺'),
+ ('㎀', '㏟'),
+ ('㏿', '㏿'),
+ ('䷀', '䷿'),
+ ('꜈', '꜡'),
+ ('ꞈ', '꞊'),
+ ('꭛', '꭛'),
+ ('꭪', '꭫'),
+ ('︐', '︙'),
+ ('︰', '﹄'),
+ ('﹇', '﹒'),
+ ('﹔', '﹦'),
+ ('﹨', '﹫'),
+ ('\u{feff}', '\u{feff}'),
+ ('!', '@'),
+ ('[', '`'),
+ ('{', '⦆'),
+ ('¢', '₩'),
+ ('│', '○'),
+ ('\u{fff9}', '�'),
+ ('𐆐', '𐆜'),
+ ('𐇐', '𐇼'),
+ ('𜽐', '𜿃'),
+ ('𝀀', '𝃵'),
+ ('𝄀', '𝄦'),
+ ('𝄩', '𝅦'),
+ ('𝅪', '\u{1d17a}'),
+ ('𝆃', '𝆄'),
+ ('𝆌', '𝆩'),
+ ('𝆮', '𝇪'),
+ ('𝋀', '𝋓'),
+ ('𝋠', '𝋳'),
+ ('𝌀', '𝍖'),
+ ('𝍲', '𝍸'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('𞱱', '𞲴'),
+ ('𞴁', '𞴽'),
+ ('🀀', '🀫'),
+ ('🀰', '🂓'),
+ ('🂠', '🂮'),
+ ('🂱', '🂿'),
+ ('🃁', '🃏'),
+ ('🃑', '🃵'),
+ ('🄀', '🆭'),
+ ('🇦', '🇿'),
+ ('🈁', '🈂'),
+ ('🈐', '🈻'),
+ ('🉀', '🉈'),
+ ('🉠', '🉥'),
+ ('🌀', '🛗'),
+ ('🛜', '🛬'),
+ ('🛰', '🛼'),
+ ('🜀', '🝶'),
+ ('🝻', '🟙'),
+ ('🟠', '🟫'),
+ ('🟰', '🟰'),
+ ('🠀', '🠋'),
+ ('🠐', '🡇'),
+ ('🡐', '🡙'),
+ ('🡠', '🢇'),
+ ('🢐', '🢭'),
+ ('🢰', '🢱'),
+ ('🤀', '🩓'),
+ ('🩠', '🩭'),
+ ('🩰', '🩼'),
+ ('🪀', '🪈'),
+ ('🪐', '🪽'),
+ ('🪿', '🫅'),
+ ('🫎', '🫛'),
+ ('🫠', '🫨'),
+ ('🫰', '🫸'),
+ ('🬀', '🮒'),
+ ('🮔', '🯊'),
+ ('🯰', '🯹'),
+ ('\u{e0001}', '\u{e0001}'),
+ ('\u{e0020}', '\u{e007f}'),
+];
+
+pub const COPTIC: &'static [(char, char)] =
+ &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), ('\u{102e0}', '𐋻')];
+
+pub const CUNEIFORM: &'static [(char, char)] =
+ &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')];
+
+pub const CYPRIOT: &'static [(char, char)] = &[
+ ('𐄀', '𐄂'),
+ ('𐄇', '𐄳'),
+ ('𐄷', '𐄿'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐠿'),
+];
+
+pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𐄀', '𐄁'), ('𒾐', '𒿲')];
+
+pub const CYRILLIC: &'static [(char, char)] = &[
+ ('Ѐ', 'ԯ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᴫ', 'ᴫ'),
+ ('ᵸ', 'ᵸ'),
+ ('\u{1df8}', '\u{1df8}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('⹃', '⹃'),
+ ('Ꙁ', '\u{a69f}'),
+ ('\u{fe2e}', '\u{fe2f}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+];
+
+pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')];
+
+pub const DEVANAGARI: &'static [(char, char)] = &[
+ ('\u{900}', '\u{952}'),
+ ('\u{955}', 'ॿ'),
+ ('\u{1cd0}', 'ᳶ'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{20f0}', '\u{20f0}'),
+ ('꠰', '꠹'),
+ ('\u{a8e0}', '\u{a8ff}'),
+ ('𑬀', '𑬉'),
+];
+
+pub const DIVES_AKURU: &'static [(char, char)] = &[
+ ('𑤀', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '𑥆'),
+ ('𑥐', '𑥙'),
+];
+
+pub const DOGRA: &'static [(char, char)] =
+ &[('।', '९'), ('꠰', '꠹'), ('𑠀', '𑠻')];
+
+pub const DUPLOYAN: &'static [(char, char)] =
+ &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}')];
+
+pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] =
+ &[('𓀀', '\u{13455}')];
+
+pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')];
+
+pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')];
+
+pub const ETHIOPIC: &'static [(char, char)] = &[
+ ('ሀ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('\u{135d}', '፼'),
+ ('ᎀ', '᎙'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+];
+
+pub const GEORGIAN: &'static [(char, char)] = &[
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჿ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+];
+
+pub const GLAGOLITIC: &'static [(char, char)] = &[
+ ('\u{484}', '\u{484}'),
+ ('\u{487}', '\u{487}'),
+ ('Ⰰ', 'ⱟ'),
+ ('⹃', '⹃'),
+ ('\u{a66f}', '\u{a66f}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+];
+
+pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')];
+
+pub const GRANTHA: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('௦', '௳'),
+ ('\u{1cd0}', '\u{1cd0}'),
+ ('\u{1cd2}', '᳓'),
+ ('ᳲ', '\u{1cf4}'),
+ ('\u{1cf8}', '\u{1cf9}'),
+ ('\u{20f0}', '\u{20f0}'),
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133b}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑿐', '𑿑'),
+ ('𑿓', '𑿓'),
+];
+
+pub const GREEK: &'static [(char, char)] = &[
+ ('\u{342}', '\u{342}'),
+ ('\u{345}', '\u{345}'),
+ ('Ͱ', 'ͳ'),
+ ('͵', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('΄', '΄'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϡ'),
+ ('ϰ', 'Ͽ'),
+ ('ᴦ', 'ᴪ'),
+ ('ᵝ', 'ᵡ'),
+ ('ᵦ', 'ᵪ'),
+ ('ᶿ', '\u{1dc1}'),
+ ('ἀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ῄ'),
+ ('ῆ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('῝', '`'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', '῾'),
+ ('Ω', 'Ω'),
+ ('ꭥ', 'ꭥ'),
+ ('𐅀', '𐆎'),
+ ('𐆠', '𐆠'),
+ ('𝈀', '𝉅'),
+];
+
+pub const GUJARATI: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('૦', '૱'),
+ ('ૹ', '\u{aff}'),
+ ('꠰', '꠹'),
+];
+
+pub const GUNJALA_GONDI: &'static [(char, char)] = &[
+ ('।', '॥'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+];
+
+pub const GURMUKHI: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '੶'),
+ ('꠰', '꠹'),
+];
+
+pub const HAN: &'static [(char, char)] = &[
+ ('⺀', '⺙'),
+ ('⺛', '⻳'),
+ ('⼀', '⿕'),
+ ('、', '〃'),
+ ('々', '】'),
+ ('〓', '〟'),
+ ('〡', '\u{302d}'),
+ ('〰', '〰'),
+ ('〷', '〿'),
+ ('・', '・'),
+ ('㆐', '㆟'),
+ ('㇀', '㇣'),
+ ('㈠', '㉇'),
+ ('㊀', '㊰'),
+ ('㋀', '㋋'),
+ ('㋿', '㋿'),
+ ('㍘', '㍰'),
+ ('㍻', '㍿'),
+ ('㏠', '㏾'),
+ ('㐀', '䶿'),
+ ('一', '鿿'),
+ ('꜀', '꜇'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('﹅', '﹆'),
+ ('。', '・'),
+ ('𖿢', '𖿣'),
+ ('𖿰', '𖿱'),
+ ('𝍠', '𝍱'),
+ ('🉐', '🉑'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const HANGUL: &'static [(char, char)] = &[
+ ('ᄀ', 'ᇿ'),
+ ('、', '〃'),
+ ('〈', '】'),
+ ('〓', '〟'),
+ ('\u{302e}', '〰'),
+ ('〷', '〷'),
+ ('・', '・'),
+ ('ㄱ', 'ㆎ'),
+ ('㈀', '㈞'),
+ ('㉠', '㉾'),
+ ('ꥠ', 'ꥼ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('﹅', '﹆'),
+ ('。', '・'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+];
+
+pub const HANIFI_ROHINGYA: &'static [(char, char)] = &[
+ ('،', '،'),
+ ('؛', '؛'),
+ ('؟', '؟'),
+ ('ـ', 'ـ'),
+ ('۔', '۔'),
+ ('𐴀', '\u{10d27}'),
+ ('𐴰', '𐴹'),
+];
+
+pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜶')];
+
+pub const HATRAN: &'static [(char, char)] =
+ &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')];
+
+pub const HEBREW: &'static [(char, char)] = &[
+ ('\u{591}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', '״'),
+ ('יִ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﭏ'),
+];
+
+pub const HIRAGANA: &'static [(char, char)] = &[
+ ('、', '〃'),
+ ('〈', '】'),
+ ('〓', '〟'),
+ ('〰', '〵'),
+ ('〷', '〷'),
+ ('〼', '〽'),
+ ('ぁ', 'ゖ'),
+ ('\u{3099}', '゠'),
+ ('・', 'ー'),
+ ('﹅', '﹆'),
+ ('。', '・'),
+ ('ー', 'ー'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('𛀁', '𛄟'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('🈀', '🈀'),
+];
+
+pub const IMPERIAL_ARAMAIC: &'static [(char, char)] =
+ &[('𐡀', '𐡕'), ('𐡗', '𐡟')];
+
+pub const INHERITED: &'static [(char, char)] = &[
+ ('\u{300}', '\u{341}'),
+ ('\u{343}', '\u{344}'),
+ ('\u{346}', '\u{362}'),
+ ('\u{953}', '\u{954}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1dc2}', '\u{1df7}'),
+ ('\u{1df9}', '\u{1df9}'),
+ ('\u{1dfb}', '\u{1dff}'),
+ ('\u{200c}', '\u{200d}'),
+ ('\u{20d0}', '\u{20ef}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2d}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d167}', '\u{1d169}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] =
+ &[('𐭠', '𐭲'), ('𐭸', '𐭿')];
+
+pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] =
+ &[('𐭀', '𐭕'), ('𐭘', '𐭟')];
+
+pub const JAVANESE: &'static [(char, char)] =
+ &[('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟')];
+
+pub const KAITHI: &'static [(char, char)] = &[
+ ('०', '९'),
+ ('꠰', '꠹'),
+ ('\u{11080}', '\u{110c2}'),
+ ('\u{110cd}', '\u{110cd}'),
+];
+
+pub const KANNADA: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('ಀ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('\u{cbc}', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+ ('\u{1cd0}', '\u{1cd0}'),
+ ('\u{1cd2}', '\u{1cd2}'),
+ ('\u{1cda}', '\u{1cda}'),
+ ('ᳲ', 'ᳲ'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('꠰', '꠵'),
+];
+
+pub const KATAKANA: &'static [(char, char)] = &[
+ ('、', '〃'),
+ ('〈', '】'),
+ ('〓', '〟'),
+ ('〰', '〵'),
+ ('〷', '〷'),
+ ('〼', '〽'),
+ ('\u{3099}', '゜'),
+ ('゠', 'ヿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㋐', '㋾'),
+ ('㌀', '㍗'),
+ ('﹅', '﹆'),
+ ('。', '\u{ff9f}'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛀀'),
+ ('𛄠', '𛄢'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+];
+
+pub const KAWI: &'static [(char, char)] =
+ &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')];
+
+pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '꤯')];
+
+pub const KHAROSHTHI: &'static [(char, char)] = &[
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '𐩈'),
+ ('𐩐', '𐩘'),
+];
+
+pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] =
+ &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')];
+
+pub const KHMER: &'static [(char, char)] =
+ &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')];
+
+pub const KHOJKI: &'static [(char, char)] =
+ &[('૦', '૯'), ('꠰', '꠹'), ('𑈀', '𑈑'), ('𑈓', '\u{11241}')];
+
+pub const KHUDAWADI: &'static [(char, char)] =
+ &[('।', '॥'), ('꠰', '꠹'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹')];
+
+pub const LAO: &'static [(char, char)] = &[
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+];
+
+pub const LATIN: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ʸ'),
+ ('ˠ', 'ˤ'),
+ ('\u{363}', '\u{36f}'),
+ ('\u{485}', '\u{486}'),
+ ('\u{951}', '\u{952}'),
+ ('჻', '჻'),
+ ('ᴀ', 'ᴥ'),
+ ('ᴬ', 'ᵜ'),
+ ('ᵢ', 'ᵥ'),
+ ('ᵫ', 'ᵷ'),
+ ('ᵹ', 'ᶾ'),
+ ('Ḁ', 'ỿ'),
+ ('\u{202f}', '\u{202f}'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('\u{20f0}', '\u{20f0}'),
+ ('K', 'Å'),
+ ('Ⅎ', 'Ⅎ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⱡ', 'Ɀ'),
+ ('꜀', '꜇'),
+ ('Ꜣ', 'ꞇ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꟿ'),
+ ('꤮', '꤮'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭤ'),
+ ('ꭦ', 'ꭩ'),
+ ('ff', 'st'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+];
+
+pub const LEPCHA: &'static [(char, char)] =
+ &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')];
+
+pub const LIMBU: &'static [(char, char)] = &[
+ ('॥', '॥'),
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥀', '᥀'),
+ ('᥄', '᥏'),
+];
+
+pub const LINEAR_A: &'static [(char, char)] =
+ &[('𐄇', '𐄳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')];
+
+pub const LINEAR_B: &'static [(char, char)] = &[
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐄀', '𐄂'),
+ ('𐄇', '𐄳'),
+ ('𐄷', '𐄿'),
+];
+
+pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')];
+
+pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')];
+
+pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')];
+
+pub const MAHAJANI: &'static [(char, char)] =
+ &[('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶')];
+
+pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')];
+
+pub const MALAYALAM: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', '൏'),
+ ('ൔ', '\u{d63}'),
+ ('൦', 'ൿ'),
+ ('\u{1cda}', '\u{1cda}'),
+ ('꠰', '꠲'),
+];
+
+pub const MANDAIC: &'static [(char, char)] =
+ &[('ـ', 'ـ'), ('ࡀ', '\u{85b}'), ('࡞', '࡞')];
+
+pub const MANICHAEAN: &'static [(char, char)] =
+ &[('ـ', 'ـ'), ('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')];
+
+pub const MARCHEN: &'static [(char, char)] =
+ &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')];
+
+pub const MASARAM_GONDI: &'static [(char, char)] = &[
+ ('।', '॥'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+];
+
+pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')];
+
+pub const MEETEI_MAYEK: &'static [(char, char)] =
+ &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')];
+
+pub const MENDE_KIKAKUI: &'static [(char, char)] =
+ &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')];
+
+pub const MEROITIC_CURSIVE: &'static [(char, char)] =
+ &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')];
+
+pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')];
+
+pub const MIAO: &'static [(char, char)] =
+ &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')];
+
+pub const MODI: &'static [(char, char)] =
+ &[('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙')];
+
+pub const MONGOLIAN: &'static [(char, char)] = &[
+ ('᠀', '᠙'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢪ'),
+ ('\u{202f}', '\u{202f}'),
+ ('𑙠', '𑙬'),
+];
+
+pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')];
+
+pub const MULTANI: &'static [(char, char)] =
+ &[('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')];
+
+pub const MYANMAR: &'static [(char, char)] =
+ &[('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')];
+
+pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')];
+
+pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')];
+
+pub const NANDINAGARI: &'static [(char, char)] = &[
+ ('।', '॥'),
+ ('೦', '೯'),
+ ('ᳩ', 'ᳩ'),
+ ('ᳲ', 'ᳲ'),
+ ('ᳺ', 'ᳺ'),
+ ('꠰', '꠵'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧤'),
+];
+
+pub const NEW_TAI_LUE: &'static [(char, char)] =
+ &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')];
+
+pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')];
+
+pub const NKO: &'static [(char, char)] = &[
+ ('،', '،'),
+ ('؛', '؛'),
+ ('؟', '؟'),
+ ('߀', 'ߺ'),
+ ('\u{7fd}', '߿'),
+ ('﴾', '﴿'),
+];
+
+pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')];
+
+pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] =
+ &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')];
+
+pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')];
+
+pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')];
+
+pub const OLD_HUNGARIAN: &'static [(char, char)] =
+ &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')];
+
+pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')];
+
+pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')];
+
+pub const OLD_PERMIC: &'static [(char, char)] =
+ &[('\u{483}', '\u{483}'), ('𐍐', '\u{1037a}')];
+
+pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')];
+
+pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')];
+
+pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')];
+
+pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')];
+
+pub const OLD_UYGHUR: &'static [(char, char)] =
+ &[('ـ', 'ـ'), ('𐫲', '𐫲'), ('𐽰', '𐾉')];
+
+pub const ORIYA: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('\u{b3c}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('୦', '୷'),
+ ('\u{1cda}', '\u{1cda}'),
+ ('ᳲ', 'ᳲ'),
+];
+
+pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')];
+
+pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')];
+
+pub const PAHAWH_HMONG: &'static [(char, char)] =
+ &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')];
+
+pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')];
+
+pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')];
+
+pub const PHAGS_PA: &'static [(char, char)] =
+ &[('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷')];
+
+pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')];
+
+pub const PSALTER_PAHLAVI: &'static [(char, char)] =
+ &[('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')];
+
+pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')];
+
+pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')];
+
+pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')];
+
+pub const SAURASHTRA: &'static [(char, char)] =
+ &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')];
+
+pub const SHARADA: &'static [(char, char)] = &[
+ ('\u{951}', '\u{951}'),
+ ('\u{1cd7}', '\u{1cd7}'),
+ ('\u{1cd9}', '\u{1cd9}'),
+ ('\u{1cdc}', '\u{1cdd}'),
+ ('\u{1ce0}', '\u{1ce0}'),
+ ('\u{11180}', '𑇟'),
+];
+
+pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')];
+
+pub const SIDDHAM: &'static [(char, char)] =
+ &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')];
+
+pub const SIGNWRITING: &'static [(char, char)] =
+ &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')];
+
+pub const SINHALA: &'static [(char, char)] = &[
+ ('।', '॥'),
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('෦', '෯'),
+ ('ෲ', '෴'),
+ ('𑇡', '𑇴'),
+];
+
+pub const SOGDIAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐼰', '𐽙')];
+
+pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')];
+
+pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')];
+
+pub const SUNDANESE: &'static [(char, char)] =
+ &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')];
+
+pub const SYLOTI_NAGRI: &'static [(char, char)] =
+ &[('।', '॥'), ('০', '৯'), ('ꠀ', '\u{a82c}')];
+
+pub const SYRIAC: &'static [(char, char)] = &[
+ ('،', '،'),
+ ('؛', '\u{61c}'),
+ ('؟', '؟'),
+ ('ـ', 'ـ'),
+ ('\u{64b}', '\u{655}'),
+ ('\u{670}', '\u{670}'),
+ ('܀', '܍'),
+ ('\u{70f}', '\u{74a}'),
+ ('ݍ', 'ݏ'),
+ ('ࡠ', 'ࡪ'),
+ ('\u{1df8}', '\u{1df8}'),
+ ('\u{1dfa}', '\u{1dfa}'),
+];
+
+pub const TAGALOG: &'static [(char, char)] =
+ &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ'), ('᜵', '᜶')];
+
+pub const TAGBANWA: &'static [(char, char)] =
+ &[('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')];
+
+pub const TAI_LE: &'static [(char, char)] =
+ &[('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')];
+
+pub const TAI_THAM: &'static [(char, char)] = &[
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('᪠', '᪭'),
+];
+
+pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')];
+
+pub const TAKRI: &'static [(char, char)] =
+ &[('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚹'), ('𑛀', '𑛉')];
+
+pub const TAMIL: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௦', '௺'),
+ ('\u{1cda}', '\u{1cda}'),
+ ('ꣳ', 'ꣳ'),
+ ('\u{11301}', '\u{11301}'),
+ ('𑌃', '𑌃'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('𑿀', '𑿱'),
+ ('𑿿', '𑿿'),
+];
+
+pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')];
+
+pub const TANGUT: &'static [(char, char)] =
+ &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')];
+
+pub const TELUGU: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('\u{c3c}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('౦', '౯'),
+ ('౷', '౿'),
+ ('\u{1cda}', '\u{1cda}'),
+ ('ᳲ', 'ᳲ'),
+];
+
+pub const THAANA: &'static [(char, char)] = &[
+ ('،', '،'),
+ ('؛', '\u{61c}'),
+ ('؟', '؟'),
+ ('٠', '٩'),
+ ('ހ', 'ޱ'),
+ ('ﷲ', 'ﷲ'),
+ ('﷽', '﷽'),
+];
+
+pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')];
+
+pub const TIBETAN: &'static [(char, char)] = &[
+ ('ༀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('྾', '࿌'),
+ ('࿎', '࿔'),
+ ('࿙', '࿚'),
+];
+
+pub const TIFINAGH: &'static [(char, char)] =
+ &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')];
+
+pub const TIRHUTA: &'static [(char, char)] = &[
+ ('\u{951}', '\u{952}'),
+ ('।', '॥'),
+ ('ᳲ', 'ᳲ'),
+ ('꠰', '꠹'),
+ ('𑒀', '𑓇'),
+ ('𑓐', '𑓙'),
+];
+
+pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')];
+
+pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')];
+
+pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')];
+
+pub const VITHKUQI: &'static [(char, char)] = &[
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+];
+
+pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')];
+
+pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')];
+
+pub const YEZIDI: &'static [(char, char)] = &[
+ ('،', '،'),
+ ('؛', '؛'),
+ ('؟', '؟'),
+ ('٠', '٩'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '𐺭'),
+ ('𐺰', '𐺱'),
+];
+
+pub const YI: &'static [(char, char)] = &[
+ ('、', '。'),
+ ('〈', '】'),
+ ('〔', '〛'),
+ ('・', '・'),
+ ('ꀀ', 'ꒌ'),
+ ('꒐', '꓆'),
+ ('。', '・'),
+];
+
+pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/sentence_break.rs b/third_party/rust/regex-syntax/src/unicode_tables/sentence_break.rs
new file mode 100644
index 0000000000..24348736f2
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/sentence_break.rs
@@ -0,0 +1,2477 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate sentence-break ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("ATerm", ATERM),
+ ("CR", CR),
+ ("Close", CLOSE),
+ ("Extend", EXTEND),
+ ("Format", FORMAT),
+ ("LF", LF),
+ ("Lower", LOWER),
+ ("Numeric", NUMERIC),
+ ("OLetter", OLETTER),
+ ("SContinue", SCONTINUE),
+ ("STerm", STERM),
+ ("Sep", SEP),
+ ("Sp", SP),
+ ("Upper", UPPER),
+];
+
+pub const ATERM: &'static [(char, char)] =
+ &[('.', '.'), ('․', '․'), ('﹒', '﹒'), ('.', '.')];
+
+pub const CR: &'static [(char, char)] = &[('\r', '\r')];
+
+pub const CLOSE: &'static [(char, char)] = &[
+ ('"', '"'),
+ ('\'', ')'),
+ ('[', '['),
+ (']', ']'),
+ ('{', '{'),
+ ('}', '}'),
+ ('«', '«'),
+ ('»', '»'),
+ ('༺', '༽'),
+ ('᚛', '᚜'),
+ ('‘', '‟'),
+ ('‹', '›'),
+ ('⁅', '⁆'),
+ ('⁽', '⁾'),
+ ('₍', '₎'),
+ ('⌈', '⌋'),
+ ('〈', '〉'),
+ ('❛', '❠'),
+ ('❨', '❵'),
+ ('⟅', '⟆'),
+ ('⟦', '⟯'),
+ ('⦃', '⦘'),
+ ('⧘', '⧛'),
+ ('⧼', '⧽'),
+ ('⸀', '⸍'),
+ ('⸜', '⸝'),
+ ('⸠', '⸩'),
+ ('⹂', '⹂'),
+ ('⹕', '⹜'),
+ ('〈', '】'),
+ ('〔', '〛'),
+ ('〝', '〟'),
+ ('﴾', '﴿'),
+ ('︗', '︘'),
+ ('︵', '﹄'),
+ ('﹇', '﹈'),
+ ('﹙', '﹞'),
+ ('(', ')'),
+ ('[', '['),
+ (']', ']'),
+ ('{', '{'),
+ ('}', '}'),
+ ('⦅', '⦆'),
+ ('「', '」'),
+ ('🙶', '🙸'),
+];
+
+pub const EXTEND: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{483}', '\u{489}'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6df}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', '\u{7f3}'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{819}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('\u{898}', '\u{89f}'),
+ ('\u{8ca}', '\u{8e1}'),
+ ('\u{8e3}', 'ः'),
+ ('\u{93a}', '\u{93c}'),
+ ('ा', 'ॏ'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', 'ঃ'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9be}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', '\u{9cd}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('\u{abc}', '\u{abc}'),
+ ('ા', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3e}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', 'ಃ'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('ಾ', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('ೳ', 'ೳ'),
+ ('\u{d00}', 'ഃ'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d3e}', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', '\u{d4d}'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', 'ඃ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', 'ෳ'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e47}', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', '༿'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('ါ', '\u{103e}'),
+ ('ၖ', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('ၢ', 'ၤ'),
+ ('ၧ', 'ၭ'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{108d}'),
+ ('ႏ', 'ႏ'),
+ ('ႚ', '\u{109d}'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '᜕'),
+ ('\u{1732}', '᜴'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('\u{1a17}', '\u{1a1b}'),
+ ('ᩕ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', 'ᬄ'),
+ ('\u{1b34}', '᭄'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', 'ᮂ'),
+ ('ᮡ', '\u{1bad}'),
+ ('\u{1be6}', '᯳'),
+ ('ᰤ', '\u{1c37}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('᳷', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{200c}', '\u{200d}'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{302a}', '\u{302f}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{a66f}', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('ꠣ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꢀ', 'ꢁ'),
+ ('ꢴ', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '꥓'),
+ ('\u{a980}', 'ꦃ'),
+ ('\u{a9b3}', '꧀'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', 'ꩍ'),
+ ('ꩻ', 'ꩽ'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('ꫫ', 'ꫯ'),
+ ('ꫵ', '\u{aaf6}'),
+ ('ꯣ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('𑀀', '𑀂'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '𑂂'),
+ ('𑂰', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{11134}'),
+ ('𑅅', '𑅆'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '𑆂'),
+ ('𑆳', '𑇀'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '\u{111cf}'),
+ ('𑈬', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112ea}'),
+ ('\u{11300}', '𑌃'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{1133e}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍢', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐵', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b0}', '\u{114c3}'),
+ ('\u{115af}', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('𑘰', '\u{11640}'),
+ ('\u{116ab}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑠬', '\u{1183a}'),
+ ('\u{11930}', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{1193e}'),
+ ('𑥀', '𑥀'),
+ ('𑥂', '\u{11943}'),
+ ('𑧑', '\u{119d7}'),
+ ('\u{119da}', '\u{119e0}'),
+ ('𑧤', '𑧤'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '𑨹'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a99}'),
+ ('𑰯', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('𑶊', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '\u{11d97}'),
+ ('\u{11ef3}', '𑻶'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('𑼃', '𑼃'),
+ ('𑼴', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('\u{13440}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('𖽑', '𖾇'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e4ec}', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e94a}'),
+ ('\u{e0020}', '\u{e007f}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const FORMAT: &'static [(char, char)] = &[
+ ('\u{ad}', '\u{ad}'),
+ ('\u{600}', '\u{605}'),
+ ('\u{61c}', '\u{61c}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{70f}', '\u{70f}'),
+ ('\u{890}', '\u{891}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('\u{180e}', '\u{180e}'),
+ ('\u{200b}', '\u{200b}'),
+ ('\u{200e}', '\u{200f}'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2060}', '\u{2064}'),
+ ('\u{2066}', '\u{206f}'),
+ ('\u{feff}', '\u{feff}'),
+ ('\u{fff9}', '\u{fffb}'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110cd}', '\u{110cd}'),
+ ('\u{13430}', '\u{1343f}'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('\u{1d173}', '\u{1d17a}'),
+ ('\u{e0001}', '\u{e0001}'),
+];
+
+pub const LF: &'static [(char, char)] = &[('\n', '\n')];
+
+pub const LOWER: &'static [(char, char)] = &[
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('ß', 'ö'),
+ ('ø', 'ÿ'),
+ ('ā', 'ā'),
+ ('ă', 'ă'),
+ ('ą', 'ą'),
+ ('ć', 'ć'),
+ ('ĉ', 'ĉ'),
+ ('ċ', 'ċ'),
+ ('č', 'č'),
+ ('ď', 'ď'),
+ ('đ', 'đ'),
+ ('ē', 'ē'),
+ ('ĕ', 'ĕ'),
+ ('ė', 'ė'),
+ ('ę', 'ę'),
+ ('ě', 'ě'),
+ ('ĝ', 'ĝ'),
+ ('ğ', 'ğ'),
+ ('ġ', 'ġ'),
+ ('ģ', 'ģ'),
+ ('ĥ', 'ĥ'),
+ ('ħ', 'ħ'),
+ ('ĩ', 'ĩ'),
+ ('ī', 'ī'),
+ ('ĭ', 'ĭ'),
+ ('į', 'į'),
+ ('ı', 'ı'),
+ ('ij', 'ij'),
+ ('ĵ', 'ĵ'),
+ ('ķ', 'ĸ'),
+ ('ĺ', 'ĺ'),
+ ('ļ', 'ļ'),
+ ('ľ', 'ľ'),
+ ('ŀ', 'ŀ'),
+ ('ł', 'ł'),
+ ('ń', 'ń'),
+ ('ņ', 'ņ'),
+ ('ň', 'ʼn'),
+ ('ŋ', 'ŋ'),
+ ('ō', 'ō'),
+ ('ŏ', 'ŏ'),
+ ('ő', 'ő'),
+ ('œ', 'œ'),
+ ('ŕ', 'ŕ'),
+ ('ŗ', 'ŗ'),
+ ('ř', 'ř'),
+ ('ś', 'ś'),
+ ('ŝ', 'ŝ'),
+ ('ş', 'ş'),
+ ('š', 'š'),
+ ('ţ', 'ţ'),
+ ('ť', 'ť'),
+ ('ŧ', 'ŧ'),
+ ('ũ', 'ũ'),
+ ('ū', 'ū'),
+ ('ŭ', 'ŭ'),
+ ('ů', 'ů'),
+ ('ű', 'ű'),
+ ('ų', 'ų'),
+ ('ŵ', 'ŵ'),
+ ('ŷ', 'ŷ'),
+ ('ź', 'ź'),
+ ('ż', 'ż'),
+ ('ž', 'ƀ'),
+ ('ƃ', 'ƃ'),
+ ('ƅ', 'ƅ'),
+ ('ƈ', 'ƈ'),
+ ('ƌ', 'ƍ'),
+ ('ƒ', 'ƒ'),
+ ('ƕ', 'ƕ'),
+ ('ƙ', 'ƛ'),
+ ('ƞ', 'ƞ'),
+ ('ơ', 'ơ'),
+ ('ƣ', 'ƣ'),
+ ('ƥ', 'ƥ'),
+ ('ƨ', 'ƨ'),
+ ('ƪ', 'ƫ'),
+ ('ƭ', 'ƭ'),
+ ('ư', 'ư'),
+ ('ƴ', 'ƴ'),
+ ('ƶ', 'ƶ'),
+ ('ƹ', 'ƺ'),
+ ('ƽ', 'ƿ'),
+ ('dž', 'dž'),
+ ('lj', 'lj'),
+ ('nj', 'nj'),
+ ('ǎ', 'ǎ'),
+ ('ǐ', 'ǐ'),
+ ('ǒ', 'ǒ'),
+ ('ǔ', 'ǔ'),
+ ('ǖ', 'ǖ'),
+ ('ǘ', 'ǘ'),
+ ('ǚ', 'ǚ'),
+ ('ǜ', 'ǝ'),
+ ('ǟ', 'ǟ'),
+ ('ǡ', 'ǡ'),
+ ('ǣ', 'ǣ'),
+ ('ǥ', 'ǥ'),
+ ('ǧ', 'ǧ'),
+ ('ǩ', 'ǩ'),
+ ('ǫ', 'ǫ'),
+ ('ǭ', 'ǭ'),
+ ('ǯ', 'ǰ'),
+ ('dz', 'dz'),
+ ('ǵ', 'ǵ'),
+ ('ǹ', 'ǹ'),
+ ('ǻ', 'ǻ'),
+ ('ǽ', 'ǽ'),
+ ('ǿ', 'ǿ'),
+ ('ȁ', 'ȁ'),
+ ('ȃ', 'ȃ'),
+ ('ȅ', 'ȅ'),
+ ('ȇ', 'ȇ'),
+ ('ȉ', 'ȉ'),
+ ('ȋ', 'ȋ'),
+ ('ȍ', 'ȍ'),
+ ('ȏ', 'ȏ'),
+ ('ȑ', 'ȑ'),
+ ('ȓ', 'ȓ'),
+ ('ȕ', 'ȕ'),
+ ('ȗ', 'ȗ'),
+ ('ș', 'ș'),
+ ('ț', 'ț'),
+ ('ȝ', 'ȝ'),
+ ('ȟ', 'ȟ'),
+ ('ȡ', 'ȡ'),
+ ('ȣ', 'ȣ'),
+ ('ȥ', 'ȥ'),
+ ('ȧ', 'ȧ'),
+ ('ȩ', 'ȩ'),
+ ('ȫ', 'ȫ'),
+ ('ȭ', 'ȭ'),
+ ('ȯ', 'ȯ'),
+ ('ȱ', 'ȱ'),
+ ('ȳ', 'ȹ'),
+ ('ȼ', 'ȼ'),
+ ('ȿ', 'ɀ'),
+ ('ɂ', 'ɂ'),
+ ('ɇ', 'ɇ'),
+ ('ɉ', 'ɉ'),
+ ('ɋ', 'ɋ'),
+ ('ɍ', 'ɍ'),
+ ('ɏ', 'ʓ'),
+ ('ʕ', 'ʸ'),
+ ('ˀ', 'ˁ'),
+ ('ˠ', 'ˤ'),
+ ('ͱ', 'ͱ'),
+ ('ͳ', 'ͳ'),
+ ('ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('ΐ', 'ΐ'),
+ ('ά', 'ώ'),
+ ('ϐ', 'ϑ'),
+ ('ϕ', 'ϗ'),
+ ('ϙ', 'ϙ'),
+ ('ϛ', 'ϛ'),
+ ('ϝ', 'ϝ'),
+ ('ϟ', 'ϟ'),
+ ('ϡ', 'ϡ'),
+ ('ϣ', 'ϣ'),
+ ('ϥ', 'ϥ'),
+ ('ϧ', 'ϧ'),
+ ('ϩ', 'ϩ'),
+ ('ϫ', 'ϫ'),
+ ('ϭ', 'ϭ'),
+ ('ϯ', 'ϳ'),
+ ('ϵ', 'ϵ'),
+ ('ϸ', 'ϸ'),
+ ('ϻ', 'ϼ'),
+ ('а', 'џ'),
+ ('ѡ', 'ѡ'),
+ ('ѣ', 'ѣ'),
+ ('ѥ', 'ѥ'),
+ ('ѧ', 'ѧ'),
+ ('ѩ', 'ѩ'),
+ ('ѫ', 'ѫ'),
+ ('ѭ', 'ѭ'),
+ ('ѯ', 'ѯ'),
+ ('ѱ', 'ѱ'),
+ ('ѳ', 'ѳ'),
+ ('ѵ', 'ѵ'),
+ ('ѷ', 'ѷ'),
+ ('ѹ', 'ѹ'),
+ ('ѻ', 'ѻ'),
+ ('ѽ', 'ѽ'),
+ ('ѿ', 'ѿ'),
+ ('ҁ', 'ҁ'),
+ ('ҋ', 'ҋ'),
+ ('ҍ', 'ҍ'),
+ ('ҏ', 'ҏ'),
+ ('ґ', 'ґ'),
+ ('ғ', 'ғ'),
+ ('ҕ', 'ҕ'),
+ ('җ', 'җ'),
+ ('ҙ', 'ҙ'),
+ ('қ', 'қ'),
+ ('ҝ', 'ҝ'),
+ ('ҟ', 'ҟ'),
+ ('ҡ', 'ҡ'),
+ ('ң', 'ң'),
+ ('ҥ', 'ҥ'),
+ ('ҧ', 'ҧ'),
+ ('ҩ', 'ҩ'),
+ ('ҫ', 'ҫ'),
+ ('ҭ', 'ҭ'),
+ ('ү', 'ү'),
+ ('ұ', 'ұ'),
+ ('ҳ', 'ҳ'),
+ ('ҵ', 'ҵ'),
+ ('ҷ', 'ҷ'),
+ ('ҹ', 'ҹ'),
+ ('һ', 'һ'),
+ ('ҽ', 'ҽ'),
+ ('ҿ', 'ҿ'),
+ ('ӂ', 'ӂ'),
+ ('ӄ', 'ӄ'),
+ ('ӆ', 'ӆ'),
+ ('ӈ', 'ӈ'),
+ ('ӊ', 'ӊ'),
+ ('ӌ', 'ӌ'),
+ ('ӎ', 'ӏ'),
+ ('ӑ', 'ӑ'),
+ ('ӓ', 'ӓ'),
+ ('ӕ', 'ӕ'),
+ ('ӗ', 'ӗ'),
+ ('ә', 'ә'),
+ ('ӛ', 'ӛ'),
+ ('ӝ', 'ӝ'),
+ ('ӟ', 'ӟ'),
+ ('ӡ', 'ӡ'),
+ ('ӣ', 'ӣ'),
+ ('ӥ', 'ӥ'),
+ ('ӧ', 'ӧ'),
+ ('ө', 'ө'),
+ ('ӫ', 'ӫ'),
+ ('ӭ', 'ӭ'),
+ ('ӯ', 'ӯ'),
+ ('ӱ', 'ӱ'),
+ ('ӳ', 'ӳ'),
+ ('ӵ', 'ӵ'),
+ ('ӷ', 'ӷ'),
+ ('ӹ', 'ӹ'),
+ ('ӻ', 'ӻ'),
+ ('ӽ', 'ӽ'),
+ ('ӿ', 'ӿ'),
+ ('ԁ', 'ԁ'),
+ ('ԃ', 'ԃ'),
+ ('ԅ', 'ԅ'),
+ ('ԇ', 'ԇ'),
+ ('ԉ', 'ԉ'),
+ ('ԋ', 'ԋ'),
+ ('ԍ', 'ԍ'),
+ ('ԏ', 'ԏ'),
+ ('ԑ', 'ԑ'),
+ ('ԓ', 'ԓ'),
+ ('ԕ', 'ԕ'),
+ ('ԗ', 'ԗ'),
+ ('ԙ', 'ԙ'),
+ ('ԛ', 'ԛ'),
+ ('ԝ', 'ԝ'),
+ ('ԟ', 'ԟ'),
+ ('ԡ', 'ԡ'),
+ ('ԣ', 'ԣ'),
+ ('ԥ', 'ԥ'),
+ ('ԧ', 'ԧ'),
+ ('ԩ', 'ԩ'),
+ ('ԫ', 'ԫ'),
+ ('ԭ', 'ԭ'),
+ ('ԯ', 'ԯ'),
+ ('ՠ', 'ֈ'),
+ ('ჼ', 'ჼ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᲀ', 'ᲈ'),
+ ('ᴀ', 'ᶿ'),
+ ('ḁ', 'ḁ'),
+ ('ḃ', 'ḃ'),
+ ('ḅ', 'ḅ'),
+ ('ḇ', 'ḇ'),
+ ('ḉ', 'ḉ'),
+ ('ḋ', 'ḋ'),
+ ('ḍ', 'ḍ'),
+ ('ḏ', 'ḏ'),
+ ('ḑ', 'ḑ'),
+ ('ḓ', 'ḓ'),
+ ('ḕ', 'ḕ'),
+ ('ḗ', 'ḗ'),
+ ('ḙ', 'ḙ'),
+ ('ḛ', 'ḛ'),
+ ('ḝ', 'ḝ'),
+ ('ḟ', 'ḟ'),
+ ('ḡ', 'ḡ'),
+ ('ḣ', 'ḣ'),
+ ('ḥ', 'ḥ'),
+ ('ḧ', 'ḧ'),
+ ('ḩ', 'ḩ'),
+ ('ḫ', 'ḫ'),
+ ('ḭ', 'ḭ'),
+ ('ḯ', 'ḯ'),
+ ('ḱ', 'ḱ'),
+ ('ḳ', 'ḳ'),
+ ('ḵ', 'ḵ'),
+ ('ḷ', 'ḷ'),
+ ('ḹ', 'ḹ'),
+ ('ḻ', 'ḻ'),
+ ('ḽ', 'ḽ'),
+ ('ḿ', 'ḿ'),
+ ('ṁ', 'ṁ'),
+ ('ṃ', 'ṃ'),
+ ('ṅ', 'ṅ'),
+ ('ṇ', 'ṇ'),
+ ('ṉ', 'ṉ'),
+ ('ṋ', 'ṋ'),
+ ('ṍ', 'ṍ'),
+ ('ṏ', 'ṏ'),
+ ('ṑ', 'ṑ'),
+ ('ṓ', 'ṓ'),
+ ('ṕ', 'ṕ'),
+ ('ṗ', 'ṗ'),
+ ('ṙ', 'ṙ'),
+ ('ṛ', 'ṛ'),
+ ('ṝ', 'ṝ'),
+ ('ṟ', 'ṟ'),
+ ('ṡ', 'ṡ'),
+ ('ṣ', 'ṣ'),
+ ('ṥ', 'ṥ'),
+ ('ṧ', 'ṧ'),
+ ('ṩ', 'ṩ'),
+ ('ṫ', 'ṫ'),
+ ('ṭ', 'ṭ'),
+ ('ṯ', 'ṯ'),
+ ('ṱ', 'ṱ'),
+ ('ṳ', 'ṳ'),
+ ('ṵ', 'ṵ'),
+ ('ṷ', 'ṷ'),
+ ('ṹ', 'ṹ'),
+ ('ṻ', 'ṻ'),
+ ('ṽ', 'ṽ'),
+ ('ṿ', 'ṿ'),
+ ('ẁ', 'ẁ'),
+ ('ẃ', 'ẃ'),
+ ('ẅ', 'ẅ'),
+ ('ẇ', 'ẇ'),
+ ('ẉ', 'ẉ'),
+ ('ẋ', 'ẋ'),
+ ('ẍ', 'ẍ'),
+ ('ẏ', 'ẏ'),
+ ('ẑ', 'ẑ'),
+ ('ẓ', 'ẓ'),
+ ('ẕ', 'ẝ'),
+ ('ẟ', 'ẟ'),
+ ('ạ', 'ạ'),
+ ('ả', 'ả'),
+ ('ấ', 'ấ'),
+ ('ầ', 'ầ'),
+ ('ẩ', 'ẩ'),
+ ('ẫ', 'ẫ'),
+ ('ậ', 'ậ'),
+ ('ắ', 'ắ'),
+ ('ằ', 'ằ'),
+ ('ẳ', 'ẳ'),
+ ('ẵ', 'ẵ'),
+ ('ặ', 'ặ'),
+ ('ẹ', 'ẹ'),
+ ('ẻ', 'ẻ'),
+ ('ẽ', 'ẽ'),
+ ('ế', 'ế'),
+ ('ề', 'ề'),
+ ('ể', 'ể'),
+ ('ễ', 'ễ'),
+ ('ệ', 'ệ'),
+ ('ỉ', 'ỉ'),
+ ('ị', 'ị'),
+ ('ọ', 'ọ'),
+ ('ỏ', 'ỏ'),
+ ('ố', 'ố'),
+ ('ồ', 'ồ'),
+ ('ổ', 'ổ'),
+ ('ỗ', 'ỗ'),
+ ('ộ', 'ộ'),
+ ('ớ', 'ớ'),
+ ('ờ', 'ờ'),
+ ('ở', 'ở'),
+ ('ỡ', 'ỡ'),
+ ('ợ', 'ợ'),
+ ('ụ', 'ụ'),
+ ('ủ', 'ủ'),
+ ('ứ', 'ứ'),
+ ('ừ', 'ừ'),
+ ('ử', 'ử'),
+ ('ữ', 'ữ'),
+ ('ự', 'ự'),
+ ('ỳ', 'ỳ'),
+ ('ỵ', 'ỵ'),
+ ('ỷ', 'ỷ'),
+ ('ỹ', 'ỹ'),
+ ('ỻ', 'ỻ'),
+ ('ỽ', 'ỽ'),
+ ('ỿ', 'ἇ'),
+ ('ἐ', 'ἕ'),
+ ('ἠ', 'ἧ'),
+ ('ἰ', 'ἷ'),
+ ('ὀ', 'ὅ'),
+ ('ὐ', 'ὗ'),
+ ('ὠ', 'ὧ'),
+ ('ὰ', 'ώ'),
+ ('ᾀ', 'ᾇ'),
+ ('ᾐ', 'ᾗ'),
+ ('ᾠ', 'ᾧ'),
+ ('ᾰ', 'ᾴ'),
+ ('ᾶ', 'ᾷ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῇ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'ῗ'),
+ ('ῠ', 'ῧ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῷ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℊ', 'ℊ'),
+ ('ℎ', 'ℏ'),
+ ('ℓ', 'ℓ'),
+ ('ℯ', 'ℯ'),
+ ('ℴ', 'ℴ'),
+ ('ℹ', 'ℹ'),
+ ('ℼ', 'ℽ'),
+ ('ⅆ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('ⅰ', 'ⅿ'),
+ ('ↄ', 'ↄ'),
+ ('ⓐ', 'ⓩ'),
+ ('ⰰ', 'ⱟ'),
+ ('ⱡ', 'ⱡ'),
+ ('ⱥ', 'ⱦ'),
+ ('ⱨ', 'ⱨ'),
+ ('ⱪ', 'ⱪ'),
+ ('ⱬ', 'ⱬ'),
+ ('ⱱ', 'ⱱ'),
+ ('ⱳ', 'ⱴ'),
+ ('ⱶ', 'ⱽ'),
+ ('ⲁ', 'ⲁ'),
+ ('ⲃ', 'ⲃ'),
+ ('ⲅ', 'ⲅ'),
+ ('ⲇ', 'ⲇ'),
+ ('ⲉ', 'ⲉ'),
+ ('ⲋ', 'ⲋ'),
+ ('ⲍ', 'ⲍ'),
+ ('ⲏ', 'ⲏ'),
+ ('ⲑ', 'ⲑ'),
+ ('ⲓ', 'ⲓ'),
+ ('ⲕ', 'ⲕ'),
+ ('ⲗ', 'ⲗ'),
+ ('ⲙ', 'ⲙ'),
+ ('ⲛ', 'ⲛ'),
+ ('ⲝ', 'ⲝ'),
+ ('ⲟ', 'ⲟ'),
+ ('ⲡ', 'ⲡ'),
+ ('ⲣ', 'ⲣ'),
+ ('ⲥ', 'ⲥ'),
+ ('ⲧ', 'ⲧ'),
+ ('ⲩ', 'ⲩ'),
+ ('ⲫ', 'ⲫ'),
+ ('ⲭ', 'ⲭ'),
+ ('ⲯ', 'ⲯ'),
+ ('ⲱ', 'ⲱ'),
+ ('ⲳ', 'ⲳ'),
+ ('ⲵ', 'ⲵ'),
+ ('ⲷ', 'ⲷ'),
+ ('ⲹ', 'ⲹ'),
+ ('ⲻ', 'ⲻ'),
+ ('ⲽ', 'ⲽ'),
+ ('ⲿ', 'ⲿ'),
+ ('ⳁ', 'ⳁ'),
+ ('ⳃ', 'ⳃ'),
+ ('ⳅ', 'ⳅ'),
+ ('ⳇ', 'ⳇ'),
+ ('ⳉ', 'ⳉ'),
+ ('ⳋ', 'ⳋ'),
+ ('ⳍ', 'ⳍ'),
+ ('ⳏ', 'ⳏ'),
+ ('ⳑ', 'ⳑ'),
+ ('ⳓ', 'ⳓ'),
+ ('ⳕ', 'ⳕ'),
+ ('ⳗ', 'ⳗ'),
+ ('ⳙ', 'ⳙ'),
+ ('ⳛ', 'ⳛ'),
+ ('ⳝ', 'ⳝ'),
+ ('ⳟ', 'ⳟ'),
+ ('ⳡ', 'ⳡ'),
+ ('ⳣ', 'ⳤ'),
+ ('ⳬ', 'ⳬ'),
+ ('ⳮ', 'ⳮ'),
+ ('ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ꙁ', 'ꙁ'),
+ ('ꙃ', 'ꙃ'),
+ ('ꙅ', 'ꙅ'),
+ ('ꙇ', 'ꙇ'),
+ ('ꙉ', 'ꙉ'),
+ ('ꙋ', 'ꙋ'),
+ ('ꙍ', 'ꙍ'),
+ ('ꙏ', 'ꙏ'),
+ ('ꙑ', 'ꙑ'),
+ ('ꙓ', 'ꙓ'),
+ ('ꙕ', 'ꙕ'),
+ ('ꙗ', 'ꙗ'),
+ ('ꙙ', 'ꙙ'),
+ ('ꙛ', 'ꙛ'),
+ ('ꙝ', 'ꙝ'),
+ ('ꙟ', 'ꙟ'),
+ ('ꙡ', 'ꙡ'),
+ ('ꙣ', 'ꙣ'),
+ ('ꙥ', 'ꙥ'),
+ ('ꙧ', 'ꙧ'),
+ ('ꙩ', 'ꙩ'),
+ ('ꙫ', 'ꙫ'),
+ ('ꙭ', 'ꙭ'),
+ ('ꚁ', 'ꚁ'),
+ ('ꚃ', 'ꚃ'),
+ ('ꚅ', 'ꚅ'),
+ ('ꚇ', 'ꚇ'),
+ ('ꚉ', 'ꚉ'),
+ ('ꚋ', 'ꚋ'),
+ ('ꚍ', 'ꚍ'),
+ ('ꚏ', 'ꚏ'),
+ ('ꚑ', 'ꚑ'),
+ ('ꚓ', 'ꚓ'),
+ ('ꚕ', 'ꚕ'),
+ ('ꚗ', 'ꚗ'),
+ ('ꚙ', 'ꚙ'),
+ ('ꚛ', 'ꚝ'),
+ ('ꜣ', 'ꜣ'),
+ ('ꜥ', 'ꜥ'),
+ ('ꜧ', 'ꜧ'),
+ ('ꜩ', 'ꜩ'),
+ ('ꜫ', 'ꜫ'),
+ ('ꜭ', 'ꜭ'),
+ ('ꜯ', 'ꜱ'),
+ ('ꜳ', 'ꜳ'),
+ ('ꜵ', 'ꜵ'),
+ ('ꜷ', 'ꜷ'),
+ ('ꜹ', 'ꜹ'),
+ ('ꜻ', 'ꜻ'),
+ ('ꜽ', 'ꜽ'),
+ ('ꜿ', 'ꜿ'),
+ ('ꝁ', 'ꝁ'),
+ ('ꝃ', 'ꝃ'),
+ ('ꝅ', 'ꝅ'),
+ ('ꝇ', 'ꝇ'),
+ ('ꝉ', 'ꝉ'),
+ ('ꝋ', 'ꝋ'),
+ ('ꝍ', 'ꝍ'),
+ ('ꝏ', 'ꝏ'),
+ ('ꝑ', 'ꝑ'),
+ ('ꝓ', 'ꝓ'),
+ ('ꝕ', 'ꝕ'),
+ ('ꝗ', 'ꝗ'),
+ ('ꝙ', 'ꝙ'),
+ ('ꝛ', 'ꝛ'),
+ ('ꝝ', 'ꝝ'),
+ ('ꝟ', 'ꝟ'),
+ ('ꝡ', 'ꝡ'),
+ ('ꝣ', 'ꝣ'),
+ ('ꝥ', 'ꝥ'),
+ ('ꝧ', 'ꝧ'),
+ ('ꝩ', 'ꝩ'),
+ ('ꝫ', 'ꝫ'),
+ ('ꝭ', 'ꝭ'),
+ ('ꝯ', 'ꝸ'),
+ ('ꝺ', 'ꝺ'),
+ ('ꝼ', 'ꝼ'),
+ ('ꝿ', 'ꝿ'),
+ ('ꞁ', 'ꞁ'),
+ ('ꞃ', 'ꞃ'),
+ ('ꞅ', 'ꞅ'),
+ ('ꞇ', 'ꞇ'),
+ ('ꞌ', 'ꞌ'),
+ ('ꞎ', 'ꞎ'),
+ ('ꞑ', 'ꞑ'),
+ ('ꞓ', 'ꞕ'),
+ ('ꞗ', 'ꞗ'),
+ ('ꞙ', 'ꞙ'),
+ ('ꞛ', 'ꞛ'),
+ ('ꞝ', 'ꞝ'),
+ ('ꞟ', 'ꞟ'),
+ ('ꞡ', 'ꞡ'),
+ ('ꞣ', 'ꞣ'),
+ ('ꞥ', 'ꞥ'),
+ ('ꞧ', 'ꞧ'),
+ ('ꞩ', 'ꞩ'),
+ ('ꞯ', 'ꞯ'),
+ ('ꞵ', 'ꞵ'),
+ ('ꞷ', 'ꞷ'),
+ ('ꞹ', 'ꞹ'),
+ ('ꞻ', 'ꞻ'),
+ ('ꞽ', 'ꞽ'),
+ ('ꞿ', 'ꞿ'),
+ ('ꟁ', 'ꟁ'),
+ ('ꟃ', 'ꟃ'),
+ ('ꟈ', 'ꟈ'),
+ ('ꟊ', 'ꟊ'),
+ ('ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟕ'),
+ ('ꟗ', 'ꟗ'),
+ ('ꟙ', 'ꟙ'),
+ ('ꟲ', 'ꟴ'),
+ ('ꟶ', 'ꟶ'),
+ ('ꟸ', 'ꟺ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꮿ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('a', 'z'),
+ ('𐐨', '𐑏'),
+ ('𐓘', '𐓻'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐞀', '𐞀'),
+ ('𐞃', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐳀', '𐳲'),
+ ('𑣀', '𑣟'),
+ ('𖹠', '𖹿'),
+ ('𝐚', '𝐳'),
+ ('𝑎', '𝑔'),
+ ('𝑖', '𝑧'),
+ ('𝒂', '𝒛'),
+ ('𝒶', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝓏'),
+ ('𝓪', '𝔃'),
+ ('𝔞', '𝔷'),
+ ('𝕒', '𝕫'),
+ ('𝖆', '𝖟'),
+ ('𝖺', '𝗓'),
+ ('𝗮', '𝘇'),
+ ('𝘢', '𝘻'),
+ ('𝙖', '𝙯'),
+ ('𝚊', '𝚥'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛡'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜛'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝕'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞏'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟉'),
+ ('𝟋', '𝟋'),
+ ('𝼀', '𝼉'),
+ ('𝼋', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞤢', '𞥃'),
+];
+
+pub const NUMERIC: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('٠', '٩'),
+ ('٫', '٬'),
+ ('۰', '۹'),
+ ('߀', '߉'),
+ ('०', '९'),
+ ('০', '৯'),
+ ('੦', '੯'),
+ ('૦', '૯'),
+ ('୦', '୯'),
+ ('௦', '௯'),
+ ('౦', '౯'),
+ ('೦', '೯'),
+ ('൦', '൯'),
+ ('෦', '෯'),
+ ('๐', '๙'),
+ ('໐', '໙'),
+ ('༠', '༩'),
+ ('၀', '၉'),
+ ('႐', '႙'),
+ ('០', '៩'),
+ ('᠐', '᠙'),
+ ('᥆', '᥏'),
+ ('᧐', '᧙'),
+ ('᪀', '᪉'),
+ ('᪐', '᪙'),
+ ('᭐', '᭙'),
+ ('᮰', '᮹'),
+ ('᱀', '᱉'),
+ ('᱐', '᱙'),
+ ('꘠', '꘩'),
+ ('꣐', '꣙'),
+ ('꤀', '꤉'),
+ ('꧐', '꧙'),
+ ('꧰', '꧹'),
+ ('꩐', '꩙'),
+ ('꯰', '꯹'),
+ ('0', '9'),
+ ('𐒠', '𐒩'),
+ ('𐴰', '𐴹'),
+ ('𑁦', '𑁯'),
+ ('𑃰', '𑃹'),
+ ('𑄶', '𑄿'),
+ ('𑇐', '𑇙'),
+ ('𑋰', '𑋹'),
+ ('𑑐', '𑑙'),
+ ('𑓐', '𑓙'),
+ ('𑙐', '𑙙'),
+ ('𑛀', '𑛉'),
+ ('𑜰', '𑜹'),
+ ('𑣠', '𑣩'),
+ ('𑥐', '𑥙'),
+ ('𑱐', '𑱙'),
+ ('𑵐', '𑵙'),
+ ('𑶠', '𑶩'),
+ ('𑽐', '𑽙'),
+ ('𖩠', '𖩩'),
+ ('𖫀', '𖫉'),
+ ('𖭐', '𖭙'),
+ ('𝟎', '𝟿'),
+ ('𞅀', '𞅉'),
+ ('𞋰', '𞋹'),
+ ('𞓰', '𞓹'),
+ ('𞥐', '𞥙'),
+ ('🯰', '🯹'),
+];
+
+pub const OLETTER: &'static [(char, char)] = &[
+ ('ƻ', 'ƻ'),
+ ('ǀ', 'ǃ'),
+ ('ʔ', 'ʔ'),
+ ('ʹ', 'ʿ'),
+ ('ˆ', 'ˑ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('ʹ', 'ʹ'),
+ ('ՙ', 'ՙ'),
+ ('א', 'ת'),
+ ('ׯ', '׳'),
+ ('ؠ', 'ي'),
+ ('ٮ', 'ٯ'),
+ ('ٱ', 'ۓ'),
+ ('ە', 'ە'),
+ ('ۥ', 'ۦ'),
+ ('ۮ', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠀ', 'ࠕ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('ऄ', 'ह'),
+ ('ऽ', 'ऽ'),
+ ('ॐ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('ॱ', 'ঀ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('ੲ', 'ੴ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ઽ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('ૹ', 'ૹ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('ୱ', 'ୱ'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ௐ', 'ௐ'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('ಀ', 'ಀ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಽ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('ೱ', 'ೲ'),
+ ('ഄ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', 'ൖ'),
+ ('ൟ', 'ൡ'),
+ ('ൺ', 'ൿ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ก', 'ะ'),
+ ('า', 'ำ'),
+ ('เ', 'ๆ'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ະ'),
+ ('າ', 'ຳ'),
+ ('ຽ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ྈ', 'ྌ'),
+ ('က', 'ဪ'),
+ ('ဿ', 'ဿ'),
+ ('ၐ', 'ၕ'),
+ ('ၚ', 'ၝ'),
+ ('ၡ', 'ၡ'),
+ ('ၥ', 'ၦ'),
+ ('ၮ', 'ၰ'),
+ ('ၵ', 'ႁ'),
+ ('ႎ', 'ႎ'),
+ ('ა', 'ჺ'),
+ ('ჽ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('ᜟ', 'ᜱ'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ក', 'ឳ'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', 'ៜ'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢄ'),
+ ('ᢇ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᥐ', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('ᨀ', 'ᨖ'),
+ ('ᨠ', 'ᩔ'),
+ ('ᪧ', 'ᪧ'),
+ ('ᬅ', 'ᬳ'),
+ ('ᭅ', 'ᭌ'),
+ ('ᮃ', 'ᮠ'),
+ ('ᮮ', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᰀ', 'ᰣ'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱽ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ℵ', 'ℸ'),
+ ('ↀ', 'ↂ'),
+ ('ↅ', 'ↈ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '〇'),
+ ('〡', '〩'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('ꙮ', 'ꙮ'),
+ ('ꙿ', 'ꙿ'),
+ ('ꚠ', 'ꛯ'),
+ ('ꜗ', 'ꜟ'),
+ ('ꞈ', 'ꞈ'),
+ ('ꞏ', 'ꞏ'),
+ ('ꟷ', 'ꟷ'),
+ ('ꟻ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠢ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢂ', 'ꢳ'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', 'ꣾ'),
+ ('ꤊ', 'ꤥ'),
+ ('ꤰ', 'ꥆ'),
+ ('ꥠ', 'ꥼ'),
+ ('ꦄ', 'ꦲ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꧠ', 'ꧤ'),
+ ('ꧦ', 'ꧯ'),
+ ('ꧺ', 'ꧾ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꩺ'),
+ ('ꩾ', 'ꪯ'),
+ ('ꪱ', 'ꪱ'),
+ ('ꪵ', 'ꪶ'),
+ ('ꪹ', 'ꪽ'),
+ ('ꫀ', 'ꫀ'),
+ ('ꫂ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫪ'),
+ ('ꫲ', 'ꫴ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꯀ', 'ꯢ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('ヲ', 'ン'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐑐', '𐒝'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞁', '𐞂'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐴀', '𐴣'),
+ ('𐺀', '𐺩'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀃', '𑀷'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂃', '𑂯'),
+ ('𑃐', '𑃨'),
+ ('𑄃', '𑄦'),
+ ('𑅄', '𑅄'),
+ ('𑅇', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('𑆃', '𑆲'),
+ ('𑇁', '𑇄'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈫'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '𑋞'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍡'),
+ ('𑐀', '𑐴'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '𑖮'),
+ ('𑗘', '𑗛'),
+ ('𑘀', '𑘯'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '𑚪'),
+ ('𑚸', '𑚸'),
+ ('𑜀', '𑜚'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '𑠫'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧐'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧣'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨺', '𑨺'),
+ ('𑩐', '𑩐'),
+ ('𑩜', '𑪉'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰮'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶉'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻲'),
+ ('𑼂', '𑼂'),
+ ('𑼄', '𑼐'),
+ ('𑼒', '𑼳'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭀', '𖭃'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖽐'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𝼊', '𝼊'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓫'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞥋', '𞥋'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+];
+
+pub const SCONTINUE: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ (',', '-'),
+ (':', ':'),
+ ('՝', '՝'),
+ ('،', '؍'),
+ ('߸', '߸'),
+ ('᠂', '᠂'),
+ ('᠈', '᠈'),
+ ('–', '—'),
+ ('、', '、'),
+ ('︐', '︑'),
+ ('︓', '︓'),
+ ('︱', '︲'),
+ ('﹐', '﹑'),
+ ('﹕', '﹕'),
+ ('﹘', '﹘'),
+ ('﹣', '﹣'),
+ ('\u{ff0c}', '\u{ff0d}'), // FULLWIDTH COMMA..FULLWIDTH HYPHEN-MINUS
+ ('\u{ff1a}', '\u{ff1a}'), // FULLWIDTH COLON
+ ('\u{ff64}', '\u{ff64}'), // HALFWIDTH IDEOGRAPHIC COMMA
+];
+
+pub const STERM: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ ('!', '!'),
+ ('?', '?'),
+ ('։', '։'),
+ ('؝', '؟'),
+ ('۔', '۔'),
+ ('܀', '܂'),
+ ('߹', '߹'),
+ ('࠷', '࠷'),
+ ('࠹', '࠹'),
+ ('࠽', '࠾'),
+ ('।', '॥'),
+ ('၊', '။'),
+ ('።', '።'),
+ ('፧', '፨'),
+ ('᙮', '᙮'),
+ ('᜵', '᜶'),
+ ('᠃', '᠃'),
+ ('᠉', '᠉'),
+ ('᥄', '᥅'),
+ ('᪨', '᪫'),
+ ('᭚', '᭛'),
+ ('᭞', '᭟'),
+ ('᭽', '᭾'),
+ ('᰻', '᰼'),
+ ('᱾', '᱿'),
+ ('‼', '‽'),
+ ('⁇', '⁉'),
+ ('⸮', '⸮'),
+ ('⸼', '⸼'),
+ ('⹓', '⹔'),
+ ('。', '。'),
+ ('꓿', '꓿'),
+ ('꘎', '꘏'),
+ ('꛳', '꛳'),
+ ('꛷', '꛷'),
+ ('꡶', '꡷'),
+ ('꣎', '꣏'),
+ ('꤯', '꤯'),
+ ('꧈', '꧉'),
+ ('꩝', '꩟'),
+ ('꫰', '꫱'),
+ ('꯫', '꯫'),
+ ('﹖', '﹗'),
+ ('\u{ff01}', '\u{ff01}'), // FULLWIDTH EXCLAMATION MARK
+ ('\u{ff1f}', '\u{ff1f}'), // FULLWIDTH QUESTION MARK
+ ('\u{ff61}', '\u{ff61}'), // HALFWIDTH IDEOGRAPHIC FULL STOP
+ ('𐩖', '𐩗'),
+ ('𐽕', '𐽙'),
+ ('𐾆', '𐾉'),
+ ('𑁇', '𑁈'),
+ ('𑂾', '𑃁'),
+ ('𑅁', '𑅃'),
+ ('𑇅', '𑇆'),
+ ('𑇍', '𑇍'),
+ ('𑇞', '𑇟'),
+ ('𑈸', '𑈹'),
+ ('𑈻', '𑈼'),
+ ('𑊩', '𑊩'),
+ ('𑑋', '𑑌'),
+ ('𑗂', '𑗃'),
+ ('𑗉', '𑗗'),
+ ('𑙁', '𑙂'),
+ ('𑜼', '𑜾'),
+ ('𑥄', '𑥄'),
+ ('𑥆', '𑥆'),
+ ('𑩂', '𑩃'),
+ ('𑪛', '𑪜'),
+ ('𑱁', '𑱂'),
+ ('𑻷', '𑻸'),
+ ('𑽃', '𑽄'),
+ ('𖩮', '𖩯'),
+ ('𖫵', '𖫵'),
+ ('𖬷', '𖬸'),
+ ('𖭄', '𖭄'),
+ ('𖺘', '𖺘'),
+ ('𛲟', '𛲟'),
+ ('𝪈', '𝪈'),
+];
+
+pub const SEP: &'static [(char, char)] =
+ &[('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')];
+
+pub const SP: &'static [(char, char)] = &[
+ ('\t', '\t'),
+ ('\u{b}', '\u{c}'),
+ (' ', ' '),
+ ('\u{a0}', '\u{a0}'),
+ ('\u{1680}', '\u{1680}'),
+ ('\u{2000}', '\u{200a}'),
+ ('\u{202f}', '\u{202f}'),
+ ('\u{205f}', '\u{205f}'),
+ ('\u{3000}', '\u{3000}'),
+];
+
+pub const UPPER: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('À', 'Ö'),
+ ('Ø', 'Þ'),
+ ('Ā', 'Ā'),
+ ('Ă', 'Ă'),
+ ('Ą', 'Ą'),
+ ('Ć', 'Ć'),
+ ('Ĉ', 'Ĉ'),
+ ('Ċ', 'Ċ'),
+ ('Č', 'Č'),
+ ('Ď', 'Ď'),
+ ('Đ', 'Đ'),
+ ('Ē', 'Ē'),
+ ('Ĕ', 'Ĕ'),
+ ('Ė', 'Ė'),
+ ('Ę', 'Ę'),
+ ('Ě', 'Ě'),
+ ('Ĝ', 'Ĝ'),
+ ('Ğ', 'Ğ'),
+ ('Ġ', 'Ġ'),
+ ('Ģ', 'Ģ'),
+ ('Ĥ', 'Ĥ'),
+ ('Ħ', 'Ħ'),
+ ('Ĩ', 'Ĩ'),
+ ('Ī', 'Ī'),
+ ('Ĭ', 'Ĭ'),
+ ('Į', 'Į'),
+ ('İ', 'İ'),
+ ('IJ', 'IJ'),
+ ('Ĵ', 'Ĵ'),
+ ('Ķ', 'Ķ'),
+ ('Ĺ', 'Ĺ'),
+ ('Ļ', 'Ļ'),
+ ('Ľ', 'Ľ'),
+ ('Ŀ', 'Ŀ'),
+ ('Ł', 'Ł'),
+ ('Ń', 'Ń'),
+ ('Ņ', 'Ņ'),
+ ('Ň', 'Ň'),
+ ('Ŋ', 'Ŋ'),
+ ('Ō', 'Ō'),
+ ('Ŏ', 'Ŏ'),
+ ('Ő', 'Ő'),
+ ('Œ', 'Œ'),
+ ('Ŕ', 'Ŕ'),
+ ('Ŗ', 'Ŗ'),
+ ('Ř', 'Ř'),
+ ('Ś', 'Ś'),
+ ('Ŝ', 'Ŝ'),
+ ('Ş', 'Ş'),
+ ('Š', 'Š'),
+ ('Ţ', 'Ţ'),
+ ('Ť', 'Ť'),
+ ('Ŧ', 'Ŧ'),
+ ('Ũ', 'Ũ'),
+ ('Ū', 'Ū'),
+ ('Ŭ', 'Ŭ'),
+ ('Ů', 'Ů'),
+ ('Ű', 'Ű'),
+ ('Ų', 'Ų'),
+ ('Ŵ', 'Ŵ'),
+ ('Ŷ', 'Ŷ'),
+ ('Ÿ', 'Ź'),
+ ('Ż', 'Ż'),
+ ('Ž', 'Ž'),
+ ('Ɓ', 'Ƃ'),
+ ('Ƅ', 'Ƅ'),
+ ('Ɔ', 'Ƈ'),
+ ('Ɖ', 'Ƌ'),
+ ('Ǝ', 'Ƒ'),
+ ('Ɠ', 'Ɣ'),
+ ('Ɩ', 'Ƙ'),
+ ('Ɯ', 'Ɲ'),
+ ('Ɵ', 'Ơ'),
+ ('Ƣ', 'Ƣ'),
+ ('Ƥ', 'Ƥ'),
+ ('Ʀ', 'Ƨ'),
+ ('Ʃ', 'Ʃ'),
+ ('Ƭ', 'Ƭ'),
+ ('Ʈ', 'Ư'),
+ ('Ʊ', 'Ƴ'),
+ ('Ƶ', 'Ƶ'),
+ ('Ʒ', 'Ƹ'),
+ ('Ƽ', 'Ƽ'),
+ ('DŽ', 'Dž'),
+ ('LJ', 'Lj'),
+ ('NJ', 'Nj'),
+ ('Ǎ', 'Ǎ'),
+ ('Ǐ', 'Ǐ'),
+ ('Ǒ', 'Ǒ'),
+ ('Ǔ', 'Ǔ'),
+ ('Ǖ', 'Ǖ'),
+ ('Ǘ', 'Ǘ'),
+ ('Ǚ', 'Ǚ'),
+ ('Ǜ', 'Ǜ'),
+ ('Ǟ', 'Ǟ'),
+ ('Ǡ', 'Ǡ'),
+ ('Ǣ', 'Ǣ'),
+ ('Ǥ', 'Ǥ'),
+ ('Ǧ', 'Ǧ'),
+ ('Ǩ', 'Ǩ'),
+ ('Ǫ', 'Ǫ'),
+ ('Ǭ', 'Ǭ'),
+ ('Ǯ', 'Ǯ'),
+ ('DZ', 'Dz'),
+ ('Ǵ', 'Ǵ'),
+ ('Ƕ', 'Ǹ'),
+ ('Ǻ', 'Ǻ'),
+ ('Ǽ', 'Ǽ'),
+ ('Ǿ', 'Ǿ'),
+ ('Ȁ', 'Ȁ'),
+ ('Ȃ', 'Ȃ'),
+ ('Ȅ', 'Ȅ'),
+ ('Ȇ', 'Ȇ'),
+ ('Ȉ', 'Ȉ'),
+ ('Ȋ', 'Ȋ'),
+ ('Ȍ', 'Ȍ'),
+ ('Ȏ', 'Ȏ'),
+ ('Ȑ', 'Ȑ'),
+ ('Ȓ', 'Ȓ'),
+ ('Ȕ', 'Ȕ'),
+ ('Ȗ', 'Ȗ'),
+ ('Ș', 'Ș'),
+ ('Ț', 'Ț'),
+ ('Ȝ', 'Ȝ'),
+ ('Ȟ', 'Ȟ'),
+ ('Ƞ', 'Ƞ'),
+ ('Ȣ', 'Ȣ'),
+ ('Ȥ', 'Ȥ'),
+ ('Ȧ', 'Ȧ'),
+ ('Ȩ', 'Ȩ'),
+ ('Ȫ', 'Ȫ'),
+ ('Ȭ', 'Ȭ'),
+ ('Ȯ', 'Ȯ'),
+ ('Ȱ', 'Ȱ'),
+ ('Ȳ', 'Ȳ'),
+ ('Ⱥ', 'Ȼ'),
+ ('Ƚ', 'Ⱦ'),
+ ('Ɂ', 'Ɂ'),
+ ('Ƀ', 'Ɇ'),
+ ('Ɉ', 'Ɉ'),
+ ('Ɋ', 'Ɋ'),
+ ('Ɍ', 'Ɍ'),
+ ('Ɏ', 'Ɏ'),
+ ('Ͱ', 'Ͱ'),
+ ('Ͳ', 'Ͳ'),
+ ('Ͷ', 'Ͷ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ώ'),
+ ('Α', 'Ρ'),
+ ('Σ', 'Ϋ'),
+ ('Ϗ', 'Ϗ'),
+ ('ϒ', 'ϔ'),
+ ('Ϙ', 'Ϙ'),
+ ('Ϛ', 'Ϛ'),
+ ('Ϝ', 'Ϝ'),
+ ('Ϟ', 'Ϟ'),
+ ('Ϡ', 'Ϡ'),
+ ('Ϣ', 'Ϣ'),
+ ('Ϥ', 'Ϥ'),
+ ('Ϧ', 'Ϧ'),
+ ('Ϩ', 'Ϩ'),
+ ('Ϫ', 'Ϫ'),
+ ('Ϭ', 'Ϭ'),
+ ('Ϯ', 'Ϯ'),
+ ('ϴ', 'ϴ'),
+ ('Ϸ', 'Ϸ'),
+ ('Ϲ', 'Ϻ'),
+ ('Ͻ', 'Я'),
+ ('Ѡ', 'Ѡ'),
+ ('Ѣ', 'Ѣ'),
+ ('Ѥ', 'Ѥ'),
+ ('Ѧ', 'Ѧ'),
+ ('Ѩ', 'Ѩ'),
+ ('Ѫ', 'Ѫ'),
+ ('Ѭ', 'Ѭ'),
+ ('Ѯ', 'Ѯ'),
+ ('Ѱ', 'Ѱ'),
+ ('Ѳ', 'Ѳ'),
+ ('Ѵ', 'Ѵ'),
+ ('Ѷ', 'Ѷ'),
+ ('Ѹ', 'Ѹ'),
+ ('Ѻ', 'Ѻ'),
+ ('Ѽ', 'Ѽ'),
+ ('Ѿ', 'Ѿ'),
+ ('Ҁ', 'Ҁ'),
+ ('Ҋ', 'Ҋ'),
+ ('Ҍ', 'Ҍ'),
+ ('Ҏ', 'Ҏ'),
+ ('Ґ', 'Ґ'),
+ ('Ғ', 'Ғ'),
+ ('Ҕ', 'Ҕ'),
+ ('Җ', 'Җ'),
+ ('Ҙ', 'Ҙ'),
+ ('Қ', 'Қ'),
+ ('Ҝ', 'Ҝ'),
+ ('Ҟ', 'Ҟ'),
+ ('Ҡ', 'Ҡ'),
+ ('Ң', 'Ң'),
+ ('Ҥ', 'Ҥ'),
+ ('Ҧ', 'Ҧ'),
+ ('Ҩ', 'Ҩ'),
+ ('Ҫ', 'Ҫ'),
+ ('Ҭ', 'Ҭ'),
+ ('Ү', 'Ү'),
+ ('Ұ', 'Ұ'),
+ ('Ҳ', 'Ҳ'),
+ ('Ҵ', 'Ҵ'),
+ ('Ҷ', 'Ҷ'),
+ ('Ҹ', 'Ҹ'),
+ ('Һ', 'Һ'),
+ ('Ҽ', 'Ҽ'),
+ ('Ҿ', 'Ҿ'),
+ ('Ӏ', 'Ӂ'),
+ ('Ӄ', 'Ӄ'),
+ ('Ӆ', 'Ӆ'),
+ ('Ӈ', 'Ӈ'),
+ ('Ӊ', 'Ӊ'),
+ ('Ӌ', 'Ӌ'),
+ ('Ӎ', 'Ӎ'),
+ ('Ӑ', 'Ӑ'),
+ ('Ӓ', 'Ӓ'),
+ ('Ӕ', 'Ӕ'),
+ ('Ӗ', 'Ӗ'),
+ ('Ә', 'Ә'),
+ ('Ӛ', 'Ӛ'),
+ ('Ӝ', 'Ӝ'),
+ ('Ӟ', 'Ӟ'),
+ ('Ӡ', 'Ӡ'),
+ ('Ӣ', 'Ӣ'),
+ ('Ӥ', 'Ӥ'),
+ ('Ӧ', 'Ӧ'),
+ ('Ө', 'Ө'),
+ ('Ӫ', 'Ӫ'),
+ ('Ӭ', 'Ӭ'),
+ ('Ӯ', 'Ӯ'),
+ ('Ӱ', 'Ӱ'),
+ ('Ӳ', 'Ӳ'),
+ ('Ӵ', 'Ӵ'),
+ ('Ӷ', 'Ӷ'),
+ ('Ӹ', 'Ӹ'),
+ ('Ӻ', 'Ӻ'),
+ ('Ӽ', 'Ӽ'),
+ ('Ӿ', 'Ӿ'),
+ ('Ԁ', 'Ԁ'),
+ ('Ԃ', 'Ԃ'),
+ ('Ԅ', 'Ԅ'),
+ ('Ԇ', 'Ԇ'),
+ ('Ԉ', 'Ԉ'),
+ ('Ԋ', 'Ԋ'),
+ ('Ԍ', 'Ԍ'),
+ ('Ԏ', 'Ԏ'),
+ ('Ԑ', 'Ԑ'),
+ ('Ԓ', 'Ԓ'),
+ ('Ԕ', 'Ԕ'),
+ ('Ԗ', 'Ԗ'),
+ ('Ԙ', 'Ԙ'),
+ ('Ԛ', 'Ԛ'),
+ ('Ԝ', 'Ԝ'),
+ ('Ԟ', 'Ԟ'),
+ ('Ԡ', 'Ԡ'),
+ ('Ԣ', 'Ԣ'),
+ ('Ԥ', 'Ԥ'),
+ ('Ԧ', 'Ԧ'),
+ ('Ԩ', 'Ԩ'),
+ ('Ԫ', 'Ԫ'),
+ ('Ԭ', 'Ԭ'),
+ ('Ԯ', 'Ԯ'),
+ ('Ա', 'Ֆ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('Ḁ', 'Ḁ'),
+ ('Ḃ', 'Ḃ'),
+ ('Ḅ', 'Ḅ'),
+ ('Ḇ', 'Ḇ'),
+ ('Ḉ', 'Ḉ'),
+ ('Ḋ', 'Ḋ'),
+ ('Ḍ', 'Ḍ'),
+ ('Ḏ', 'Ḏ'),
+ ('Ḑ', 'Ḑ'),
+ ('Ḓ', 'Ḓ'),
+ ('Ḕ', 'Ḕ'),
+ ('Ḗ', 'Ḗ'),
+ ('Ḙ', 'Ḙ'),
+ ('Ḛ', 'Ḛ'),
+ ('Ḝ', 'Ḝ'),
+ ('Ḟ', 'Ḟ'),
+ ('Ḡ', 'Ḡ'),
+ ('Ḣ', 'Ḣ'),
+ ('Ḥ', 'Ḥ'),
+ ('Ḧ', 'Ḧ'),
+ ('Ḩ', 'Ḩ'),
+ ('Ḫ', 'Ḫ'),
+ ('Ḭ', 'Ḭ'),
+ ('Ḯ', 'Ḯ'),
+ ('Ḱ', 'Ḱ'),
+ ('Ḳ', 'Ḳ'),
+ ('Ḵ', 'Ḵ'),
+ ('Ḷ', 'Ḷ'),
+ ('Ḹ', 'Ḹ'),
+ ('Ḻ', 'Ḻ'),
+ ('Ḽ', 'Ḽ'),
+ ('Ḿ', 'Ḿ'),
+ ('Ṁ', 'Ṁ'),
+ ('Ṃ', 'Ṃ'),
+ ('Ṅ', 'Ṅ'),
+ ('Ṇ', 'Ṇ'),
+ ('Ṉ', 'Ṉ'),
+ ('Ṋ', 'Ṋ'),
+ ('Ṍ', 'Ṍ'),
+ ('Ṏ', 'Ṏ'),
+ ('Ṑ', 'Ṑ'),
+ ('Ṓ', 'Ṓ'),
+ ('Ṕ', 'Ṕ'),
+ ('Ṗ', 'Ṗ'),
+ ('Ṙ', 'Ṙ'),
+ ('Ṛ', 'Ṛ'),
+ ('Ṝ', 'Ṝ'),
+ ('Ṟ', 'Ṟ'),
+ ('Ṡ', 'Ṡ'),
+ ('Ṣ', 'Ṣ'),
+ ('Ṥ', 'Ṥ'),
+ ('Ṧ', 'Ṧ'),
+ ('Ṩ', 'Ṩ'),
+ ('Ṫ', 'Ṫ'),
+ ('Ṭ', 'Ṭ'),
+ ('Ṯ', 'Ṯ'),
+ ('Ṱ', 'Ṱ'),
+ ('Ṳ', 'Ṳ'),
+ ('Ṵ', 'Ṵ'),
+ ('Ṷ', 'Ṷ'),
+ ('Ṹ', 'Ṹ'),
+ ('Ṻ', 'Ṻ'),
+ ('Ṽ', 'Ṽ'),
+ ('Ṿ', 'Ṿ'),
+ ('Ẁ', 'Ẁ'),
+ ('Ẃ', 'Ẃ'),
+ ('Ẅ', 'Ẅ'),
+ ('Ẇ', 'Ẇ'),
+ ('Ẉ', 'Ẉ'),
+ ('Ẋ', 'Ẋ'),
+ ('Ẍ', 'Ẍ'),
+ ('Ẏ', 'Ẏ'),
+ ('Ẑ', 'Ẑ'),
+ ('Ẓ', 'Ẓ'),
+ ('Ẕ', 'Ẕ'),
+ ('ẞ', 'ẞ'),
+ ('Ạ', 'Ạ'),
+ ('Ả', 'Ả'),
+ ('Ấ', 'Ấ'),
+ ('Ầ', 'Ầ'),
+ ('Ẩ', 'Ẩ'),
+ ('Ẫ', 'Ẫ'),
+ ('Ậ', 'Ậ'),
+ ('Ắ', 'Ắ'),
+ ('Ằ', 'Ằ'),
+ ('Ẳ', 'Ẳ'),
+ ('Ẵ', 'Ẵ'),
+ ('Ặ', 'Ặ'),
+ ('Ẹ', 'Ẹ'),
+ ('Ẻ', 'Ẻ'),
+ ('Ẽ', 'Ẽ'),
+ ('Ế', 'Ế'),
+ ('Ề', 'Ề'),
+ ('Ể', 'Ể'),
+ ('Ễ', 'Ễ'),
+ ('Ệ', 'Ệ'),
+ ('Ỉ', 'Ỉ'),
+ ('Ị', 'Ị'),
+ ('Ọ', 'Ọ'),
+ ('Ỏ', 'Ỏ'),
+ ('Ố', 'Ố'),
+ ('Ồ', 'Ồ'),
+ ('Ổ', 'Ổ'),
+ ('Ỗ', 'Ỗ'),
+ ('Ộ', 'Ộ'),
+ ('Ớ', 'Ớ'),
+ ('Ờ', 'Ờ'),
+ ('Ở', 'Ở'),
+ ('Ỡ', 'Ỡ'),
+ ('Ợ', 'Ợ'),
+ ('Ụ', 'Ụ'),
+ ('Ủ', 'Ủ'),
+ ('Ứ', 'Ứ'),
+ ('Ừ', 'Ừ'),
+ ('Ử', 'Ử'),
+ ('Ữ', 'Ữ'),
+ ('Ự', 'Ự'),
+ ('Ỳ', 'Ỳ'),
+ ('Ỵ', 'Ỵ'),
+ ('Ỷ', 'Ỷ'),
+ ('Ỹ', 'Ỹ'),
+ ('Ỻ', 'Ỻ'),
+ ('Ỽ', 'Ỽ'),
+ ('Ỿ', 'Ỿ'),
+ ('Ἀ', 'Ἇ'),
+ ('Ἐ', 'Ἕ'),
+ ('Ἠ', 'Ἧ'),
+ ('Ἰ', 'Ἷ'),
+ ('Ὀ', 'Ὅ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'Ὗ'),
+ ('Ὠ', 'Ὧ'),
+ ('ᾈ', 'ᾏ'),
+ ('ᾘ', 'ᾟ'),
+ ('ᾨ', 'ᾯ'),
+ ('Ᾰ', 'ᾼ'),
+ ('Ὲ', 'ῌ'),
+ ('Ῐ', 'Ί'),
+ ('Ῠ', 'Ῥ'),
+ ('Ὸ', 'ῼ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℋ', 'ℍ'),
+ ('ℐ', 'ℒ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℰ', 'ℳ'),
+ ('ℾ', 'ℿ'),
+ ('ⅅ', 'ⅅ'),
+ ('Ⅰ', 'Ⅿ'),
+ ('Ↄ', 'Ↄ'),
+ ('Ⓐ', 'Ⓩ'),
+ ('Ⰰ', 'Ⱟ'),
+ ('Ⱡ', 'Ⱡ'),
+ ('Ɫ', 'Ɽ'),
+ ('Ⱨ', 'Ⱨ'),
+ ('Ⱪ', 'Ⱪ'),
+ ('Ⱬ', 'Ⱬ'),
+ ('Ɑ', 'Ɒ'),
+ ('Ⱳ', 'Ⱳ'),
+ ('Ⱶ', 'Ⱶ'),
+ ('Ȿ', 'Ⲁ'),
+ ('Ⲃ', 'Ⲃ'),
+ ('Ⲅ', 'Ⲅ'),
+ ('Ⲇ', 'Ⲇ'),
+ ('Ⲉ', 'Ⲉ'),
+ ('Ⲋ', 'Ⲋ'),
+ ('Ⲍ', 'Ⲍ'),
+ ('Ⲏ', 'Ⲏ'),
+ ('Ⲑ', 'Ⲑ'),
+ ('Ⲓ', 'Ⲓ'),
+ ('Ⲕ', 'Ⲕ'),
+ ('Ⲗ', 'Ⲗ'),
+ ('Ⲙ', 'Ⲙ'),
+ ('Ⲛ', 'Ⲛ'),
+ ('Ⲝ', 'Ⲝ'),
+ ('Ⲟ', 'Ⲟ'),
+ ('Ⲡ', 'Ⲡ'),
+ ('Ⲣ', 'Ⲣ'),
+ ('Ⲥ', 'Ⲥ'),
+ ('Ⲧ', 'Ⲧ'),
+ ('Ⲩ', 'Ⲩ'),
+ ('Ⲫ', 'Ⲫ'),
+ ('Ⲭ', 'Ⲭ'),
+ ('Ⲯ', 'Ⲯ'),
+ ('Ⲱ', 'Ⲱ'),
+ ('Ⲳ', 'Ⲳ'),
+ ('Ⲵ', 'Ⲵ'),
+ ('Ⲷ', 'Ⲷ'),
+ ('Ⲹ', 'Ⲹ'),
+ ('Ⲻ', 'Ⲻ'),
+ ('Ⲽ', 'Ⲽ'),
+ ('Ⲿ', 'Ⲿ'),
+ ('Ⳁ', 'Ⳁ'),
+ ('Ⳃ', 'Ⳃ'),
+ ('Ⳅ', 'Ⳅ'),
+ ('Ⳇ', 'Ⳇ'),
+ ('Ⳉ', 'Ⳉ'),
+ ('Ⳋ', 'Ⳋ'),
+ ('Ⳍ', 'Ⳍ'),
+ ('Ⳏ', 'Ⳏ'),
+ ('Ⳑ', 'Ⳑ'),
+ ('Ⳓ', 'Ⳓ'),
+ ('Ⳕ', 'Ⳕ'),
+ ('Ⳗ', 'Ⳗ'),
+ ('Ⳙ', 'Ⳙ'),
+ ('Ⳛ', 'Ⳛ'),
+ ('Ⳝ', 'Ⳝ'),
+ ('Ⳟ', 'Ⳟ'),
+ ('Ⳡ', 'Ⳡ'),
+ ('Ⳣ', 'Ⳣ'),
+ ('Ⳬ', 'Ⳬ'),
+ ('Ⳮ', 'Ⳮ'),
+ ('Ⳳ', 'Ⳳ'),
+ ('Ꙁ', 'Ꙁ'),
+ ('Ꙃ', 'Ꙃ'),
+ ('Ꙅ', 'Ꙅ'),
+ ('Ꙇ', 'Ꙇ'),
+ ('Ꙉ', 'Ꙉ'),
+ ('Ꙋ', 'Ꙋ'),
+ ('Ꙍ', 'Ꙍ'),
+ ('Ꙏ', 'Ꙏ'),
+ ('Ꙑ', 'Ꙑ'),
+ ('Ꙓ', 'Ꙓ'),
+ ('Ꙕ', 'Ꙕ'),
+ ('Ꙗ', 'Ꙗ'),
+ ('Ꙙ', 'Ꙙ'),
+ ('Ꙛ', 'Ꙛ'),
+ ('Ꙝ', 'Ꙝ'),
+ ('Ꙟ', 'Ꙟ'),
+ ('Ꙡ', 'Ꙡ'),
+ ('Ꙣ', 'Ꙣ'),
+ ('Ꙥ', 'Ꙥ'),
+ ('Ꙧ', 'Ꙧ'),
+ ('Ꙩ', 'Ꙩ'),
+ ('Ꙫ', 'Ꙫ'),
+ ('Ꙭ', 'Ꙭ'),
+ ('Ꚁ', 'Ꚁ'),
+ ('Ꚃ', 'Ꚃ'),
+ ('Ꚅ', 'Ꚅ'),
+ ('Ꚇ', 'Ꚇ'),
+ ('Ꚉ', 'Ꚉ'),
+ ('Ꚋ', 'Ꚋ'),
+ ('Ꚍ', 'Ꚍ'),
+ ('Ꚏ', 'Ꚏ'),
+ ('Ꚑ', 'Ꚑ'),
+ ('Ꚓ', 'Ꚓ'),
+ ('Ꚕ', 'Ꚕ'),
+ ('Ꚗ', 'Ꚗ'),
+ ('Ꚙ', 'Ꚙ'),
+ ('Ꚛ', 'Ꚛ'),
+ ('Ꜣ', 'Ꜣ'),
+ ('Ꜥ', 'Ꜥ'),
+ ('Ꜧ', 'Ꜧ'),
+ ('Ꜩ', 'Ꜩ'),
+ ('Ꜫ', 'Ꜫ'),
+ ('Ꜭ', 'Ꜭ'),
+ ('Ꜯ', 'Ꜯ'),
+ ('Ꜳ', 'Ꜳ'),
+ ('Ꜵ', 'Ꜵ'),
+ ('Ꜷ', 'Ꜷ'),
+ ('Ꜹ', 'Ꜹ'),
+ ('Ꜻ', 'Ꜻ'),
+ ('Ꜽ', 'Ꜽ'),
+ ('Ꜿ', 'Ꜿ'),
+ ('Ꝁ', 'Ꝁ'),
+ ('Ꝃ', 'Ꝃ'),
+ ('Ꝅ', 'Ꝅ'),
+ ('Ꝇ', 'Ꝇ'),
+ ('Ꝉ', 'Ꝉ'),
+ ('Ꝋ', 'Ꝋ'),
+ ('Ꝍ', 'Ꝍ'),
+ ('Ꝏ', 'Ꝏ'),
+ ('Ꝑ', 'Ꝑ'),
+ ('Ꝓ', 'Ꝓ'),
+ ('Ꝕ', 'Ꝕ'),
+ ('Ꝗ', 'Ꝗ'),
+ ('Ꝙ', 'Ꝙ'),
+ ('Ꝛ', 'Ꝛ'),
+ ('Ꝝ', 'Ꝝ'),
+ ('Ꝟ', 'Ꝟ'),
+ ('Ꝡ', 'Ꝡ'),
+ ('Ꝣ', 'Ꝣ'),
+ ('Ꝥ', 'Ꝥ'),
+ ('Ꝧ', 'Ꝧ'),
+ ('Ꝩ', 'Ꝩ'),
+ ('Ꝫ', 'Ꝫ'),
+ ('Ꝭ', 'Ꝭ'),
+ ('Ꝯ', 'Ꝯ'),
+ ('Ꝺ', 'Ꝺ'),
+ ('Ꝼ', 'Ꝼ'),
+ ('Ᵹ', 'Ꝿ'),
+ ('Ꞁ', 'Ꞁ'),
+ ('Ꞃ', 'Ꞃ'),
+ ('Ꞅ', 'Ꞅ'),
+ ('Ꞇ', 'Ꞇ'),
+ ('Ꞌ', 'Ꞌ'),
+ ('Ɥ', 'Ɥ'),
+ ('Ꞑ', 'Ꞑ'),
+ ('Ꞓ', 'Ꞓ'),
+ ('Ꞗ', 'Ꞗ'),
+ ('Ꞙ', 'Ꞙ'),
+ ('Ꞛ', 'Ꞛ'),
+ ('Ꞝ', 'Ꞝ'),
+ ('Ꞟ', 'Ꞟ'),
+ ('Ꞡ', 'Ꞡ'),
+ ('Ꞣ', 'Ꞣ'),
+ ('Ꞥ', 'Ꞥ'),
+ ('Ꞧ', 'Ꞧ'),
+ ('Ꞩ', 'Ꞩ'),
+ ('Ɦ', 'Ɪ'),
+ ('Ʞ', 'Ꞵ'),
+ ('Ꞷ', 'Ꞷ'),
+ ('Ꞹ', 'Ꞹ'),
+ ('Ꞻ', 'Ꞻ'),
+ ('Ꞽ', 'Ꞽ'),
+ ('Ꞿ', 'Ꞿ'),
+ ('Ꟁ', 'Ꟁ'),
+ ('Ꟃ', 'Ꟃ'),
+ ('Ꞔ', 'Ꟈ'),
+ ('Ꟊ', 'Ꟊ'),
+ ('Ꟑ', 'Ꟑ'),
+ ('Ꟗ', 'Ꟗ'),
+ ('Ꟙ', 'Ꟙ'),
+ ('Ꟶ', 'Ꟶ'),
+ ('A', 'Z'),
+ ('𐐀', '𐐧'),
+ ('𐒰', '𐓓'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐲀', '𐲲'),
+ ('𑢠', '𑢿'),
+ ('𖹀', '𖹟'),
+ ('𝐀', '𝐙'),
+ ('𝐴', '𝑍'),
+ ('𝑨', '𝒁'),
+ ('𝒜', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒵'),
+ ('𝓐', '𝓩'),
+ ('𝔄', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔸', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕬', '𝖅'),
+ ('𝖠', '𝖹'),
+ ('𝗔', '𝗭'),
+ ('𝘈', '𝘡'),
+ ('𝘼', '𝙕'),
+ ('𝙰', '𝚉'),
+ ('𝚨', '𝛀'),
+ ('𝛢', '𝛺'),
+ ('𝜜', '𝜴'),
+ ('𝝖', '𝝮'),
+ ('𝞐', '𝞨'),
+ ('𝟊', '𝟊'),
+ ('𞤀', '𞤡'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+];
diff --git a/third_party/rust/regex-syntax/src/unicode_tables/word_break.rs b/third_party/rust/regex-syntax/src/unicode_tables/word_break.rs
new file mode 100644
index 0000000000..c0714956fe
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/unicode_tables/word_break.rs
@@ -0,0 +1,1120 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate word-break ucd-15.0.0 --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.14 is available on crates.io.
+
+pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
+ ("ALetter", ALETTER),
+ ("CR", CR),
+ ("Double_Quote", DOUBLE_QUOTE),
+ ("Extend", EXTEND),
+ ("ExtendNumLet", EXTENDNUMLET),
+ ("Format", FORMAT),
+ ("Hebrew_Letter", HEBREW_LETTER),
+ ("Katakana", KATAKANA),
+ ("LF", LF),
+ ("MidLetter", MIDLETTER),
+ ("MidNum", MIDNUM),
+ ("MidNumLet", MIDNUMLET),
+ ("Newline", NEWLINE),
+ ("Numeric", NUMERIC),
+ ("Regional_Indicator", REGIONAL_INDICATOR),
+ ("Single_Quote", SINGLE_QUOTE),
+ ("WSegSpace", WSEGSPACE),
+ ("ZWJ", ZWJ),
+];
+
+pub const ALETTER: &'static [(char, char)] = &[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', '˗'),
+ ('˞', '˿'),
+ ('Ͱ', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('Ҋ', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', '՜'),
+ ('՞', '՞'),
+ ('ՠ', 'ֈ'),
+ ('֊', '֊'),
+ ('׳', '׳'),
+ ('ؠ', 'ي'),
+ ('ٮ', 'ٯ'),
+ ('ٱ', 'ۓ'),
+ ('ە', 'ە'),
+ ('ۥ', 'ۦ'),
+ ('ۮ', 'ۯ'),
+ ('ۺ', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', 'ܐ'),
+ ('ܒ', 'ܯ'),
+ ('ݍ', 'ޥ'),
+ ('ޱ', 'ޱ'),
+ ('ߊ', 'ߪ'),
+ ('ߴ', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('ࠀ', 'ࠕ'),
+ ('ࠚ', 'ࠚ'),
+ ('ࠤ', 'ࠤ'),
+ ('ࠨ', 'ࠨ'),
+ ('ࡀ', 'ࡘ'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('ࢠ', 'ࣉ'),
+ ('ऄ', 'ह'),
+ ('ऽ', 'ऽ'),
+ ('ॐ', 'ॐ'),
+ ('क़', 'ॡ'),
+ ('ॱ', 'ঀ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('ঽ', 'ঽ'),
+ ('ৎ', 'ৎ'),
+ ('ড়', 'ঢ়'),
+ ('য়', 'ৡ'),
+ ('ৰ', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('ੲ', 'ੴ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('ઽ', 'ઽ'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', 'ૡ'),
+ ('ૹ', 'ૹ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('ଽ', 'ଽ'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', 'ୡ'),
+ ('ୱ', 'ୱ'),
+ ('ஃ', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('ௐ', 'ௐ'),
+ ('అ', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('ఽ', 'ఽ'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', 'ౡ'),
+ ('ಀ', 'ಀ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('ಽ', 'ಽ'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', 'ೡ'),
+ ('ೱ', 'ೲ'),
+ ('ഄ', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', 'ഺ'),
+ ('ഽ', 'ഽ'),
+ ('ൎ', 'ൎ'),
+ ('ൔ', 'ൖ'),
+ ('ൟ', 'ൡ'),
+ ('ൺ', 'ൿ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('ༀ', 'ༀ'),
+ ('ཀ', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('ྈ', 'ྌ'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', 'ᜑ'),
+ ('ᜟ', 'ᜱ'),
+ ('ᝀ', 'ᝑ'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢄ'),
+ ('ᢇ', 'ᢨ'),
+ ('ᢪ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('ᨀ', 'ᨖ'),
+ ('ᬅ', 'ᬳ'),
+ ('ᭅ', 'ᭌ'),
+ ('ᮃ', 'ᮠ'),
+ ('ᮮ', 'ᮯ'),
+ ('ᮺ', 'ᯥ'),
+ ('ᰀ', 'ᰣ'),
+ ('ᱍ', 'ᱏ'),
+ ('ᱚ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('ᳩ', 'ᳬ'),
+ ('ᳮ', 'ᳳ'),
+ ('ᳵ', 'ᳶ'),
+ ('ᳺ', 'ᳺ'),
+ ('ᴀ', 'ᶿ'),
+ ('Ḁ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⓐ', 'ⓩ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳮ'),
+ ('Ⳳ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('ⶀ', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '々'),
+ ('〻', '〼'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ꀀ', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘟ'),
+ ('ꘪ', 'ꘫ'),
+ ('Ꙁ', 'ꙮ'),
+ ('ꙿ', 'ꚝ'),
+ ('ꚠ', 'ꛯ'),
+ ('꜈', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠁ'),
+ ('ꠃ', 'ꠅ'),
+ ('ꠇ', 'ꠊ'),
+ ('ꠌ', 'ꠢ'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢂ', 'ꢳ'),
+ ('ꣲ', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', 'ꣾ'),
+ ('ꤊ', 'ꤥ'),
+ ('ꤰ', 'ꥆ'),
+ ('ꥠ', 'ꥼ'),
+ ('ꦄ', 'ꦲ'),
+ ('ꧏ', 'ꧏ'),
+ ('ꨀ', 'ꨨ'),
+ ('ꩀ', 'ꩂ'),
+ ('ꩄ', 'ꩋ'),
+ ('ꫠ', 'ꫪ'),
+ ('ꫲ', 'ꫴ'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭩ'),
+ ('ꭰ', 'ꯢ'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('ﭐ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('ᅠ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '𐍵'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '𐨀'),
+ ('𐨐', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '𐫤'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '𐴣'),
+ ('𐺀', '𐺩'),
+ ('𐺰', '𐺱'),
+ ('𐼀', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '𐽅'),
+ ('𐽰', '𐾁'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀃', '𑀷'),
+ ('𑁱', '𑁲'),
+ ('𑁵', '𑁵'),
+ ('𑂃', '𑂯'),
+ ('𑃐', '𑃨'),
+ ('𑄃', '𑄦'),
+ ('𑅄', '𑅄'),
+ ('𑅇', '𑅇'),
+ ('𑅐', '𑅲'),
+ ('𑅶', '𑅶'),
+ ('𑆃', '𑆲'),
+ ('𑇁', '𑇄'),
+ ('𑇚', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '𑈫'),
+ ('𑈿', '𑉀'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '𑋞'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('𑌽', '𑌽'),
+ ('𑍐', '𑍐'),
+ ('𑍝', '𑍡'),
+ ('𑐀', '𑐴'),
+ ('𑑇', '𑑊'),
+ ('𑑟', '𑑡'),
+ ('𑒀', '𑒯'),
+ ('𑓄', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑖀', '𑖮'),
+ ('𑗘', '𑗛'),
+ ('𑘀', '𑘯'),
+ ('𑙄', '𑙄'),
+ ('𑚀', '𑚪'),
+ ('𑚸', '𑚸'),
+ ('𑠀', '𑠫'),
+ ('𑢠', '𑣟'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤯'),
+ ('𑤿', '𑤿'),
+ ('𑥁', '𑥁'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '𑧐'),
+ ('𑧡', '𑧡'),
+ ('𑧣', '𑧣'),
+ ('𑨀', '𑨀'),
+ ('𑨋', '𑨲'),
+ ('𑨺', '𑨺'),
+ ('𑩐', '𑩐'),
+ ('𑩜', '𑪉'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '𑰮'),
+ ('𑱀', '𑱀'),
+ ('𑱲', '𑲏'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '𑴰'),
+ ('𑵆', '𑵆'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶉'),
+ ('𑶘', '𑶘'),
+ ('𑻠', '𑻲'),
+ ('𑼂', '𑼂'),
+ ('𑼄', '𑼐'),
+ ('𑼒', '𑼳'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('𓑁', '𓑆'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩰', '𖪾'),
+ ('𖫐', '𖫭'),
+ ('𖬀', '𖬯'),
+ ('𖭀', '𖭃'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('𖽐', '𖽐'),
+ ('𖾓', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '𖿣'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('𞀰', '𞁭'),
+ ('𞄀', '𞄬'),
+ ('𞄷', '𞄽'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '𞊭'),
+ ('𞋀', '𞋫'),
+ ('𞓐', '𞓫'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('𞤀', '𞥃'),
+ ('𞥋', '𞥋'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+];
+
+pub const CR: &'static [(char, char)] = &[('\r', '\r')];
+
+pub const DOUBLE_QUOTE: &'static [(char, char)] = &[('"', '"')];
+
+pub const EXTEND: &'static [(char, char)] = &[
+ ('\u{300}', '\u{36f}'),
+ ('\u{483}', '\u{489}'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('\u{610}', '\u{61a}'),
+ ('\u{64b}', '\u{65f}'),
+ ('\u{670}', '\u{670}'),
+ ('\u{6d6}', '\u{6dc}'),
+ ('\u{6df}', '\u{6e4}'),
+ ('\u{6e7}', '\u{6e8}'),
+ ('\u{6ea}', '\u{6ed}'),
+ ('\u{711}', '\u{711}'),
+ ('\u{730}', '\u{74a}'),
+ ('\u{7a6}', '\u{7b0}'),
+ ('\u{7eb}', '\u{7f3}'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('\u{816}', '\u{819}'),
+ ('\u{81b}', '\u{823}'),
+ ('\u{825}', '\u{827}'),
+ ('\u{829}', '\u{82d}'),
+ ('\u{859}', '\u{85b}'),
+ ('\u{898}', '\u{89f}'),
+ ('\u{8ca}', '\u{8e1}'),
+ ('\u{8e3}', 'ः'),
+ ('\u{93a}', '\u{93c}'),
+ ('ा', 'ॏ'),
+ ('\u{951}', '\u{957}'),
+ ('\u{962}', '\u{963}'),
+ ('\u{981}', 'ঃ'),
+ ('\u{9bc}', '\u{9bc}'),
+ ('\u{9be}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', '\u{9cd}'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('\u{9e2}', '\u{9e3}'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('\u{a70}', '\u{a71}'),
+ ('\u{a75}', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('\u{abc}', '\u{abc}'),
+ ('ા', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('\u{ae2}', '\u{ae3}'),
+ ('\u{afa}', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('\u{b3c}', '\u{b3c}'),
+ ('\u{b3e}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('\u{b62}', '\u{b63}'),
+ ('\u{b82}', '\u{b82}'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('\u{c00}', '\u{c04}'),
+ ('\u{c3c}', '\u{c3c}'),
+ ('\u{c3e}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('\u{c62}', '\u{c63}'),
+ ('\u{c81}', 'ಃ'),
+ ('\u{cbc}', '\u{cbc}'),
+ ('ಾ', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('\u{ce2}', '\u{ce3}'),
+ ('ೳ', 'ೳ'),
+ ('\u{d00}', 'ഃ'),
+ ('\u{d3b}', '\u{d3c}'),
+ ('\u{d3e}', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', '\u{d4d}'),
+ ('\u{d57}', '\u{d57}'),
+ ('\u{d62}', '\u{d63}'),
+ ('\u{d81}', 'ඃ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('ෲ', 'ෳ'),
+ ('\u{e31}', '\u{e31}'),
+ ('\u{e34}', '\u{e3a}'),
+ ('\u{e47}', '\u{e4e}'),
+ ('\u{eb1}', '\u{eb1}'),
+ ('\u{eb4}', '\u{ebc}'),
+ ('\u{ec8}', '\u{ece}'),
+ ('\u{f18}', '\u{f19}'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', '༿'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f87}'),
+ ('\u{f8d}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('ါ', '\u{103e}'),
+ ('ၖ', '\u{1059}'),
+ ('\u{105e}', '\u{1060}'),
+ ('ၢ', 'ၤ'),
+ ('ၧ', 'ၭ'),
+ ('\u{1071}', '\u{1074}'),
+ ('\u{1082}', '\u{108d}'),
+ ('ႏ', 'ႏ'),
+ ('ႚ', '\u{109d}'),
+ ('\u{135d}', '\u{135f}'),
+ ('\u{1712}', '᜕'),
+ ('\u{1732}', '᜴'),
+ ('\u{1752}', '\u{1753}'),
+ ('\u{1772}', '\u{1773}'),
+ ('\u{17b4}', '\u{17d3}'),
+ ('\u{17dd}', '\u{17dd}'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '\u{180f}'),
+ ('\u{1885}', '\u{1886}'),
+ ('\u{18a9}', '\u{18a9}'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('\u{1a17}', '\u{1a1b}'),
+ ('ᩕ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '\u{1a7f}'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', 'ᬄ'),
+ ('\u{1b34}', '᭄'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', 'ᮂ'),
+ ('ᮡ', '\u{1bad}'),
+ ('\u{1be6}', '᯳'),
+ ('ᰤ', '\u{1c37}'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', '\u{1ce8}'),
+ ('\u{1ced}', '\u{1ced}'),
+ ('\u{1cf4}', '\u{1cf4}'),
+ ('᳷', '\u{1cf9}'),
+ ('\u{1dc0}', '\u{1dff}'),
+ ('\u{200c}', '\u{200c}'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('\u{2cef}', '\u{2cf1}'),
+ ('\u{2d7f}', '\u{2d7f}'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('\u{302a}', '\u{302f}'),
+ ('\u{3099}', '\u{309a}'),
+ ('\u{a66f}', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('\u{a69e}', '\u{a69f}'),
+ ('\u{a6f0}', '\u{a6f1}'),
+ ('\u{a802}', '\u{a802}'),
+ ('\u{a806}', '\u{a806}'),
+ ('\u{a80b}', '\u{a80b}'),
+ ('ꠣ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꢀ', 'ꢁ'),
+ ('ꢴ', '\u{a8c5}'),
+ ('\u{a8e0}', '\u{a8f1}'),
+ ('\u{a8ff}', '\u{a8ff}'),
+ ('\u{a926}', '\u{a92d}'),
+ ('\u{a947}', '꥓'),
+ ('\u{a980}', 'ꦃ'),
+ ('\u{a9b3}', '꧀'),
+ ('\u{a9e5}', '\u{a9e5}'),
+ ('\u{aa29}', '\u{aa36}'),
+ ('\u{aa43}', '\u{aa43}'),
+ ('\u{aa4c}', 'ꩍ'),
+ ('ꩻ', 'ꩽ'),
+ ('\u{aab0}', '\u{aab0}'),
+ ('\u{aab2}', '\u{aab4}'),
+ ('\u{aab7}', '\u{aab8}'),
+ ('\u{aabe}', '\u{aabf}'),
+ ('\u{aac1}', '\u{aac1}'),
+ ('ꫫ', 'ꫯ'),
+ ('ꫵ', '\u{aaf6}'),
+ ('ꯣ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('\u{fb1e}', '\u{fb1e}'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('\u{ff9e}', '\u{ff9f}'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('\u{10376}', '\u{1037a}'),
+ ('\u{10a01}', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '\u{10a0f}'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('\u{10ae5}', '\u{10ae6}'),
+ ('\u{10d24}', '\u{10d27}'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('\u{10efd}', '\u{10eff}'),
+ ('\u{10f46}', '\u{10f50}'),
+ ('\u{10f82}', '\u{10f85}'),
+ ('𑀀', '𑀂'),
+ ('\u{11038}', '\u{11046}'),
+ ('\u{11070}', '\u{11070}'),
+ ('\u{11073}', '\u{11074}'),
+ ('\u{1107f}', '𑂂'),
+ ('𑂰', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('\u{11100}', '\u{11102}'),
+ ('\u{11127}', '\u{11134}'),
+ ('𑅅', '𑅆'),
+ ('\u{11173}', '\u{11173}'),
+ ('\u{11180}', '𑆂'),
+ ('𑆳', '𑇀'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '\u{111cf}'),
+ ('𑈬', '\u{11237}'),
+ ('\u{1123e}', '\u{1123e}'),
+ ('\u{11241}', '\u{11241}'),
+ ('\u{112df}', '\u{112ea}'),
+ ('\u{11300}', '𑌃'),
+ ('\u{1133b}', '\u{1133c}'),
+ ('\u{1133e}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍢', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐵', '\u{11446}'),
+ ('\u{1145e}', '\u{1145e}'),
+ ('\u{114b0}', '\u{114c3}'),
+ ('\u{115af}', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('\u{115dc}', '\u{115dd}'),
+ ('𑘰', '\u{11640}'),
+ ('\u{116ab}', '\u{116b7}'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑠬', '\u{1183a}'),
+ ('\u{11930}', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{1193e}'),
+ ('𑥀', '𑥀'),
+ ('𑥂', '\u{11943}'),
+ ('𑧑', '\u{119d7}'),
+ ('\u{119da}', '\u{119e0}'),
+ ('𑧤', '𑧤'),
+ ('\u{11a01}', '\u{11a0a}'),
+ ('\u{11a33}', '𑨹'),
+ ('\u{11a3b}', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('\u{11a51}', '\u{11a5b}'),
+ ('\u{11a8a}', '\u{11a99}'),
+ ('𑰯', '\u{11c36}'),
+ ('\u{11c38}', '\u{11c3f}'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('\u{11d31}', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d45}'),
+ ('\u{11d47}', '\u{11d47}'),
+ ('𑶊', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '\u{11d97}'),
+ ('\u{11ef3}', '𑻶'),
+ ('\u{11f00}', '\u{11f01}'),
+ ('𑼃', '𑼃'),
+ ('𑼴', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('\u{13440}', '\u{13440}'),
+ ('\u{13447}', '\u{13455}'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('\u{16b30}', '\u{16b36}'),
+ ('\u{16f4f}', '\u{16f4f}'),
+ ('𖽑', '𖾇'),
+ ('\u{16f8f}', '\u{16f92}'),
+ ('\u{16fe4}', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('\u{1e130}', '\u{1e136}'),
+ ('\u{1e2ae}', '\u{1e2ae}'),
+ ('\u{1e2ec}', '\u{1e2ef}'),
+ ('\u{1e4ec}', '\u{1e4ef}'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('\u{1e944}', '\u{1e94a}'),
+ ('🏻', '🏿'),
+ ('\u{e0020}', '\u{e007f}'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
+
+pub const EXTENDNUMLET: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ ('_', '_'),
+ ('\u{202f}', '\u{202f}'),
+ ('‿', '⁀'),
+ ('⁔', '⁔'),
+ ('︳', '︴'),
+ ('﹍', '﹏'),
+ ('\u{ff3f}', '\u{ff3f}'), // FULLWIDTH LOW LINE
+];
+
+pub const FORMAT: &'static [(char, char)] = &[
+ ('\u{ad}', '\u{ad}'),
+ ('\u{600}', '\u{605}'),
+ ('\u{61c}', '\u{61c}'),
+ ('\u{6dd}', '\u{6dd}'),
+ ('\u{70f}', '\u{70f}'),
+ ('\u{890}', '\u{891}'),
+ ('\u{8e2}', '\u{8e2}'),
+ ('\u{180e}', '\u{180e}'),
+ ('\u{200e}', '\u{200f}'),
+ ('\u{202a}', '\u{202e}'),
+ ('\u{2060}', '\u{2064}'),
+ ('\u{2066}', '\u{206f}'),
+ ('\u{feff}', '\u{feff}'),
+ ('\u{fff9}', '\u{fffb}'),
+ ('\u{110bd}', '\u{110bd}'),
+ ('\u{110cd}', '\u{110cd}'),
+ ('\u{13430}', '\u{1343f}'),
+ ('\u{1bca0}', '\u{1bca3}'),
+ ('\u{1d173}', '\u{1d17a}'),
+ ('\u{e0001}', '\u{e0001}'),
+];
+
+pub const HEBREW_LETTER: &'static [(char, char)] = &[
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('יִ', 'יִ'),
+ ('ײַ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﭏ'),
+];
+
+pub const KATAKANA: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ ('〱', '〵'),
+ ('゛', '゜'),
+ ('゠', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㋐', '㋾'),
+ ('㌀', '㍗'),
+ ('\u{ff66}', '\u{ff9d}'), // HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER N
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛀀'),
+ ('𛄠', '𛄢'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+];
+
+pub const LF: &'static [(char, char)] = &[('\n', '\n')];
+
+pub const MIDLETTER: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ (':', ':'),
+ ('·', '·'),
+ ('\u{387}', '\u{387}'), // GREEK ANO TELEIA (canonically equivalent to U+00B7, but a distinct code point)
+ ('՟', '՟'),
+ ('״', '״'),
+ ('‧', '‧'),
+ ('︓', '︓'),
+ ('﹕', '﹕'),
+ ('\u{ff1a}', '\u{ff1a}'), // FULLWIDTH COLON
+];
+
+pub const MIDNUM: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ (',', ','),
+ (';', ';'),
+ ('\u{37e}', '\u{37e}'), // GREEK QUESTION MARK (canonically equivalent to U+003B, but a distinct code point)
+ ('։', '։'),
+ ('،', '؍'),
+ ('٬', '٬'),
+ ('߸', '߸'),
+ ('⁄', '⁄'),
+ ('︐', '︐'),
+ ('︔', '︔'),
+ ('﹐', '﹐'),
+ ('﹔', '﹔'),
+ ('\u{ff0c}', '\u{ff0c}'), // FULLWIDTH COMMA
+ ('\u{ff1b}', '\u{ff1b}'), // FULLWIDTH SEMICOLON
+];
+
+pub const MIDNUMLET: &'static [(char, char)] = &[
+ ('.', '.'),
+ ('‘', '’'),
+ ('․', '․'),
+ ('﹒', '﹒'),
+ (''', '''),
+ ('.', '.'),
+];
+
+pub const NEWLINE: &'static [(char, char)] =
+ &[('\u{b}', '\u{c}'), ('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')];
+
+pub const NUMERIC: &'static [(char, char)] = &[ // inclusive (start, end) ranges, ascending
+ ('0', '9'),
+ ('٠', '٩'),
+ ('٫', '٫'),
+ ('۰', '۹'),
+ ('߀', '߉'),
+ ('०', '९'),
+ ('০', '৯'),
+ ('੦', '੯'),
+ ('૦', '૯'),
+ ('୦', '୯'),
+ ('௦', '௯'),
+ ('౦', '౯'),
+ ('೦', '೯'),
+ ('൦', '൯'),
+ ('෦', '෯'),
+ ('๐', '๙'),
+ ('໐', '໙'),
+ ('༠', '༩'),
+ ('၀', '၉'),
+ ('႐', '႙'),
+ ('០', '៩'),
+ ('᠐', '᠙'),
+ ('᥆', '᥏'),
+ ('᧐', '᧙'),
+ ('᪀', '᪉'),
+ ('᪐', '᪙'),
+ ('᭐', '᭙'),
+ ('᮰', '᮹'),
+ ('᱀', '᱉'),
+ ('᱐', '᱙'),
+ ('꘠', '꘩'),
+ ('꣐', '꣙'),
+ ('꤀', '꤉'),
+ ('꧐', '꧙'),
+ ('꧰', '꧹'),
+ ('꩐', '꩙'),
+ ('꯰', '꯹'),
+ ('\u{ff10}', '\u{ff19}'), // FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
+ ('𐒠', '𐒩'),
+ ('𐴰', '𐴹'),
+ ('𑁦', '𑁯'),
+ ('𑃰', '𑃹'),
+ ('𑄶', '𑄿'),
+ ('𑇐', '𑇙'),
+ ('𑋰', '𑋹'),
+ ('𑑐', '𑑙'),
+ ('𑓐', '𑓙'),
+ ('𑙐', '𑙙'),
+ ('𑛀', '𑛉'),
+ ('𑜰', '𑜹'),
+ ('𑣠', '𑣩'),
+ ('𑥐', '𑥙'),
+ ('𑱐', '𑱙'),
+ ('𑵐', '𑵙'),
+ ('𑶠', '𑶩'),
+ ('𑽐', '𑽙'),
+ ('𖩠', '𖩩'),
+ ('𖫀', '𖫉'),
+ ('𖭐', '𖭙'),
+ ('𝟎', '𝟿'),
+ ('𞅀', '𞅉'),
+ ('𞋰', '𞋹'),
+ ('𞓰', '𞓹'),
+ ('𞥐', '𞥙'),
+ ('🯰', '🯹'),
+];
+
+pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')];
+
+pub const SINGLE_QUOTE: &'static [(char, char)] = &[('\'', '\'')];
+
+pub const WSEGSPACE: &'static [(char, char)] = &[
+ (' ', ' '),
+ ('\u{1680}', '\u{1680}'),
+ ('\u{2000}', '\u{2006}'),
+ ('\u{2008}', '\u{200a}'),
+ ('\u{205f}', '\u{205f}'),
+ ('\u{3000}', '\u{3000}'),
+];
+
+pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')];
diff --git a/third_party/rust/regex-syntax/src/utf8.rs b/third_party/rust/regex-syntax/src/utf8.rs
new file mode 100644
index 0000000000..b9c8655320
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/utf8.rs
@@ -0,0 +1,587 @@
+/*!
+Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
+
+This sub-module is useful for constructing byte-based automata that need
+to embed UTF-8 decoding. The most common use of this module is in conjunction
+with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
+
+See the documentation on the `Utf8Sequences` iterator for more details and
+an example.
+
+# Wait, what is this?
+
+This is simplest to explain with an example. Let's say you wanted to test
+whether a particular byte sequence was a Cyrillic character. One possible
+scalar value range is `[0400-04FF]`. The set of allowed bytes for this
+range can be expressed as a sequence of byte ranges:
+
+```text
+[D0-D3][80-BF]
+```
+
+This is simple enough: simply encode the boundaries, `0400` encodes to
+`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
+corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
+
+However, what if you wanted to add the Cyrillic Supplementary characters to
+your range? Your range might then become `[0400-052F]`. The same procedure
+as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
+you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
+this isn't quite correct because this range doesn't capture many characters,
+for example, `04FF` (because its last byte, `BF`, isn't in the range `80-AF`).
+
+Instead, you need multiple sequences of byte ranges:
+
+```text
+[D0-D3][80-BF] # matches codepoints 0400-04FF
+[D4][80-AF] # matches codepoints 0500-052F
+```
+
+This gets even more complicated if you want bigger ranges, particularly if
+they naively contain surrogate codepoints. For example, the sequences of byte
+ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
+
+```text
+[0-7F]
+[C2-DF][80-BF]
+[E0][A0-BF][80-BF]
+[E1-EC][80-BF][80-BF]
+[ED][80-9F][80-BF]
+[EE-EF][80-BF][80-BF]
+```
+
+Note that the byte ranges above will *not* match any erroneous encoding of
+UTF-8, including encodings of surrogate codepoints.
+
+And, of course, for all of Unicode (`[000000-10FFFF]`):
+
+```text
+[0-7F]
+[C2-DF][80-BF]
+[E0][A0-BF][80-BF]
+[E1-EC][80-BF][80-BF]
+[ED][80-9F][80-BF]
+[EE-EF][80-BF][80-BF]
+[F0][90-BF][80-BF][80-BF]
+[F1-F3][80-BF][80-BF][80-BF]
+[F4][80-8F][80-BF][80-BF]
+```
+
+This module automates the process of creating these byte ranges from ranges of
+Unicode scalar values.
+
+# Lineage
+
+I got the idea and general implementation strategy from Russ Cox in his
+[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
+Russ Cox got it from Ken Thompson's `grep` (no source, folklore?).
+I also got the idea from
+[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
+which uses it for executing automata on their term index.
+*/
+
+#![deny(missing_docs)]
+
+use std::char;
+use std::fmt;
+use std::iter::FusedIterator;
+use std::slice;
+
+const MAX_UTF8_BYTES: usize = 4;
+
+/// Utf8Sequence represents a sequence of byte ranges.
+///
+/// To match a Utf8Sequence, a candidate byte sequence must match each
+/// successive range.
+///
+/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
+/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Utf8Sequence {
+ /// One byte range.
+ One(Utf8Range),
+ /// Two successive byte ranges.
+ Two([Utf8Range; 2]),
+ /// Three successive byte ranges.
+ Three([Utf8Range; 3]),
+ /// Four successive byte ranges.
+ Four([Utf8Range; 4]),
+}
+
+impl Utf8Sequence {
+ /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
+ /// range.
+ ///
+ /// This assumes that `start` and `end` have the same length.
+ fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
+ assert_eq!(start.len(), end.len());
+ match start.len() {
+ 2 => Utf8Sequence::Two([
+ Utf8Range::new(start[0], end[0]),
+ Utf8Range::new(start[1], end[1]),
+ ]),
+ 3 => Utf8Sequence::Three([
+ Utf8Range::new(start[0], end[0]),
+ Utf8Range::new(start[1], end[1]),
+ Utf8Range::new(start[2], end[2]),
+ ]),
+ 4 => Utf8Sequence::Four([
+ Utf8Range::new(start[0], end[0]),
+ Utf8Range::new(start[1], end[1]),
+ Utf8Range::new(start[2], end[2]),
+ Utf8Range::new(start[3], end[3]),
+ ]),
+ n => unreachable!("invalid encoded length: {}", n),
+ }
+ }
+
+ /// Returns the underlying sequence of byte ranges as a slice.
+ pub fn as_slice(&self) -> &[Utf8Range] {
+ use self::Utf8Sequence::*;
+ match *self {
+ One(ref r) => slice::from_ref(r),
+ Two(ref r) => &r[..],
+ Three(ref r) => &r[..],
+ Four(ref r) => &r[..],
+ }
+ }
+
+ /// Returns the number of byte ranges in this sequence.
+ ///
+ /// The length is guaranteed to be in the closed interval `[1, 4]`.
+ pub fn len(&self) -> usize {
+ self.as_slice().len()
+ }
+
+ /// Reverses the ranges in this sequence.
+ ///
+ /// For example, if this corresponds to the following sequence:
+ ///
+ /// ```text
+ /// [D0-D3][80-BF]
+ /// ```
+ ///
+ /// Then after reversal, it will be
+ ///
+ /// ```text
+ /// [80-BF][D0-D3]
+ /// ```
+ ///
+ /// This is useful when one is constructing a UTF-8 automaton to match
+ /// character classes in reverse.
+ pub fn reverse(&mut self) {
+ match *self {
+ Utf8Sequence::One(_) => {}
+ Utf8Sequence::Two(ref mut x) => x.reverse(),
+ Utf8Sequence::Three(ref mut x) => x.reverse(),
+ Utf8Sequence::Four(ref mut x) => x.reverse(),
+ }
+ }
+
+ /// Returns true if and only if a prefix of `bytes` matches this sequence
+ /// of byte ranges.
+ pub fn matches(&self, bytes: &[u8]) -> bool {
+ if bytes.len() < self.len() {
+ return false;
+ }
+ for (&b, r) in bytes.iter().zip(self) {
+ if !r.matches(b) {
+ return false;
+ }
+ }
+ true
+ }
+}
+
+impl<'a> IntoIterator for &'a Utf8Sequence {
+ type IntoIter = slice::Iter<'a, Utf8Range>;
+ type Item = &'a Utf8Range;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.as_slice().iter()
+ }
+}
+
+impl fmt::Debug for Utf8Sequence {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use self::Utf8Sequence::*;
+ match *self {
+ One(ref r) => write!(f, "{:?}", r),
+ Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
+ Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
+ Four(ref r) => {
+ write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
+ }
+ }
+ }
+}
+
+/// A single inclusive range of UTF-8 bytes.
+#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub struct Utf8Range {
+ /// Start of byte range (inclusive).
+ pub start: u8,
+ /// End of byte range (inclusive).
+ pub end: u8,
+}
+
+impl Utf8Range {
+ fn new(start: u8, end: u8) -> Self {
+ Utf8Range { start, end }
+ }
+
+ /// Returns true if and only if the given byte is in this range.
+ pub fn matches(&self, b: u8) -> bool {
+ self.start <= b && b <= self.end
+ }
+}
+
+impl fmt::Debug for Utf8Range {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if self.start == self.end {
+ write!(f, "[{:X}]", self.start)
+ } else {
+ write!(f, "[{:X}-{:X}]", self.start, self.end)
+ }
+ }
+}
+
+/// An iterator over ranges of matching UTF-8 byte sequences.
+///
+/// The iteration represents an alternation of comprehensive byte sequences
+/// that match precisely the set of UTF-8 encoded scalar values.
+///
+/// A byte sequence corresponds to one of the scalar values in the range given
+/// if and only if it completely matches exactly one of the sequences of byte
+/// ranges produced by this iterator.
+///
+/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
+/// sequences will match the same bytes.
+///
+/// # Example
+///
+/// This shows how to match an arbitrary byte sequence against a range of
+/// scalar values.
+///
+/// ```rust
+/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
+///
+/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
+/// for range in seqs {
+/// if range.matches(bytes) {
+/// return true;
+/// }
+/// }
+/// false
+/// }
+///
+/// // Test the basic multilingual plane.
+/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
+///
+/// // UTF-8 encoding of 'a'.
+/// assert!(matches(&seqs, &[0x61]));
+/// // UTF-8 encoding of '☃' (`\u{2603}`).
+/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
+/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
+/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
+/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
+/// // which is invalid UTF-8, and therefore fails, despite the fact that
+/// // the corresponding codepoint (0xD800) falls in the range given.
+/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
+/// // And fails against plain old invalid UTF-8.
+/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
+/// ```
+///
+/// If this example seems circuitous, that's because it is! It's meant to be
+/// illustrative. In practice, you could just try to decode your byte sequence
+/// and compare it with the scalar value range directly. However, this is not
+/// always possible (for example, in a byte based automaton).
+#[derive(Debug)]
+pub struct Utf8Sequences {
+ range_stack: Vec<ScalarRange>,
+}
+
+impl Utf8Sequences {
+ /// Create a new iterator over UTF-8 byte ranges for the scalar value range
+ /// given.
+ pub fn new(start: char, end: char) -> Self {
+ let mut it = Utf8Sequences { range_stack: vec![] };
+ it.push(start as u32, end as u32);
+ it
+ }
+
+ /// reset resets the scalar value range.
+ /// Any existing state is cleared, but resources may be reused.
+ ///
+ /// N.B. Benchmarks say that this method is dubious.
+ #[doc(hidden)]
+ pub fn reset(&mut self, start: char, end: char) {
+ self.range_stack.clear();
+ self.push(start as u32, end as u32);
+ }
+
+ fn push(&mut self, start: u32, end: u32) {
+ self.range_stack.push(ScalarRange { start, end });
+ }
+}
+
+struct ScalarRange {
+ start: u32,
+ end: u32,
+}
+
+impl fmt::Debug for ScalarRange {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
+ }
+}
+
+impl Iterator for Utf8Sequences {
+ type Item = Utf8Sequence;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ 'TOP: while let Some(mut r) = self.range_stack.pop() {
+ 'INNER: loop {
+ if let Some((r1, r2)) = r.split() {
+ self.push(r2.start, r2.end);
+ r.start = r1.start;
+ r.end = r1.end;
+ continue 'INNER;
+ }
+ if !r.is_valid() {
+ continue 'TOP;
+ }
+ for i in 1..MAX_UTF8_BYTES {
+ let max = max_scalar_value(i);
+ if r.start <= max && max < r.end {
+ self.push(max + 1, r.end);
+ r.end = max;
+ continue 'INNER;
+ }
+ }
+ if let Some(ascii_range) = r.as_ascii() {
+ return Some(Utf8Sequence::One(ascii_range));
+ }
+ for i in 1..MAX_UTF8_BYTES {
+ let m = (1 << (6 * i)) - 1;
+ if (r.start & !m) != (r.end & !m) {
+ if (r.start & m) != 0 {
+ self.push((r.start | m) + 1, r.end);
+ r.end = r.start | m;
+ continue 'INNER;
+ }
+ if (r.end & m) != m {
+ self.push(r.end & !m, r.end);
+ r.end = (r.end & !m) - 1;
+ continue 'INNER;
+ }
+ }
+ }
+ let mut start = [0; MAX_UTF8_BYTES];
+ let mut end = [0; MAX_UTF8_BYTES];
+ let n = r.encode(&mut start, &mut end);
+ return Some(Utf8Sequence::from_encoded_range(
+ &start[0..n],
+ &end[0..n],
+ ));
+ }
+ }
+ None
+ }
+}
+
+impl FusedIterator for Utf8Sequences {}
+
+impl ScalarRange {
+ /// split splits this range if it overlaps with a surrogate codepoint.
+ ///
+ /// Either or both ranges may be invalid.
+ fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
+ if self.start < 0xE000 && self.end > 0xD7FF {
+ Some((
+ ScalarRange { start: self.start, end: 0xD7FF },
+ ScalarRange { start: 0xE000, end: self.end },
+ ))
+ } else {
+ None
+ }
+ }
+
+ /// is_valid returns true if and only if start <= end.
+ fn is_valid(&self) -> bool {
+ self.start <= self.end
+ }
+
+ /// as_ascii returns this range as a Utf8Range if and only if all scalar
+ /// values in this range can be encoded as a single byte.
+ fn as_ascii(&self) -> Option<Utf8Range> {
+ if self.is_ascii() {
+ Some(Utf8Range::new(self.start as u8, self.end as u8))
+ } else {
+ None
+ }
+ }
+
+ /// is_ascii returns true if the range is ASCII only (i.e., takes a single
+ /// byte to encode any scalar value).
+ fn is_ascii(&self) -> bool {
+ self.is_valid() && self.end <= 0x7f
+ }
+
+ /// encode writes the UTF-8 encoding of the start and end of this range
+ /// to the corresponding destination slices, and returns the number of
+ /// bytes written.
+ ///
+ /// The slices should have room for at least `MAX_UTF8_BYTES`.
+ fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
+ let cs = char::from_u32(self.start).unwrap();
+ let ce = char::from_u32(self.end).unwrap();
+ let ss = cs.encode_utf8(start);
+ let se = ce.encode_utf8(end);
+ assert_eq!(ss.len(), se.len());
+ ss.len()
+ }
+}
+
+fn max_scalar_value(nbytes: usize) -> u32 {
+ match nbytes {
+ 1 => 0x007F,
+ 2 => 0x07FF,
+ 3 => 0xFFFF,
+ 4 => 0x0010_FFFF,
+ _ => unreachable!("invalid UTF-8 byte sequence size"),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::char;
+
+ use crate::utf8::{Utf8Range, Utf8Sequences};
+
+ fn rutf8(s: u8, e: u8) -> Utf8Range {
+ Utf8Range::new(s, e)
+ }
+
+ fn never_accepts_surrogate_codepoints(start: char, end: char) {
+ for cp in 0xD800..0xE000 {
+ let buf = encode_surrogate(cp);
+ for r in Utf8Sequences::new(start, end) {
+ if r.matches(&buf) {
+ panic!(
+ "Sequence ({:X}, {:X}) contains range {:?}, \
+ which matches surrogate code point {:X} \
+ with encoded bytes {:?}",
+ start as u32, end as u32, r, cp, buf,
+ );
+ }
+ }
+ }
+ }
+
+ #[test]
+ fn codepoints_no_surrogates() {
+ never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
+ never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
+ never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
+ never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
+ never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
+ }
+
+ #[test]
+ fn single_codepoint_one_sequence() {
+ // Tests that every range of scalar values that contains a single
+ // scalar value is recognized by one sequence of byte ranges.
+ for i in 0x0..=0x0010_FFFF {
+ let c = match char::from_u32(i) {
+ None => continue,
+ Some(c) => c,
+ };
+ let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
+ assert_eq!(seqs.len(), 1);
+ }
+ }
+
+ #[test]
+ fn bmp() {
+ use crate::utf8::Utf8Sequence::*;
+
+ let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
+ assert_eq!(
+ seqs,
+ vec![
+ One(rutf8(0x0, 0x7F)),
+ Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
+ Three([
+ rutf8(0xE0, 0xE0),
+ rutf8(0xA0, 0xBF),
+ rutf8(0x80, 0xBF)
+ ]),
+ Three([
+ rutf8(0xE1, 0xEC),
+ rutf8(0x80, 0xBF),
+ rutf8(0x80, 0xBF)
+ ]),
+ Three([
+ rutf8(0xED, 0xED),
+ rutf8(0x80, 0x9F),
+ rutf8(0x80, 0xBF)
+ ]),
+ Three([
+ rutf8(0xEE, 0xEF),
+ rutf8(0x80, 0xBF),
+ rutf8(0x80, 0xBF)
+ ]),
+ ]
+ );
+ }
+
+ #[test]
+ fn reverse() {
+ use crate::utf8::Utf8Sequence::*;
+
+ let mut s = One(rutf8(0xA, 0xB));
+ s.reverse();
+ assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
+
+ let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
+ s.reverse();
+ assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
+
+ let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
+ s.reverse();
+ assert_eq!(
+ s.as_slice(),
+ &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
+ );
+
+ let mut s = Four([
+ rutf8(0xA, 0xB),
+ rutf8(0xB, 0xC),
+ rutf8(0xC, 0xD),
+ rutf8(0xD, 0xE),
+ ]);
+ s.reverse();
+ assert_eq!(
+ s.as_slice(),
+ &[
+ rutf8(0xD, 0xE),
+ rutf8(0xC, 0xD),
+ rutf8(0xB, 0xC),
+ rutf8(0xA, 0xB)
+ ]
+ );
+ }
+
+ fn encode_surrogate(cp: u32) -> [u8; 3] {
+ const TAG_CONT: u8 = 0b1000_0000;
+ const TAG_THREE_B: u8 = 0b1110_0000;
+
+ assert!(0xD800 <= cp && cp < 0xE000);
+ let mut dst = [0; 3];
+ dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
+ dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
+ dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
+ dst
+ }
+}
diff --git a/third_party/rust/regex-syntax/test b/third_party/rust/regex-syntax/test
new file mode 100755
index 0000000000..4b1b9fb1a9
--- /dev/null
+++ b/third_party/rust/regex-syntax/test
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -e
+
+# This is a convenience script for running a broad swath of the syntax tests.
+echo "===== DEFAULT FEATURES ==="
+cargo test
+
+features=(
+ unicode
+ unicode-age
+ unicode-bool
+ unicode-case
+ unicode-gencat
+ unicode-perl
+ unicode-script
+ unicode-segment
+)
+for f in "${features[@]}"; do
+ echo "===== FEATURE: $f ==="
+ cargo test --no-default-features --features "$f"
+done
diff --git a/third_party/rust/regex/.cargo-checksum.json b/third_party/rust/regex/.cargo-checksum.json
new file mode 100644
index 0000000000..1623d8918e
--- /dev/null
+++ b/third_party/rust/regex/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"CHANGELOG.md":"47b22859157339150c957dd72be9cf87aee341ebb3711efac5930efb10436368","Cargo.lock":"3445929e595d109e2f37d349ffad3dd6cb76e7203a029cf1955838d0438d68a4","Cargo.toml":"0abdf3ce883520254d94a04dcf831fb6f0b75bfda7bcf9c8500ca9a2d1f8ff44","HACKING.md":"17818f7a17723608f6bdbe6388ad0a913d4f96f76a16649aaf4e274b1fa0ea97","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","PERFORMANCE.md":"0d5ef3866386918dfdefb1aa9a28cfe33cb3c8ceeb79f3f8ba5b88253dd95991","README.md":"f69204a0f446047d8f4d1f3d84b75f235adb5c26477f3a37b671411bc954d14c","UNICODE.md":"a8a8399540eed000d19420135a527f400247a04572e44d124c786b870f518776","examples/regexdna-input.txt":"156a49710bb3e1ed4bc2bbb0af0f383b747b3d0281453cfff39c296124c598f8","examples/regexdna-output.txt":"35e85b19b70a893d752fd43e54e1e9da08bac43559191cea85b33387c24c4cc1","examples/shootout-regex-dna-bytes.rs":"fa2daedb4e0a05f64f33f4af62fbb0176db998e3676f8637ab684b725367a7b4","examples/shootout-regex-dna-cheat.rs":"1f871a6eaaf8372299fa3c762051112fa89a14235b03f734fc50ebd51ecaee72","examples/shootout-regex-dna-replace.rs":"32ffdf13ac6c4ce3fc32116a048e9cc682aa34cdb8e5beaf565a22addbdcd9ab","examples/shootout-regex-dna-single-cheat.rs":"809f75bf1e1917a53623eb6f1a3ce3b7d2ed98a6a1dbc0bd4853bec49a0c6f94","examples/shootout-regex-dna-single.rs":"1ab14f5703cd4be2e75a2e792e0ba1d322b9e4b14535d396805a4316d577f5bb","examples/shootout-regex-dna.rs":"20ea46ab63f91e3ac6a64e997eadd436a9cbc2f1bdade28e4512052f0e25bc34","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/backtrack.rs":"52987d80448f3d7f5d4e3545ddfc09f1f30de7602d9b5489961db4b215a377fd","src/compile.rs":"79a59be2d2db650b5a322e15e9bf1d3227944410bc780fc6089da8f4d2609b77","src/dfa.rs":"10273980d1f08aaff495e11efa240249a2b2c08a4db7c49c8d6759bc65a3b174","src/error.rs":"71c85db839514f26ee024a689061743ea94a34eb7a3291e6c2b69b45a9682d09","src/exec.
rs":"21495ab6813598204a444aeea3a0121674081389fd0f07fc3443eb8858b1c677","src/expand.rs":"71220309a3bac797f55129f49e79c03e96efec894ea338c735b78695367e04ca","src/find_byte.rs":"b387247b77e3269f057c3399aefe5a815032c3af918c876f80eb4b282e4eb95e","src/freqs.rs":"255555f3d95b08a5bb3bc2f38d5a06cc100a39c0f0127fe4f50c33afa1cadc65","src/input.rs":"13f49c1bce2fadd04a45b421d374cd0f8b72bef83f7e8fda958962aaccbe799a","src/lib.rs":"982fadba415c4c5b93f4d7d4a73a23ec88e2d96daaa03b679d14490ea0f63197","src/literal/imp.rs":"b7f63a861c299bea4baaab17353a420ee339c2cf76d3858c95f39342bd4463e7","src/literal/mod.rs":"533f1d68af088e9485170145e27518368e541a0337fdb44f63249ebf97310300","src/pattern.rs":"993d8b6b4bcea5e02bee3c76e17c356a5a47f8fc53c5555edfd1ebb71c0878bf","src/pikevm.rs":"6c0eaa7e878c945ac4c3c545c98f5706ad04846fc432a5086c8ee78eb030dfa7","src/pool.rs":"942e991ae31ef349bd76efd78b2a712c01166dec965bf93742977ed0870d5a10","src/prog.rs":"bebb3e50745bbc05d6c8240d972ba55a1818c51b1161dc1c21f3fe13c11d4884","src/re_builder.rs":"943344bf6e2fc90902ee04b11b741c32418ac6814b21b7982cc0a3a817713f3e","src/re_bytes.rs":"63ee1db1637a3764addb10e27248129acffaf78bb0a69624add4d9d6f1e97040","src/re_set.rs":"7921ac4a919b7a5deffe82d099a9ccaf5487aebd890dfb7a661e602c6ad3f1a9","src/re_trait.rs":"d237121b6f6b606836c72305cbcb3bbdbc54d1f6827d19a19cd0fbb4372e0145","src/re_unicode.rs":"4ca66d6e835df7c0f570c8cde52667ef90ba1687d5285f12fedef2e38ae925b4","src/sparse.rs":"0da3ddb7972109869248a764dbb10254555f4bb51c375e89fb3fab9cafa47320","src/testdata/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","src/testdata/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","src/testdata/basic.dat":"b5b33aa89d48a61cd67cb1fbfd8f70e62c83e30b86256f9f915a5190dd38ff06","src/testdata/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","src/testdata/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","src/utf8.rs":"f85a356ff5d5b19e417b
73ce1dd84581b21d283f6dddd195547c30af9c60bd1a","test":"0d62fdca7da12fc19ea5306b5de1d83e68d9365a029c043d524334da138b0304","tests/api.rs":"7b2a0ef75e99b9776094967bd66e9cdeaa8e11359f5f0a12bd08ef0e8d0c11fc","tests/api_str.rs":"2ae38c04e7e8fac008b609a820d0b1561ba75f39b0edc0987d6d3d06132da77f","tests/bytes.rs":"edc50f526c5fee43df89d639ef18b237e4eb91e9d533bfc43f3cbab7417d38ba","tests/consistent.rs":"d69435154c09478076497216e43081a835ac65147181a4fbddad7bff469605b2","tests/crates_regex.rs":"91a59d470e0700b4bcb3ff735d06799f3107b8ef4875a2e9904607b164be0326","tests/crazy.rs":"c0d56380dff19bdd5d7a3eb731d0e2dc564e169a1b73c81e1879b1e87f5f5f77","tests/flags.rs":"05caace2c81a99d2168037f3a38035d4dffe9f85ef3ebd7ef18b1bc6612f1ea8","tests/fowler.rs":"d78cf914de40b1e125cc92b65ccb444d462586bd07b5e05de4e4a1b5de16aa76","tests/macros.rs":"6db70c16fc90df13e6b30d2b606f8b6dd4dc976697967f6ee001b15aab6d0b19","tests/macros_bytes.rs":"a049f528a93173a1bb176cd46932dce1880679f4a1752e099be920f0e4546fd0","tests/macros_str.rs":"e585b1461374c45a2eca44ca045bc3c1fe984b2b4212e432b0c695b420e708b7","tests/misc.rs":"395f52793fa022e4cdda78675b6a6fba1a3106b4b99c834c39f7801574054bd1","tests/multiline.rs":"1b1a3326ed976437c1357f01d81833ece7ea244f38826246eab55cacd5d0862a","tests/noparse.rs":"12b6be0eff3d80779d33c6459396c74c0f6ebf4ddc9f1d33c3e747ea9e3bf268","tests/regression.rs":"1c965fefb8c7a2b1dfdab3e3fdeebaf47846555c50c8005e5537f96a52a3e252","tests/regression_fuzz.rs":"a504ec563e0d23bd2039493b7b1767fe1f831d7d668f6f4b2ecd124fc7899bcd","tests/replace.rs":"66f97532e40697934e2a77605b9002dfd22c46b6033ccb755e7660d855229f41","tests/searcher.rs":"ce35e47b0a276a7e8c9060c6a0b225ffba163aebc61fbc15555a6897fa0e552c","tests/set.rs":"f1e2af6baeeaed3cc99ed347ff516fe7b2eb0027ef64b891502e1486598eaf8a","tests/shortest_match.rs":"a2c94390c0d61bc24796b4c1288c924e90c8c9c6156fdebb858175177a194a42","tests/suffix_reverse.rs":"b95f89397404871227d9efe6df23b9ded147f183db81597e608f693955c668b5","tests/test_backtrack.rs":"b70c5e5f1241efd76dd9f9
dd4a4df8a7b38113bd407d1f5f56867f1176177a59","tests/test_backtrack_bytes.rs":"b8a111d4b4109c8bba7e2afb650572c495a14d357fb1f743c1076fb001f704b5","tests/test_backtrack_utf8bytes.rs":"c0c279785d18beac2b4e178e7bf6c14ed235d65f00ca467cfd9c333d79487649","tests/test_crates_regex.rs":"fd9525c2eef0e2f8cb7f787bc2b721bcd0b5d84f3bca49adfe48d657a99c721a","tests/test_default.rs":"c2dfa0298896f86f1be2abf6b0c347a7ca12f95aeac92bf614dc3b86bdfff269","tests/test_default_bytes.rs":"831d3e6bfb882feb15f700e30304bd34328f888fb4c15c7169371e25024ce9a7","tests/test_nfa.rs":"f119fc43a018249c39c813d57096b0654ff69f337345f2bbd9b0e61cc9137285","tests/test_nfa_bytes.rs":"89eae3bef6a1d0bcea6b5de5be35ad72f613f2ceb8b58fe82a6c6ef2ccdc07d0","tests/test_nfa_utf8bytes.rs":"7d830b4aa401887d7cf098b62fed4cd8017ef8b61f625c7c9a2159a6b4cfeb71","tests/unicode.rs":"1af9db7f09a6b0113b8a64733e06c8415fef720b2fdef227ae398d94332287cd","tests/word_boundary.rs":"7081317ddcec1e82dd4a2090a571c6abf2ff4bbfa8cd10395e1eb3f386157fae","tests/word_boundary_ascii.rs":"cd0be5b5b485de0ba7994b42e2864585556c3d2d8bf5eab05b58931d9aaf4b87","tests/word_boundary_unicode.rs":"75dbcc35d3abc0f9795c2ea99e216dc227b0a5b58e9ca5eef767815ff0513921"},"package":"48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"} \ No newline at end of file
diff --git a/third_party/rust/regex/CHANGELOG.md b/third_party/rust/regex/CHANGELOG.md
new file mode 100644
index 0000000000..466f5a9c92
--- /dev/null
+++ b/third_party/rust/regex/CHANGELOG.md
@@ -0,0 +1,1095 @@
+1.7.1 (2023-01-09)
+==================
+This release was done principally to try and fix the doc.rs rendering for the
+regex crate.
+
+Performance improvements:
+
+* [PERF #930](https://github.com/rust-lang/regex/pull/930):
+ Optimize `replacen`. This also applies to `replace`, but not `replace_all`.
+
+Bug fixes:
+
+* [BUG #945](https://github.com/rust-lang/regex/issues/945):
+ Maybe fix rustdoc rendering by just bumping a new release?
+
+
+1.7.0 (2022-11-05)
+==================
+This release principally includes an upgrade to Unicode 15.
+
+New features:
+
+* [FEATURE #916](https://github.com/rust-lang/regex/issues/916):
+ Upgrade to Unicode 15.
+
+
+1.6.0 (2022-07-05)
+==================
+This release principally includes an upgrade to Unicode 14.
+
+New features:
+
+* [FEATURE #832](https://github.com/rust-lang/regex/pull/832):
+ Clarify that `Captures::len` includes all groups, not just matching groups.
+* [FEATURE #857](https://github.com/rust-lang/regex/pull/857):
+ Add an `ExactSizeIterator` impl for `SubCaptureMatches`.
+* [FEATURE #861](https://github.com/rust-lang/regex/pull/861):
+ Improve `RegexSet` documentation examples.
+* [FEATURE #877](https://github.com/rust-lang/regex/issues/877):
+ Upgrade to Unicode 14.
+
+Bug fixes:
+
+* [BUG #792](https://github.com/rust-lang/regex/issues/792):
+ Fix error message rendering bug.
+
+
+1.5.6 (2022-05-20)
+==================
+This release includes a few bug fixes, including a bug that produced incorrect
+matches when a non-greedy `?` operator was used.
+
+* [BUG #680](https://github.com/rust-lang/regex/issues/680):
+ Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
+* [BUG #859](https://github.com/rust-lang/regex/issues/859):
+ Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`.
+* [BUG #862](https://github.com/rust-lang/regex/issues/862):
+ Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'.
+
+
+1.5.5 (2022-03-08)
+==================
+This release fixes a security bug in the regex compiler. This bug permits a
+vector for a denial-of-service attack in cases where the regex being compiled
+is untrusted. There are no known problems where the regex is itself trusted,
+including in cases of untrusted haystacks.
+
+* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8):
+ Fixes a bug in the regex compiler where empty sub-expressions subverted the
+ existing mitigations in place to enforce a size limit on compiled regexes.
+ The Rust Security Response WG published an advisory about this:
+ https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw
+
+
+1.5.4 (2021-05-06)
+==================
+This release fixes another compilation failure when building regex. This time,
+the fix is for when the `pattern` feature is enabled, which only works on
+nightly Rust. CI has been updated to test this case.
+
+* [BUG #772](https://github.com/rust-lang/regex/pull/772):
+ Fix build when `pattern` feature is enabled.
+
+
+1.5.3 (2021-05-01)
+==================
+This release fixes a bug when building regex with only the `unicode-perl`
+feature. It turns out that while CI was building this configuration, it wasn't
+actually failing the overall build on a failed compilation.
+
+* [BUG #769](https://github.com/rust-lang/regex/issues/769):
+ Fix build in `regex-syntax` when only the `unicode-perl` feature is enabled.
+
+
+1.5.2 (2021-05-01)
+==================
+This release fixes a performance bug when Unicode word boundaries are used.
+Namely, for certain regexes on certain inputs, it's possible for the lazy DFA
+to stop searching (causing a fallback to a slower engine) when it doesn't
+actually need to.
+
+[PR #768](https://github.com/rust-lang/regex/pull/768) fixes the bug, which was
+originally reported in
+[ripgrep#1860](https://github.com/BurntSushi/ripgrep/issues/1860).
+
+
+1.5.1 (2021-04-30)
+==================
+This is a patch release that fixes a compilation error when the `perf-literal`
+feature is not enabled.
+
+
+1.5.0 (2021-04-30)
+==================
+This release primarily updates to Rust 2018 (finally) and bumps the MSRV to
+Rust 1.41 (from Rust 1.28). Rust 1.41 was chosen because it's still reasonably
+old, and is what's in Debian stable at the time of writing.
+
+This release also drops this crate's own bespoke substring search algorithms
+in favor of a new
+[`memmem` implementation provided by the `memchr` crate](https://docs.rs/memchr/2.4.0/memchr/memmem/index.html).
+This will change the performance profile of some regexes, sometimes getting a
+little worse, and hopefully more frequently, getting a lot better. Please
+report any serious performance regressions if you find them.
+
+
+1.4.6 (2021-04-22)
+==================
+This is a small patch release that fixes the compiler's size check on how much
+heap memory a regex uses. Previously, the compiler did not account for the
+heap usage of Unicode character classes. Now it does. It's possible that this
+may make some regexes fail to compile that previously did compile. If that
+happens, please file an issue.
+
+* [BUG OSS-fuzz#33579](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579):
+ Some regexes can use more heap memory than one would expect.
+
+
+1.4.5 (2021-03-14)
+==================
+This is a small patch release that fixes a regression in the size of a `Regex`
+in the 1.4.4 release. Prior to 1.4.4, a `Regex` was 552 bytes. In the 1.4.4
+release, it was 856 bytes due to internal changes. In this release, a `Regex`
+is now 16 bytes. In general, the size of a `Regex` was never something that was
+on my radar, but this increased size in the 1.4.4 release seems to have crossed
+a threshold and resulted in stack overflows in some programs.
+
+* [BUG #750](https://github.com/rust-lang/regex/pull/750):
+ Fixes stack overflows seemingly caused by a large `Regex` size by decreasing
+ its size.
+
+
+1.4.4 (2021-03-11)
+==================
+This is a small patch release that contains some bug fixes. Notably, it also
+drops the `thread_local` (and `lazy_static`, via transitivity) dependencies.
+
+Bug fixes:
+
+* [BUG #362](https://github.com/rust-lang/regex/pull/362):
+ Memory leaks caused by an internal caching strategy should now be fixed.
+* [BUG #576](https://github.com/rust-lang/regex/pull/576):
+ All regex types now implement `UnwindSafe` and `RefUnwindSafe`.
+* [BUG #728](https://github.com/rust-lang/regex/pull/749):
+ Add missing `Replacer` impls for `Vec<u8>`, `String`, `Cow`, etc.
+
+
+1.4.3 (2021-01-08)
+==================
+This is a small patch release that adds some missing standard trait
+implementations for some types in the public API.
+
+Bug fixes:
+
+* [BUG #734](https://github.com/rust-lang/regex/pull/734):
+ Add `FusedIterator` and `ExactSizeIterator` impls to iterator types.
+* [BUG #735](https://github.com/rust-lang/regex/pull/735):
+ Add missing `Debug` impls to public API types.
+
+
+1.4.2 (2020-11-01)
+==================
+This is a small bug fix release that bans `\P{any}`. We previously banned empty
+classes like `[^\w\W]`, but missed the `\P{any}` case. In the future, we hope
+to permit empty classes.
+
+* [BUG #722](https://github.com/rust-lang/regex/issues/722):
+ Ban `\P{any}` to avoid a panic in the regex compiler. Found by OSS-Fuzz.
+
+
+1.4.1 (2020-10-13)
+==================
+This is a small bug fix release that makes `\p{cf}` work. Previously, it would
+report "property not found" even though `cf` is a valid abbreviation for the
+`Format` general category.
+
+* [BUG #719](https://github.com/rust-lang/regex/issues/719):
+ Fixes bug that prevented `\p{cf}` from working.
+
+
+1.4.0 (2020-10-11)
+==================
+This release has a few minor documentation fixes as well as some very minor
+API additions. The MSRV remains at Rust 1.28 for now, but this is intended to
+increase to at least Rust 1.41.1 soon.
+
+This release also adds support for OSS-Fuzz. Kudos to
+[@DavidKorczynski](https://github.com/DavidKorczynski)
+for doing the heavy lifting for that!
+
+New features:
+
+* [FEATURE #649](https://github.com/rust-lang/regex/issues/649):
+ Support `[`, `]` and `.` in capture group names.
+* [FEATURE #687](https://github.com/rust-lang/regex/issues/687):
+ Add `is_empty` predicate to `RegexSet`.
+* [FEATURE #689](https://github.com/rust-lang/regex/issues/689):
+ Implement `Clone` for `SubCaptureMatches`.
+* [FEATURE #715](https://github.com/rust-lang/regex/issues/715):
+ Add `empty` constructor to `RegexSet` for convenience.
+
+Bug fixes:
+
+* [BUG #694](https://github.com/rust-lang/regex/issues/694):
+ Fix doc example for `Replacer::replace_append`.
+* [BUG #698](https://github.com/rust-lang/regex/issues/698):
+ Clarify docs for `s` flag when using a `bytes::Regex`.
+* [BUG #711](https://github.com/rust-lang/regex/issues/711):
+ Clarify `is_match` docs to indicate that it can match anywhere in string.
+
+
+1.3.9 (2020-05-28)
+==================
+This release fixes a MSRV (Minimum Supported Rust Version) regression in the
+1.3.8 release. Namely, while 1.3.8 compiles on Rust 1.28, it actually does not
+compile on other Rust versions, such as Rust 1.39.
+
+Bug fixes:
+
+* [BUG #685](https://github.com/rust-lang/regex/issues/685):
+ Remove use of `doc_comment` crate, which cannot be used before Rust 1.43.
+
+
+1.3.8 (2020-05-28)
+==================
+This release contains a couple of important bug fixes driven
+by better support for empty-subexpressions in regexes. For
+example, regexes like `b|` are now allowed. Major thanks to
+[@sliquister](https://github.com/sliquister) for implementing support for this
+in [#677](https://github.com/rust-lang/regex/pull/677).
+
+Bug fixes:
+
+* [BUG #523](https://github.com/rust-lang/regex/pull/523):
+ Add note to documentation that spaces can be escaped in `x` mode.
+* [BUG #524](https://github.com/rust-lang/regex/issues/524):
+ Add support for empty sub-expressions, including empty alternations.
+* [BUG #659](https://github.com/rust-lang/regex/issues/659):
+ Fix match bug caused by an empty sub-expression miscompilation.
+
+
+1.3.7 (2020-04-17)
+==================
+This release contains a small bug fix that fixes how `regex` forwards crate
+features to `regex-syntax`. In particular, this will reduce recompilations in
+some cases.
+
+Bug fixes:
+
+* [BUG #665](https://github.com/rust-lang/regex/pull/665):
+ Fix feature forwarding to `regex-syntax`.
+
+
+1.3.6 (2020-03-24)
+==================
+This release contains a sizable (~30%) performance improvement when compiling
+some kinds of large regular expressions.
+
+Performance improvements:
+
+* [PERF #657](https://github.com/rust-lang/regex/pull/657):
+ Improve performance of compiling large regular expressions.
+
+
+1.3.5 (2020-03-12)
+==================
+This release updates this crate to Unicode 13.
+
+New features:
+
+* [FEATURE #653](https://github.com/rust-lang/regex/pull/653):
+ Update `regex-syntax` to Unicode 13.
+
+
+1.3.4 (2020-01-30)
+==================
+This is a small bug fix release that fixes a bug related to the scoping of
+flags in a regex. Namely, before this fix, a regex like `((?i)a)b` would
+match `aB` despite the fact that `b` should not be matched case insensitively.
+
+Bug fixes:
+
+* [BUG #640](https://github.com/rust-lang/regex/issues/640):
+ Fix bug related to the scoping of flags in a regex.
+
+
+1.3.3 (2020-01-09)
+==================
+This is a small maintenance release that upgrades the dependency on
+`thread_local` from `0.3` to `1.0`. The minimum supported Rust version remains
+at Rust 1.28.
+
+
+1.3.2 (2020-01-09)
+==================
+This is a small maintenance release with some house cleaning and bug fixes.
+
+New features:
+
+* [FEATURE #631](https://github.com/rust-lang/regex/issues/631):
+ Add a `Match::range` method and a `From<Match> for Range` impl.
+
+Bug fixes:
+
+* [BUG #521](https://github.com/rust-lang/regex/issues/521):
+ Corrects `/-/.splitn("a", 2)` to return `["a"]` instead of `["a", ""]`.
+* [BUG #594](https://github.com/rust-lang/regex/pull/594):
+ Improve error reporting when writing `\p\`.
+* [BUG #627](https://github.com/rust-lang/regex/issues/627):
+ Corrects `/-/.split("a-")` to return `["a", ""]` instead of `["a"]`.
+* [BUG #633](https://github.com/rust-lang/regex/pull/633):
+ Squash deprecation warnings for the `std::error::Error::description` method.
+
+
+1.3.1 (2019-09-04)
+==================
+This is a maintenance release with no changes in order to try to work-around
+a [docs.rs/Cargo issue](https://github.com/rust-lang/docs.rs/issues/400).
+
+
+1.3.0 (2019-09-03)
+==================
+This release adds a plethora of new crate features that permit users of regex
+to shrink its size considerably, in exchange for giving up either functionality
+(such as Unicode support) or runtime performance. When all such features are
+disabled, the dependency tree for `regex` shrinks to exactly 1 crate
+(`regex-syntax`). More information about the new crate features can be
+[found in the docs](https://docs.rs/regex/*/#crate-features).
+
+Note that while this is a new minor version release, the minimum supported
+Rust version for this crate remains at `1.28.0`.
+
+New features:
+
+* [FEATURE #474](https://github.com/rust-lang/regex/issues/474):
+ The `use_std` feature has been deprecated in favor of the `std` feature.
+ The `use_std` feature will be removed in regex 2. Until then, `use_std` will
+ remain as an alias for the `std` feature.
+* [FEATURE #583](https://github.com/rust-lang/regex/issues/583):
+ Add a substantial number of crate features shrinking `regex`.
+
+
+1.2.1 (2019-08-03)
+==================
+This release does a bit of house cleaning. Namely:
+
+* This repository is now using rustfmt.
+* License headers have been removed from all files, in following suit with the
+ Rust project.
+* Teddy has been removed from the `regex` crate, and is now part of the
+ `aho-corasick` crate.
+ [See `aho-corasick`'s new `packed` sub-module for details](https://docs.rs/aho-corasick/0.7.6/aho_corasick/packed/index.html).
+* The `utf8-ranges` crate has been deprecated, with its functionality moving
+ into the
+ [`utf8` sub-module of `regex-syntax`](https://docs.rs/regex-syntax/0.6.11/regex_syntax/utf8/index.html).
+* The `ucd-util` dependency has been dropped, in favor of implementing what
+ little we need inside of `regex-syntax` itself.
+
+In general, this is part of an ongoing (long term) effort to make optimizations
+in the regex engine easier to reason about. The current code is too convoluted
+and thus it is very easy to introduce new bugs. This simplification effort is
+the primary motivation behind re-working the `aho-corasick` crate to not only
+bundle algorithms like Teddy, but to also provide regex-like match semantics
+automatically.
+
+Moving forward, the plan is to join up with the `bstr` and `regex-automata`
+crates, with the former providing more sophisticated substring search
+algorithms (thereby deleting existing code in `regex`) and the latter providing
+ahead-of-time compiled DFAs for cases where they are inexpensive to compute.
+
+
+1.2.0 (2019-07-20)
+==================
+This release updates regex's minimum supported Rust version to 1.28, which was
+released almost 1 year ago. This release also updates regex's Unicode data
+tables to 12.1.0.
+
+
+1.1.9 (2019-07-06)
+==================
+This release contains a bug fix that caused regex's tests to fail, due to a
+dependency on an unreleased behavior in regex-syntax.
+
+* [BUG #593](https://github.com/rust-lang/regex/issues/593):
+ Move an integration-style test on error messages into regex-syntax.
+
+
+1.1.8 (2019-07-04)
+==================
+This release contains a few small internal refactorings. One of which fixes
+an instance of undefined behavior in a part of the SIMD code.
+
+Bug fixes:
+
+* [BUG #545](https://github.com/rust-lang/regex/issues/545):
+ Improves error messages when a repetition operator is used without a number.
+* [BUG #588](https://github.com/rust-lang/regex/issues/588):
+ Removes use of a repr(Rust) union used for type punning in the Teddy matcher.
+* [BUG #591](https://github.com/rust-lang/regex/issues/591):
+ Update docs for running benchmarks and improve failure modes.
+
+
+1.1.7 (2019-06-09)
+==================
+This release fixes up a few warnings as a result of recent deprecations.
+
+
+1.1.6 (2019-04-16)
+==================
+This release fixes a regression introduced by a bug fix (for
+[BUG #557](https://github.com/rust-lang/regex/issues/557)) which could cause
+the regex engine to enter an infinite loop. This bug was originally
+[reported against ripgrep](https://github.com/BurntSushi/ripgrep/issues/1247).
+
+
+1.1.5 (2019-04-01)
+==================
+This release fixes a bug in regex's dependency specification where it requires
+a newer version of regex-syntax, but this wasn't communicated correctly in the
+Cargo.toml. This would have been caught by a minimal version check, but this
+check was disabled because the `rand` crate itself advertises incorrect
+dependency specifications.
+
+Bug fixes:
+
+* [BUG #570](https://github.com/rust-lang/regex/pull/570):
+ Fix regex-syntax minimal version.
+
+
+1.1.4 (2019-03-31)
+==================
+This release fixes a backwards compatibility regression where Regex was no
+longer UnwindSafe. This was caused by the upgrade to aho-corasick 0.7, whose
+AhoCorasick type was itself not UnwindSafe. This has been fixed in aho-corasick
+0.7.4, which we now require.
+
+Bug fixes:
+
+* [BUG #568](https://github.com/rust-lang/regex/pull/568):
+ Fix an API regression where Regex was no longer UnwindSafe.
+
+
+1.1.3 (2019-03-30)
+==================
+This release fixes a few bugs and adds a performance improvement when a regex
+is a simple alternation of literals.
+
+Performance improvements:
+
+* [OPT #566](https://github.com/rust-lang/regex/pull/566):
+ Upgrades `aho-corasick` to 0.7 and uses it for `foo|bar|...|quux` regexes.
+
+Bug fixes:
+
+* [BUG #527](https://github.com/rust-lang/regex/issues/527):
+ Fix a bug where the parser would panic on patterns like `((?x))`.
+* [BUG #555](https://github.com/rust-lang/regex/issues/555):
+ Fix a bug where the parser would panic on patterns like `(?m){1,1}`.
+* [BUG #557](https://github.com/rust-lang/regex/issues/557):
+ Fix a bug where captures could lead to an incorrect match.
+
+
+1.1.2 (2019-02-27)
+==================
+This release fixes a bug found in the fix introduced in 1.1.1.
+
+Bug fixes:
+
+* [BUG edf45e6f](https://github.com/rust-lang/regex/commit/edf45e6f):
+ Fix bug introduced in reverse suffix literal matcher in the 1.1.1 release.
+
+
+1.1.1 (2019-02-27)
+==================
+This is a small release with one fix for a bug caused by literal optimizations.
+
+Bug fixes:
+
+* [BUG 661bf53d](https://github.com/rust-lang/regex/commit/661bf53d):
+ Fixes a bug in the reverse suffix literal optimization. This was originally
+ reported
+ [against ripgrep](https://github.com/BurntSushi/ripgrep/issues/1203).
+
+
+1.1.0 (2018-11-30)
+==================
+This is a small release with a couple small enhancements. This release also
+increases the minimal supported Rust version (MSRV) to 1.24.1 (from 1.20.0). In
+accordance with this crate's MSRV policy, this release bumps the minor version
+number.
+
+Performance improvements:
+
+* [OPT #511](https://github.com/rust-lang/regex/pull/511),
+ [OPT #540](https://github.com/rust-lang/regex/pull/540):
+ Improve lazy DFA construction for large regex sets.
+
+New features:
+
+* [FEATURE #538](https://github.com/rust-lang/regex/pull/538):
+ Add Emoji and "break" Unicode properties. See [UNICODE.md](UNICODE.md).
+
+Bug fixes:
+
+* [BUG #530](https://github.com/rust-lang/regex/pull/530):
+ Add Unicode license (for data tables).
+* Various typo/doc fixups.
+
+
+1.0.6 (2018-11-06)
+==================
+This is a small release.
+
+Performance improvements:
+
+* [OPT #513](https://github.com/rust-lang/regex/pull/513):
+ Improve performance of compiling large Unicode classes by 8-10%.
+
+Bug fixes:
+
+* [BUG #533](https://github.com/rust-lang/regex/issues/533):
+ Fix definition of `[[:blank:]]` class that regressed in `regex-syntax 0.5`.
+
+
+1.0.5 (2018-09-06)
+==================
+This is a small release with an API enhancement.
+
+New features:
+
+* [FEATURE #509](https://github.com/rust-lang/regex/pull/509):
+ Generalize impls of the `Replacer` trait.
+
+
+1.0.4 (2018-08-25)
+==================
+This is a small release that bumps the quickcheck dependency.
+
+
+1.0.3 (2018-08-24)
+==================
+This is a small bug fix release.
+
+Bug fixes:
+
+* [BUG #504](https://github.com/rust-lang/regex/pull/504):
+ Fix for Cargo's "minimal version" support.
+* [BUG 1e39165f](https://github.com/rust-lang/regex/commit/1e39165f):
+ Fix doc examples for byte regexes.
+
+
+1.0.2 (2018-07-18)
+==================
+This release exposes some new lower level APIs on `Regex` that permit
+amortizing allocation and controlling the location at which a search is
+performed in a more granular way. Most users of the regex crate will not
+need or want to use these APIs.
+
+New features:
+
+* [FEATURE #493](https://github.com/rust-lang/regex/pull/493):
+ Add a few lower level APIs for amortizing allocation and more fine grained
+ searching.
+
+Bug fixes:
+
+* [BUG 3981d2ad](https://github.com/rust-lang/regex/commit/3981d2ad):
+ Correct outdated documentation on `RegexBuilder::dot_matches_new_line`.
+* [BUG 7ebe4ae0](https://github.com/rust-lang/regex/commit/7ebe4ae0):
+ Correct outdated documentation on `Parser::allow_invalid_utf8` in the
+ `regex-syntax` crate.
+* [BUG 24c7770b](https://github.com/rust-lang/regex/commit/24c7770b):
+ Fix a bug in the HIR printer where it wouldn't correctly escape meta
+ characters in character classes.
+
+
+1.0.1 (2018-06-19)
+==================
+This release upgrades regex's Unicode tables to Unicode 11, and enables SIMD
+optimizations automatically on Rust stable (1.27 or newer).
+
+New features:
+
+* [FEATURE #486](https://github.com/rust-lang/regex/pull/486):
+ Implement `size_hint` on `RegexSet` match iterators.
+* [FEATURE #488](https://github.com/rust-lang/regex/pull/488):
+ Update Unicode tables for Unicode 11.
+* [FEATURE #490](https://github.com/rust-lang/regex/pull/490):
+ SIMD optimizations are now enabled automatically in Rust stable, for versions
+ 1.27 and up. No compilation flags or features need to be set. CPU support
+ for SIMD is detected automatically at runtime.
+
+Bug fixes:
+
+* [BUG #482](https://github.com/rust-lang/regex/pull/482):
+ Present a better compilation error when the `use_std` feature isn't used.
+
+
+1.0.0 (2018-05-01)
+==================
+This release marks the 1.0 release of regex.
+
+While this release includes some breaking changes, most users of older versions
+of the regex library should be able to migrate to 1.0 by simply bumping the
+version number. The important changes are as follows:
+
+* We adopt Rust 1.20 as the new minimum supported version of Rust for regex.
+ We also tentatively adopt a policy that permits bumping the minimum supported
+ version of Rust in minor version releases of regex, but no patch releases.
+ That is, with respect to semver, we do not strictly consider bumping the
+ minimum version of Rust to be a breaking change, but adopt a conservative
+ stance as a compromise.
+* Octal syntax in regular expressions has been disabled by default. This
+ permits better error messages that inform users that backreferences aren't
+ available. Octal syntax can be re-enabled via the corresponding option on
+ `RegexBuilder`.
+* `(?-u:\B)` is no longer allowed in Unicode regexes since it can match at
+ invalid UTF-8 code unit boundaries. `(?-u:\b)` is still allowed in Unicode
+ regexes.
+* The `From<regex_syntax::Error>` impl has been removed. This formally removes
+ the public dependency on `regex-syntax`.
+* A new feature, `use_std`, has been added and enabled by default. Disabling
+ the feature will result in a compilation error. In the future, this may
+ permit us to support `no_std` environments (w/ `alloc`) in a backwards
+ compatible way.
+
+For more information and discussion, please see
+[1.0 release tracking issue](https://github.com/rust-lang/regex/issues/457).
+
+
+0.2.11 (2018-05-01)
+===================
+This release primarily contains bug fixes. Some of them resolve bugs where
+the parser could panic.
+
+New features:
+
+* [FEATURE #459](https://github.com/rust-lang/regex/pull/459):
+ Include C++'s standard regex library and Boost's regex library in the
+ benchmark harness. We now include D/libphobos, C++/std, C++/boost, Oniguruma,
+ PCRE1, PCRE2, RE2 and Tcl in the harness.
+
+Bug fixes:
+
+* [BUG #445](https://github.com/rust-lang/regex/issues/445):
+ Clarify order of indices returned by RegexSet match iterator.
+* [BUG #461](https://github.com/rust-lang/regex/issues/461):
+ Improve error messages for invalid regexes like `[\d-a]`.
+* [BUG #464](https://github.com/rust-lang/regex/issues/464):
+ Fix a bug in the error message pretty printer that could cause a panic when
+ a regex contained a literal `\n` character.
+* [BUG #465](https://github.com/rust-lang/regex/issues/465):
+ Fix a panic in the parser that was caused by applying a repetition operator
+ to `(?flags)`.
+* [BUG #466](https://github.com/rust-lang/regex/issues/466):
+ Fix a bug where `\pC` was not recognized as an alias for `\p{Other}`.
+* [BUG #470](https://github.com/rust-lang/regex/pull/470):
+ Fix a bug where literal searches did more work than necessary for anchored
+ regexes.
+
+
+0.2.10 (2018-03-16)
+===================
+This release primarily updates the regex crate to changes made in `std::arch`
+on nightly Rust.
+
+New features:
+
+* [FEATURE #458](https://github.com/rust-lang/regex/pull/458):
+ The `Hir` type in `regex-syntax` now has a printer.
+
+
+0.2.9 (2018-03-12)
+==================
+This release introduces a new nightly only feature, `unstable`, which enables
+SIMD optimizations for certain types of regexes. No additional compile time
+options are necessary, and the regex crate will automatically choose the
+best CPU features at run time. As a result, the `simd` (nightly only) crate
+dependency has been dropped.
+
+New features:
+
+* [FEATURE #456](https://github.com/rust-lang/regex/pull/456):
+ The regex crate now includes AVX2 optimizations in addition to the extant
+ SSSE3 optimization.
+
+Bug fixes:
+
+* [BUG #455](https://github.com/rust-lang/regex/pull/455):
+ Fix a bug where `(?x)[ / - ]` failed to parse.
+
+
+0.2.8 (2018-03-12)
+==================
+Bug fixes:
+
+* [BUG #454](https://github.com/rust-lang/regex/pull/454):
+ Fix a bug in the nest limit checker being too aggressive.
+
+
+0.2.7 (2018-03-07)
+==================
+This release includes a ground-up rewrite of the regex-syntax crate, which has
+been in development for over a year.
+
+New features:
+
+* Error messages for invalid regexes have been greatly improved. You get these
+ automatically; you don't need to do anything. In addition to better
+ formatting, error messages will now explicitly call out the use of look
+ around. When regex 1.0 is released, this will happen for backreferences as
+ well.
+* Full support for intersection, difference and symmetric difference of
+ character classes. These can be used via the `&&`, `--` and `~~` binary
+ operators within classes.
+* A Unicode Level 1 conformant implementation of `\p{..}` character classes.
+ Things like `\p{scx:Hira}`, `\p{age:3.2}` or `\p{Changes_When_Casefolded}`
+ now work. All property name and value aliases are supported, and properties
+ are selected via loose matching. e.g., `\p{Greek}` is the same as
+ `\p{G r E e K}`.
+* A new `UNICODE.md` document has been added to this repository that
+ exhaustively documents support for UTS#18.
+* Empty sub-expressions are now permitted in most places. That is, `()+` is
+ now a valid regex.
+* Almost everything in regex-syntax now uses constant stack space, even when
+ performing analysis that requires structural induction. This reduces the risk
+ of a user provided regular expression causing a stack overflow.
+* [FEATURE #174](https://github.com/rust-lang/regex/issues/174):
+ The `Ast` type in `regex-syntax` now contains span information.
+* [FEATURE #424](https://github.com/rust-lang/regex/issues/424):
+ Support `\u`, `\u{...}`, `\U` and `\U{...}` syntax for specifying code points
+ in a regular expression.
+* [FEATURE #449](https://github.com/rust-lang/regex/pull/449):
+ Add a `Replace::by_ref` adapter for use of a replacer without consuming it.
+
+Bug fixes:
+
+* [BUG #446](https://github.com/rust-lang/regex/issues/446):
+ We re-enable the Boyer-Moore literal matcher.
+
+
+0.2.6 (2018-02-08)
+==================
+Bug fixes:
+
+* [BUG #446](https://github.com/rust-lang/regex/issues/446):
+ Fixes a bug in the new Boyer-Moore searcher that results in a match failure.
+ We fix this bug by temporarily disabling Boyer-Moore.
+
+
+0.2.5 (2017-12-30)
+==================
+Bug fixes:
+
+* [BUG #437](https://github.com/rust-lang/regex/issues/437):
+ Fixes a bug in the new Boyer-Moore searcher that results in a panic.
+
+
+0.2.4 (2017-12-30)
+==================
+New features:
+
+* [FEATURE #348](https://github.com/rust-lang/regex/pull/348):
+ Improve performance for capture searches on anchored regex.
+ (Contributed by @ethanpailes. Nice work!)
+* [FEATURE #419](https://github.com/rust-lang/regex/pull/419):
+ Expand literal searching to include Tuned Boyer-Moore in some cases.
+ (Contributed by @ethanpailes. Nice work!)
+
+Bug fixes:
+
+* [BUG](https://github.com/rust-lang/regex/pull/436):
+ The regex compiler plugin has been removed.
+* [BUG](https://github.com/rust-lang/regex/pull/436):
+ `simd` has been bumped to `0.2.1`, which fixes a Rust nightly build error.
+* [BUG](https://github.com/rust-lang/regex/pull/436):
+ Bring the benchmark harness up to date.
+
+
+0.2.3 (2017-11-30)
+==================
+New features:
+
+* [FEATURE #374](https://github.com/rust-lang/regex/pull/374):
+ Add `impl From<Match> for &str`.
+* [FEATURE #380](https://github.com/rust-lang/regex/pull/380):
+ Derive `Clone` and `PartialEq` on `Error`.
+* [FEATURE #400](https://github.com/rust-lang/regex/pull/400):
+ Update to Unicode 10.
+
+Bug fixes:
+
+* [BUG #375](https://github.com/rust-lang/regex/issues/375):
+ Fix a bug that prevented the bounded backtracker from terminating.
+* [BUG #393](https://github.com/rust-lang/regex/issues/393),
+ [BUG #394](https://github.com/rust-lang/regex/issues/394):
+ Fix bug with `replace` methods for empty matches.
+
+
+0.2.2 (2017-05-21)
+==================
+New features:
+
+* [FEATURE #341](https://github.com/rust-lang/regex/issues/341):
+ Support nested character classes and intersection operation.
+ For example, `[\p{Greek}&&\pL]` matches greek letters and
+ `[[0-9]&&[^4]]` matches every decimal digit except `4`.
+ (Much thanks to @robinst, who contributed this awesome feature.)
+
+Bug fixes:
+
+* [BUG #321](https://github.com/rust-lang/regex/issues/321):
+ Fix bug in literal extraction and UTF-8 decoding.
+* [BUG #326](https://github.com/rust-lang/regex/issues/326):
+ Add documentation tip about the `(?x)` flag.
+* [BUG #333](https://github.com/rust-lang/regex/issues/333):
+ Show additional replacement example using curly braces.
+* [BUG #334](https://github.com/rust-lang/regex/issues/334):
+ Fix bug when resolving captures after a match.
+* [BUG #338](https://github.com/rust-lang/regex/issues/338):
+ Add example that uses `Captures::get` to API documentation.
+* [BUG #353](https://github.com/rust-lang/regex/issues/353):
+ Fix RegexSet bug that caused match failure in some cases.
+* [BUG #354](https://github.com/rust-lang/regex/pull/354):
+ Fix panic in parser when `(?x)` is used.
+* [BUG #358](https://github.com/rust-lang/regex/issues/358):
+ Fix literal optimization bug with RegexSet.
+* [BUG #359](https://github.com/rust-lang/regex/issues/359):
+ Fix example code in README.
+* [BUG #365](https://github.com/rust-lang/regex/pull/365):
+ Fix bug in `rure_captures_len` in the C binding.
+* [BUG #367](https://github.com/rust-lang/regex/issues/367):
+ Fix byte class bug that caused a panic.
+
+
+0.2.1
+=====
+One major bug with `replace_all` has been fixed along with a couple of other
+touchups.
+
+* [BUG #312](https://github.com/rust-lang/regex/issues/312):
+ Fix documentation for `NoExpand` to reference correct lifetime parameter.
+* [BUG #314](https://github.com/rust-lang/regex/issues/314):
+ Fix a bug with `replace_all` when replacing a match with the empty string.
+* [BUG #316](https://github.com/rust-lang/regex/issues/316):
+ Note a missing breaking change from the `0.2.0` CHANGELOG entry.
+ (`RegexBuilder::compile` was renamed to `RegexBuilder::build`.)
+* [BUG #324](https://github.com/rust-lang/regex/issues/324):
+ Compiling `regex` should only require one version of `memchr` crate.
+
+
+0.2.0
+=====
+This is a new major release of the regex crate, and is an implementation of the
+[regex 1.0 RFC](https://github.com/rust-lang/rfcs/blob/master/text/1620-regex-1.0.md).
+We are releasing a `0.2` first, and if there are no major problems, we will
+release a `1.0` shortly. For `0.2`, the minimum *supported* Rust version is
+1.12.
+
+There are a number of **breaking changes** in `0.2`. They are split into two
+types. The first type corresponds to breaking changes in regular expression
+syntax. The second type corresponds to breaking changes in the API.
+
+Breaking changes for regex syntax:
+
+* POSIX character classes now require double bracketing. Previously, the regex
+ `[:upper:]` would parse as the `upper` POSIX character class. Now it parses
+ as the character class containing the characters `:upper:`. The fix to this
+ change is to use `[[:upper:]]` instead. Note that variants like
+ `[[:upper:][:blank:]]` continue to work.
+* The character `[` must always be escaped inside a character class.
+* The characters `&`, `-` and `~` must be escaped if any one of them is
+ repeated consecutively. For example, `[&]`, `[\&]`, `[\&\&]`, `[&-&]` are all
+ equivalent while `[&&]` is illegal. (The motivation for this and the prior
+ change is to provide a backwards compatible path for adding character class
+ set notation.)
+* A `bytes::Regex` now has Unicode mode enabled by default (like the main
+ `Regex` type). This means regexes compiled with `bytes::Regex::new` that
+ don't have the Unicode flag set should add `(?-u)` to recover the original
+ behavior.
+
+Breaking changes for the regex API:
+
+* `find` and `find_iter` now **return `Match` values instead of
+ `(usize, usize)`.** `Match` values have `start` and `end` methods, which
+ return the match offsets. `Match` values also have an `as_str` method,
+ which returns the text of the match itself.
+* The `Captures` type now only provides a single iterator over all capturing
+ matches, which should replace uses of `iter` and `iter_pos`. Uses of
+ `iter_named` should use the `capture_names` method on `Regex`.
+* The `at` method on the `Captures` type has been renamed to `get`, and it
+ now returns a `Match`. Similarly, the `name` method on `Captures` now returns
+ a `Match`.
+* The `replace` methods now return `Cow` values. The `Cow::Borrowed` variant
+ is returned when no replacements are made.
+* The `Replacer` trait has been completely overhauled. This should only
+ impact clients that implement this trait explicitly. Standard uses of
+ the `replace` methods should continue to work unchanged. If you implement
+ the `Replacer` trait, please consult the new documentation.
+* The `quote` free function has been renamed to `escape`.
+* The `Regex::with_size_limit` method has been removed. It is replaced by
+ `RegexBuilder::size_limit`.
+* The `RegexBuilder` type has switched from owned `self` method receivers to
+ `&mut self` method receivers. Most uses will continue to work unchanged, but
+ some code may require naming an intermediate variable to hold the builder.
+* The `compile` method on `RegexBuilder` has been renamed to `build`.
+* The free `is_match` function has been removed. It is replaced by compiling
+ a `Regex` and calling its `is_match` method.
+* The `PartialEq` and `Eq` impls on `Regex` have been dropped. If you relied
+ on these impls, the fix is to define a wrapper type around `Regex`, impl
+ `Deref` on it and provide the necessary impls.
+* The `is_empty` method on `Captures` has been removed. This always returns
+ `false`, so its use is superfluous.
+* The `Syntax` variant of the `Error` type now contains a string instead of
+ a `regex_syntax::Error`. If you were examining syntax errors more closely,
+ you'll need to explicitly use the `regex_syntax` crate to re-parse the regex.
+* The `InvalidSet` variant of the `Error` type has been removed since it is
+ no longer used.
+* Most of the iterator types have been renamed to match conventions. If you
+ were using these iterator types explicitly, please consult the documentation
+ for its new name. For example, `RegexSplits` has been renamed to `Split`.
+
+A number of bugs have been fixed:
+
+* [BUG #151](https://github.com/rust-lang/regex/issues/151):
+ The `Replacer` trait has been changed to permit the caller to control
+ allocation.
+* [BUG #165](https://github.com/rust-lang/regex/issues/165):
+ Remove the free `is_match` function.
+* [BUG #166](https://github.com/rust-lang/regex/issues/166):
+ Expose more knobs (available in `0.1`) and remove `with_size_limit`.
+* [BUG #168](https://github.com/rust-lang/regex/issues/168):
+ Iterators produced by `Captures` now have the correct lifetime parameters.
+* [BUG #175](https://github.com/rust-lang/regex/issues/175):
+ Fix a corner case in the parsing of POSIX character classes.
+* [BUG #178](https://github.com/rust-lang/regex/issues/178):
+ Drop the `PartialEq` and `Eq` impls on `Regex`.
+* [BUG #179](https://github.com/rust-lang/regex/issues/179):
+ Remove `is_empty` from `Captures` since it always returns false.
+* [BUG #276](https://github.com/rust-lang/regex/issues/276):
+ Position of named capture can now be retrieved from a `Captures`.
+* [BUG #296](https://github.com/rust-lang/regex/issues/296):
+ Remove winapi/kernel32-sys dependency on UNIX.
+* [BUG #307](https://github.com/rust-lang/regex/issues/307):
+ Fix error on emscripten.
+
+
+0.1.80
+======
+* [PR #292](https://github.com/rust-lang/regex/pull/292):
+ Fixes bug #291, which was introduced by PR #290.
+
+0.1.79
+======
+* Require regex-syntax 0.3.8.
+
+0.1.78
+======
+* [PR #290](https://github.com/rust-lang/regex/pull/290):
+ Fixes bug #289, which caused some regexes with a certain combination
+ of literals to match incorrectly.
+
+0.1.77
+======
+* [PR #281](https://github.com/rust-lang/regex/pull/281):
+ Fixes bug #280 by disabling all literal optimizations when a pattern
+ is partially anchored.
+
+0.1.76
+======
+* Tweak criteria for using the Teddy literal matcher.
+
+0.1.75
+======
+* [PR #275](https://github.com/rust-lang/regex/pull/275):
+ Improves match verification performance in the Teddy SIMD searcher.
+* [PR #278](https://github.com/rust-lang/regex/pull/278):
+ Replaces slow substring loop in the Teddy SIMD searcher with Aho-Corasick.
+* Implemented DoubleEndedIterator on regex set match iterators.
+
+0.1.74
+======
+* Release regex-syntax 0.3.5 with a minor bug fix.
+* Fix bug #272.
+* Fix bug #277.
+* [PR #270](https://github.com/rust-lang/regex/pull/270):
+ Fixes bugs #264, #268 and an unreported bug where the DFA cache size could be
+ drastically underestimated in some cases (leading to unexpectedly high memory
+ usage).
+
+0.1.73
+======
+* Release `regex-syntax 0.3.4`.
+* Bump `regex-syntax` dependency version for `regex` to `0.3.4`.
+
+0.1.72
+======
+* [PR #262](https://github.com/rust-lang/regex/pull/262):
+ Fixes a number of small bugs caught by fuzz testing (AFL).
+
+0.1.71
+======
+* [PR #236](https://github.com/rust-lang/regex/pull/236):
+ Fix a bug in how suffix literals were extracted, which could lead
+ to invalid match behavior in some cases.
+
+0.1.70
+======
+* [PR #231](https://github.com/rust-lang/regex/pull/231):
+ Add SIMD accelerated multiple pattern search.
+* [PR #228](https://github.com/rust-lang/regex/pull/228):
+ Reintroduce the reverse suffix literal optimization.
+* [PR #226](https://github.com/rust-lang/regex/pull/226):
+ Implements NFA state compression in the lazy DFA.
+* [PR #223](https://github.com/rust-lang/regex/pull/223):
+ A fully anchored RegexSet can now short-circuit.
+
+0.1.69
+======
+* [PR #216](https://github.com/rust-lang/regex/pull/216):
+ Tweak the threshold for running backtracking.
+* [PR #217](https://github.com/rust-lang/regex/pull/217):
+ Add upper limit (from the DFA) to capture search (for the NFA).
+* [PR #218](https://github.com/rust-lang/regex/pull/218):
+ Add rure, a C API.
+
+0.1.68
+======
+* [PR #210](https://github.com/rust-lang/regex/pull/210):
+ Fixed a performance bug in `bytes::Regex::replace` where `extend` was used
+ instead of `extend_from_slice`.
+* [PR #211](https://github.com/rust-lang/regex/pull/211):
+ Fixed a bug in the handling of word boundaries in the DFA.
+* [PR #213](https://github.com/rust-lang/regex/pull/213):
+ Added RE2 and Tcl to the benchmark harness. Also added a CLI utility for
+ running regexes using any of the following regex engines: PCRE1, PCRE2,
+ Oniguruma, RE2, Tcl and of course Rust's own regexes.
+
+0.1.67
+======
+* [PR #201](https://github.com/rust-lang/regex/pull/201):
+ Fix undefined behavior in the `regex!` compiler plugin macro.
+* [PR #205](https://github.com/rust-lang/regex/pull/205):
+ More improvements to DFA performance. Competitive with RE2. See PR for
+ benchmarks.
+* [PR #209](https://github.com/rust-lang/regex/pull/209):
+ Release 0.1.66 was semver incompatible since it required a newer version
+ of Rust than previous releases. This PR fixes that. (And `0.1.66` was
+ yanked.)
+
+0.1.66
+======
+* Speculative support for Unicode word boundaries was added to the DFA. This
+ should remove the last common case that disqualified use of the DFA.
+* An optimization that scanned for suffix literals and then matched the regular
+ expression in reverse was removed because it had worst case quadratic time
+ complexity. It was replaced with a more limited optimization where, given any
+ regex of the form `re$`, it will be matched in reverse from the end of the
+ haystack.
+* [PR #202](https://github.com/rust-lang/regex/pull/202):
+ The inner loop of the DFA was heavily optimized to improve cache locality
+ and reduce the overall number of instructions run on each iteration. This
+ represents the first use of `unsafe` in `regex` (to elide bounds checks).
+* [PR #200](https://github.com/rust-lang/regex/pull/200):
+ Use of the `mempool` crate (which used thread local storage) was replaced
+ with a faster version of a similar API in @Amanieu's `thread_local` crate.
+ It should reduce contention when using a regex from multiple threads
+ simultaneously.
+* PCRE2 JIT benchmarks were added. A benchmark comparison can be found
+ [here](https://gist.github.com/anonymous/14683c01993e91689f7206a18675901b).
+ (Includes a comparison with PCRE1's JIT and Oniguruma.)
+* A bug where word boundaries weren't being matched correctly in the DFA was
+ fixed. This only affected use of `bytes::Regex`.
+* [#160](https://github.com/rust-lang/regex/issues/160):
+ `Captures` now has a `Debug` impl.
diff --git a/third_party/rust/regex/Cargo.lock b/third_party/rust/regex/Cargo.lock
new file mode 100644
index 0000000000..031b3647e4
--- /dev/null
+++ b/third_party/rust/regex/Cargo.lock
@@ -0,0 +1,98 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "0.7.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "getrandom"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.139"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
+[[package]]
+name = "quickcheck"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
+dependencies = [
+ "rand",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "regex"
+version = "1.7.1"
+dependencies = [
+ "aho-corasick",
+ "lazy_static",
+ "memchr",
+ "quickcheck",
+ "rand",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
diff --git a/third_party/rust/regex/Cargo.toml b/third_party/rust/regex/Cargo.toml
new file mode 100644
index 0000000000..4f8673ea94
--- /dev/null
+++ b/third_party/rust/regex/Cargo.toml
@@ -0,0 +1,149 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2018"
+name = "regex"
+version = "1.7.1"
+authors = ["The Rust Project Developers"]
+exclude = [
+ "/scripts/*",
+ "/.github/*",
+]
+autotests = false
+description = """
+An implementation of regular expressions for Rust. This implementation uses
+finite automata and guarantees linear time matching on all inputs.
+"""
+homepage = "https://github.com/rust-lang/regex"
+documentation = "https://docs.rs/regex"
+readme = "README.md"
+categories = ["text-processing"]
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/rust-lang/regex"
+
+[profile.bench]
+debug = true
+
+[profile.release]
+debug = true
+
+[profile.test]
+debug = true
+
+[lib]
+doctest = false
+bench = false
+
+[[test]]
+name = "default"
+path = "tests/test_default.rs"
+
+[[test]]
+name = "default-bytes"
+path = "tests/test_default_bytes.rs"
+
+[[test]]
+name = "nfa"
+path = "tests/test_nfa.rs"
+
+[[test]]
+name = "nfa-utf8bytes"
+path = "tests/test_nfa_utf8bytes.rs"
+
+[[test]]
+name = "nfa-bytes"
+path = "tests/test_nfa_bytes.rs"
+
+[[test]]
+name = "backtrack"
+path = "tests/test_backtrack.rs"
+
+[[test]]
+name = "backtrack-utf8bytes"
+path = "tests/test_backtrack_utf8bytes.rs"
+
+[[test]]
+name = "backtrack-bytes"
+path = "tests/test_backtrack_bytes.rs"
+
+[[test]]
+name = "crates-regex"
+path = "tests/test_crates_regex.rs"
+
+[dependencies.aho-corasick]
+version = "0.7.18"
+optional = true
+
+[dependencies.memchr]
+version = "2.4.0"
+optional = true
+
+[dependencies.regex-syntax]
+version = "0.6.27"
+default-features = false
+
+[dev-dependencies.lazy_static]
+version = "1"
+
+[dev-dependencies.quickcheck]
+version = "1.0.3"
+default-features = false
+
+[dev-dependencies.rand]
+version = "0.8.3"
+features = [
+ "getrandom",
+ "small_rng",
+]
+default-features = false
+
+[features]
+default = [
+ "std",
+ "perf",
+ "unicode",
+ "regex-syntax/default",
+]
+pattern = []
+perf = [
+ "perf-cache",
+ "perf-dfa",
+ "perf-inline",
+ "perf-literal",
+]
+perf-cache = []
+perf-dfa = []
+perf-inline = []
+perf-literal = [
+ "aho-corasick",
+ "memchr",
+]
+std = []
+unicode = [
+ "unicode-age",
+ "unicode-bool",
+ "unicode-case",
+ "unicode-gencat",
+ "unicode-perl",
+ "unicode-script",
+ "unicode-segment",
+ "regex-syntax/unicode",
+]
+unicode-age = ["regex-syntax/unicode-age"]
+unicode-bool = ["regex-syntax/unicode-bool"]
+unicode-case = ["regex-syntax/unicode-case"]
+unicode-gencat = ["regex-syntax/unicode-gencat"]
+unicode-perl = ["regex-syntax/unicode-perl"]
+unicode-script = ["regex-syntax/unicode-script"]
+unicode-segment = ["regex-syntax/unicode-segment"]
+unstable = ["pattern"]
+use_std = ["std"]
diff --git a/third_party/rust/regex/HACKING.md b/third_party/rust/regex/HACKING.md
new file mode 100644
index 0000000000..34af5b517c
--- /dev/null
+++ b/third_party/rust/regex/HACKING.md
@@ -0,0 +1,341 @@
+Your friendly guide to hacking and navigating the regex library.
+
+This guide assumes familiarity with Rust and Cargo, and at least a perusal of
+the user facing documentation for this crate.
+
+If you're looking for background on the implementation in this library, then
+you can do no better than Russ Cox's article series on implementing regular
+expressions using finite automata: https://swtch.com/~rsc/regexp/
+
+
+## Architecture overview
+
+As you probably already know, this library executes regular expressions using
+finite automata. In particular, a design goal is to make searching linear
+with respect to both the regular expression and the text being searched.
+Meeting that design goal on its own is not so hard and can be done with an
+implementation of the Pike VM (similar to Thompson's construction, but supports
+capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html
+--- This library contains such an implementation in src/pikevm.rs.
+
+Making it fast is harder. One of the key problems with the Pike VM is that it
+can be in more than one state at any point in time, and must shuffle capture
+positions between them. The Pike VM also spends a lot of time following the
+same epsilon transitions over and over again. We can employ one trick to
+speed up the Pike VM: extract one or more literal prefixes from the regular
+expression and execute specialized code to quickly find matches of those
+prefixes in the search text. The Pike VM can then be avoided for most of the
+search, and instead only executed when a prefix is found. The code to find
+prefixes is in the regex-syntax crate (in this repository). The code to search
+for literals is in src/literals.rs. When more than one literal prefix is found,
+we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one
+literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and
+Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this
+library also uses elementary frequency analysis to choose the right byte to run
+`memchr` with.
+
+Of course, detecting prefix literals can only take us so far. Not all regular
+expressions have literal prefixes. To remedy this, we try another approach
+to executing the Pike VM: backtracking, whose implementation can be found in
+src/backtrack.rs. One reason why backtracking can be faster is that it avoids
+excessive shuffling of capture groups. Of course, backtracking is susceptible
+to exponential runtimes, so we keep track of every state we've visited to make
+sure we never visit it again. This guarantees linear time execution, but we
+pay for it with the memory required to track visited states. Because of the
+memory requirement, we only use this engine on small search strings *and* small
+regular expressions.
+
+Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs.
+It is distinct from the Pike VM in that the DFA is explicitly represented in
+memory and is only ever in one state at a time. It is said to be "lazy" because
+the DFA is computed as text is searched, where each byte in the search text
+results in at most one new DFA state. It is made fast by caching states. DFAs
+are susceptible to exponential state blow up (where the worst case is computing
+a new state for every input byte, regardless of what's in the state cache). To
+avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache
+is full, it is wiped and state computation starts over again. If the cache is
+wiped too frequently, then the DFA gives up and searching falls back to one of
+the aforementioned algorithms.
+
+All of the above matching engines expose precisely the same matching semantics.
+This is indeed tested. (See the section below about testing.)
+
+The following sub-sections describe the rest of the library and how each of the
+matching engines is actually used.
+
+### Parsing
+
+Regular expressions are parsed using the regex-syntax crate, which is
+maintained in this repository. The regex-syntax crate defines an abstract
+syntax and provides very detailed error messages when a parse error is
+encountered. Parsing is done in a separate crate so that others may benefit
+from its existence, and because it is relatively divorced from the rest of the
+regex library.
+
+The regex-syntax crate also provides sophisticated support for extracting
+prefix and suffix literals from regular expressions.
+
+### Compilation
+
+The compiler is in src/compile.rs. The input to the compiler is some abstract
+syntax for a regular expression and the output is a sequence of opcodes that
+matching engines use to execute a search. (One can think of matching engines as
+mini virtual machines.) The sequence of opcodes is a particular encoding of a
+non-deterministic finite automaton. In particular, the opcodes explicitly rely
+on epsilon transitions.
+
+Consider a simple regular expression like `a|b`. Its compiled form looks like
+this:
+
+ 000 Save(0)
+ 001 Split(2, 3)
+ 002 'a' (goto: 4)
+ 003 'b'
+ 004 Save(1)
+ 005 Match
+
+The first column is the instruction pointer and the second column is the
+instruction. Save instructions indicate that the current position in the input
+should be stored in a captured location. Split instructions represent a binary
+branch in the program (i.e., epsilon transitions). The instructions `'a'` and
+`'b'` indicate that the literal bytes `'a'` or `'b'` should match.
+
+In older versions of this library, the compilation looked like this:
+
+ 000 Save(0)
+ 001 Split(2, 3)
+ 002 'a'
+ 003 Jump(5)
+ 004 'b'
+ 005 Save(1)
+ 006 Match
+
+In particular, empty instructions that merely served to move execution from one
+point in the program to another were removed. Instead, every instruction has a
+`goto` pointer embedded into it. This resulted in a small performance boost for
+the Pike VM, because it was one fewer epsilon transition that it had to follow.
+
+There exist more instructions and they are defined and documented in
+src/prog.rs.
+
+Compilation has several knobs and a few unfortunately complicated invariants.
+Namely, the output of compilation can be one of two types of programs: a
+program that executes on Unicode scalar values or a program that executes
+on raw bytes. In the former case, the matching engine is responsible for
+performing UTF-8 decoding and executing instructions using Unicode codepoints.
+In the latter case, the program handles UTF-8 decoding implicitly, so that the
+matching engine can execute on raw bytes. All matching engines can execute
+either Unicode or byte based programs except for the lazy DFA, which requires
+byte based programs. In general, both representations were kept because (1) the
+lazy DFA requires byte based programs so that states can be encoded in a memory
+efficient manner and (2) the Pike VM benefits greatly from inlining Unicode
+character classes into fewer instructions as it results in fewer epsilon
+transitions.
+
+N.B. UTF-8 decoding is built into the compiled program by making use of the
+utf8-ranges crate. The compiler in this library factors out common suffixes to
+reduce the size of huge character classes (e.g., `\pL`).
+
+A regrettable consequence of this split in instruction sets is we generally
+need to compile two programs; one for NFA execution and one for the lazy DFA.
+
+In fact, it is worse than that: the lazy DFA is not capable of finding the
+starting location of a match in a single scan, and must instead execute a
+backwards search after finding the end location. To execute a backwards search,
+we must have compiled the regular expression *in reverse*.
+
+This means that every compilation of a regular expression generally results in
+three distinct programs. It would be possible to lazily compile the Unicode
+program, since it is never needed if (1) the regular expression uses no word
+boundary assertions and (2) the caller never asks for sub-capture locations.
+
+### Execution
+
+At the time of writing, there are four matching engines in this library:
+
+1. The Pike VM (supports captures).
+2. Bounded backtracking (supports captures).
+3. Literal substring or multi-substring search.
+4. Lazy DFA (no support for Unicode word boundary assertions).
+
+Only the first two matching engines are capable of executing every regular
+expression program. They also happen to be the slowest, which means we need
+some logic that (1) knows various facts about the regular expression and (2)
+knows what the caller wants. Using this information, we can determine which
+engine (or engines) to use.
+
+The logic for choosing which engine to execute is in src/exec.rs and is
+documented on the Exec type. Exec values contain regular expression Programs
+(defined in src/prog.rs), which contain all the necessary tidbits for actually
+executing a regular expression on search text.
+
+For the most part, the execution logic is straight-forward and follows the
+limitations of each engine described above pretty faithfully. The hairiest
+part of src/exec.rs by far is the execution of the lazy DFA, since it requires
+a forwards and backwards search, and then falls back to either the Pike VM or
+backtracking if the caller requested capture locations.
+
+The Exec type also contains mutable scratch space for each type of matching
+engine. This scratch space is used during search (for example, for the lazy
+DFA, it contains compiled states that are reused on subsequent searches).
+
+### Programs
+
+A regular expression program is essentially a sequence of opcodes produced by
+the compiler plus various facts about the regular expression (such as whether
+it is anchored, its capture names, etc.).
+
+### The regex! macro
+
+The `regex!` macro no longer exists. It was developed in a bygone era as a
+compiler plugin during the infancy of the regex crate. Back then, the only
+matching engine in the crate was the Pike VM. The `regex!` macro was, itself,
+also a Pike VM. The only advantages it offered over the dynamic Pike VM that
+was built at runtime were the following:
+
+ 1. Syntax checking was done at compile time. Your Rust program wouldn't
+ compile if your regex didn't compile.
+ 2. Reduction of overhead that was proportional to the size of the regex.
+ For the most part, this overhead consisted of heap allocation, which
+ was nearly eliminated in the compiler plugin.
+
+The main takeaway here is that the compiler plugin was a marginally faster
+version of a slow regex engine. As the regex crate evolved, it grew other regex
+engines (DFA, bounded backtracker) and sophisticated literal optimizations.
+The regex macro didn't keep pace, and it therefore became (dramatically) slower
+than the dynamic engines. The only reason left to use it was for the compile
+time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint
+tool) has a lint that checks your regular expression validity, which mostly
+replaces that use case.
+
+Additionally, the regex compiler plugin stopped receiving maintenance. Nobody
+complained. At that point, it seemed prudent to just remove it.
+
+Will a compiler plugin be brought back? The future is murky, but there is
+definitely an opportunity there to build something that is faster than the
+dynamic engines in some cases. But it will be challenging! As of now, there
+are no plans to work on this.
+
+
+## Testing
+
+A key aspect of any mature regex library is its test suite. A subset of the
+tests in this library come from Glenn Fowler's AT&T test suite (its online
+presence seems gone at the time of writing). The source of the test suite is
+located in src/testdata. The scripts/regex-match-tests.py takes the test suite
+in src/testdata and generates tests/matches.rs.
+
+There are also many other manually crafted tests and regression tests in
+tests/tests.rs. Some of these tests were taken from RE2.
+
+The biggest source of complexity in the tests is related to answering this
+question: how can we reuse the tests to check all of our matching engines? One
+approach would have been to encode every test into some kind of format (like
+the AT&T test suite) and code generate tests for each matching engine. The
+approach we use in this library is to create a Cargo.toml entry point for each
+matching engine we want to test. The entry points are:
+
+* `tests/test_default.rs` - tests `Regex::new`
+* `tests/test_default_bytes.rs` - tests `bytes::Regex::new`
+* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA
+ algorithm on every regex.
+* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA
+ algorithm on every regex and use *arbitrary* byte based programs.
+* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA
+ algorithm on every regex and use *UTF-8* byte based programs.
+* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use
+ backtracking on every regex.
+* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use
+ backtracking on every regex and use *arbitrary* byte based programs.
+* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use
+ backtracking on every regex and use *UTF-8* byte based programs.
+* `tests/test_crates_regex.rs` - tests to make sure that all of the
+ backends behave in the same way against a number of quickcheck
+ generated random inputs. These tests need to be enabled through
+ the `RUST_REGEX_RANDOM_TEST` environment variable (see
+ below).
+
+The lazy DFA and pure literal engines are absent from this list because
+they cannot be used on every regular expression. Instead, we rely on
+`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible.
+
+Since the tests are repeated several times, and because `cargo test` runs all
+entry points, it can take a while to compile everything. To reduce compile
+times slightly, try using `cargo test --test default`, which will only use the
+`tests/test_default.rs` entry point.
+
+The random testing takes quite a while, so it is not enabled by default.
+In order to run the random testing you can set the
+`RUST_REGEX_RANDOM_TEST` environment variable to anything before
+invoking `cargo test`. Note that this variable is inspected at compile
+time, so if the tests don't seem to be running, you may need to run
+`cargo clean`.
+
+## Benchmarking
+
+The benchmarking in this crate is made up of many micro-benchmarks. Currently,
+there are two primary sets of benchmarks: the benchmarks that were adopted
+at this library's inception (in `bench/src/misc.rs`) and a newer set of
+benchmarks meant to test various optimizations. Specifically, the latter set
+contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter
+set are all executed on the same lengthy input whereas the former benchmarks
+are executed on strings of varying length.
+
+There is also a smattering of benchmarks for parsing and compilation.
+
+Benchmarks are in a separate crate so that its dependencies can be managed
+separately from the main regex crate.
+
+Benchmarking follows a similarly wonky setup as tests. There are multiple entry
+points:
+
+* `bench_rust.rs` - benchmarks `Regex::new`
+* `bench_rust_bytes.rs` - benchmarks `bytes::Regex::new`
+* `bench_pcre.rs` - benchmarks PCRE
+* `bench_onig.rs` - benchmarks Oniguruma
+
+The PCRE and Oniguruma benchmarks exist as a comparison point to a mature
+regular expression library. In general, this regex library compares favorably
+(there are even a few benchmarks that PCRE simply runs too slowly on or
+outright can't execute at all). I would love to add other regular expression
+library benchmarks (especially RE2).
+
+If you're hacking on one of the matching engines and just want to see
+benchmarks, then all you need to run is:
+
+ $ (cd bench && ./run rust)
+
+If you want to compare your results with older benchmarks, then try:
+
+ $ (cd bench && ./run rust | tee old)
+ $ ... make it faster
+ $ (cd bench && ./run rust | tee new)
+ $ cargo benchcmp old new --improvements
+
+The `cargo-benchcmp` utility is available here:
+https://github.com/BurntSushi/cargo-benchcmp
+
+The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See
+`./bench/bench --help`.
+
+## Dev Docs
+
+When digging your teeth into the codebase for the first time, the
+crate documentation can be a great resource. By default `rustdoc`
+will strip out all documentation of private crate members in an
+effort to help consumers of the crate focus on the *interface*
+without having to concern themselves with the *implementation*.
+Normally this is a great thing, but if you want to start hacking
+on regex internals it is not what you want. Many of the private members
+of this crate are well documented with rustdoc style comments, and
+it would be a shame to miss out on the opportunity that presents.
+You can generate the private docs with:
+
+```
+$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments
+```
+
+Then just point your browser at `target/doc/regex/index.html`.
+
+See https://github.com/rust-lang/rust/issues/15347 for more info
+about generating developer docs for internal use.
diff --git a/third_party/rust/regex/LICENSE-APACHE b/third_party/rust/regex/LICENSE-APACHE
new file mode 100644
index 0000000000..16fe87b06e
--- /dev/null
+++ b/third_party/rust/regex/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/third_party/rust/regex/LICENSE-MIT b/third_party/rust/regex/LICENSE-MIT
new file mode 100644
index 0000000000..39d4bdb5ac
--- /dev/null
+++ b/third_party/rust/regex/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2014 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/third_party/rust/regex/PERFORMANCE.md b/third_party/rust/regex/PERFORMANCE.md
new file mode 100644
index 0000000000..8cd0d9c719
--- /dev/null
+++ b/third_party/rust/regex/PERFORMANCE.md
@@ -0,0 +1,277 @@
+Your friendly guide to understanding the performance characteristics of this
+crate.
+
+This guide assumes some familiarity with the public API of this crate, which
+can be found here: https://docs.rs/regex
+
+## Theory vs. Practice
+
+One of the design goals of this crate is to provide worst case linear time
+behavior with respect to the text searched using finite state automata. This
+means that, *in theory*, the performance of this crate is much better than most
+regex implementations, which typically use backtracking which has worst case
+exponential time.
+
+For example, try opening a Python interpreter and typing this:
+
+ >>> import re
+ >>> re.search('(a*)*c', 'a' * 30).span()
+
+I'll wait.
+
+At some point, you'll figure out that it won't terminate any time soon. ^C it.
+
+The promise of this crate is that *this pathological behavior can't happen*.
+
+With that said, just because we have protected ourselves against worst case
+exponential behavior doesn't mean we are immune from large constant factors
+or places where the current regex engine isn't quite optimal. This guide will
+detail those cases and provide guidance on how to avoid them, among other
+bits of general advice.
+
+## Thou Shalt Not Compile Regular Expressions In A Loop
+
+**Advice**: Use `lazy_static` to amortize the cost of `Regex` compilation.
+
+Don't do it unless you really don't mind paying for it. Compiling a regular
+expression in this crate is quite expensive. It is conceivable that it may get
+faster some day, but I wouldn't hold out hope for, say, an order of magnitude
+improvement. In particular, compilation can take anywhere from a few dozen
+microseconds to a few dozen milliseconds. Yes, milliseconds. Unicode character
+classes, in particular, have the largest impact on compilation performance. At
+the time of writing, for example, `\pL{100}` takes around 44ms to compile. This
+is because `\pL` corresponds to every letter in Unicode and compilation must
+turn it into a proper automaton that decodes a subset of UTF-8 which
+corresponds to those letters. Compilation also spends some cycles shrinking the
+size of the automaton.
+
+This means that in order to realize efficient regex matching, one must
+*amortize the cost of compilation*. Trivially, if a call to `is_match` is
+inside a loop, then make sure your call to `Regex::new` is *outside* that loop.
+
+In many programming languages, regular expressions can be conveniently defined
+and compiled in a global scope, and code can reach out and use them as if
+they were global static variables. In Rust, there is really no concept of
+life-before-main, and therefore, one cannot utter this:
+
+ static MY_REGEX: Regex = Regex::new("...").unwrap();
+
+Unfortunately, this would seem to imply that one must pass `Regex` objects
+around to everywhere they are used, which can be especially painful depending
+on how your program is structured. Thankfully, the
+[`lazy_static`](https://crates.io/crates/lazy_static)
+crate provides an answer that works well:
+
+ use lazy_static::lazy_static;
+ use regex::Regex;
+
+ fn some_helper_function(text: &str) -> bool {
+ lazy_static! {
+ static ref MY_REGEX: Regex = Regex::new("...").unwrap();
+ }
+ MY_REGEX.is_match(text)
+ }
+
+In other words, the `lazy_static!` macro enables us to define a `Regex` *as if*
+it were a global static value. What is actually happening under the covers is
+that the code inside the macro (i.e., `Regex::new(...)`) is run on *first use*
+of `MY_REGEX` via a `Deref` impl. The implementation is admittedly magical, but
+it's self contained and everything works exactly as you expect. In particular,
+`MY_REGEX` can be used from multiple threads without wrapping it in an `Arc` or
+a `Mutex`. On that note...
+
+## Using a regex from multiple threads
+
+**Advice**: The performance impact from using a `Regex` from multiple threads
+is likely negligible. If necessary, clone the `Regex` so that each thread gets
+its own copy. Cloning a regex does not incur any additional memory overhead
+beyond what would be used by using a `Regex` from multiple threads
+simultaneously. *Its only cost is ergonomics.*
+
+It is supported and encouraged to define your regexes using `lazy_static!` as
+if they were global static values, and then use them to search text from
+multiple threads simultaneously.
+
+One might imagine that this is possible because a `Regex` represents a
+*compiled* program, so that any allocation or mutation is already done, and is
+therefore read-only. Unfortunately, this is not true. Each type of search
+strategy in this crate requires some kind of mutable scratch space to use
+*during search*. For example, when executing a DFA, its states are computed
+lazily and reused on subsequent searches. Those states go into that mutable
+scratch space.
+
+The mutable scratch space is an implementation detail, and in general, its
+mutation should not be observable from users of this crate. Therefore, it uses
+interior mutability. This implies that `Regex` can either only be used from one
+thread, or it must do some sort of synchronization. Either choice is
+reasonable, but this crate chooses the latter, in particular because it is
+ergonomic and makes use with `lazy_static!` straightforward.
+
+Synchronization implies *some* amount of overhead. When a `Regex` is used from
+a single thread, this overhead is negligible. When a `Regex` is used from
+multiple threads simultaneously, it is possible for the overhead of
+synchronization from contention to impact performance. The specific case where
+contention may happen is if you are calling any of these methods repeatedly
+from multiple threads simultaneously:
+
+* shortest_match
+* is_match
+* find
+* captures
+
+In particular, every invocation of one of these methods must synchronize with
+other threads to retrieve its mutable scratch space before searching can start.
+If, however, you are using one of these methods:
+
+* find_iter
+* captures_iter
+
+Then you may not suffer from contention since the cost of synchronization is
+amortized on *construction of the iterator*. That is, the mutable scratch space
+is obtained when the iterator is created and retained throughout its lifetime.
+
+## Only ask for what you need
+
+**Advice**: Prefer in this order: `is_match`, `find`, `captures`.
+
+There are three primary search methods on a `Regex`:
+
+* is_match
+* find
+* captures
+
+In general, these are ordered from fastest to slowest.
+
+`is_match` is fastest because it doesn't actually need to find the start or the
+end of the leftmost-first match. It can quit immediately after it knows there
+is a match. For example, given the regex `a+` and the haystack, `aaaaa`, the
+search will quit after examining the first byte.
+
+In contrast, `find` must return both the start and end location of the
+leftmost-first match. It can use the DFA matcher for this, but must run it
+forwards once to find the end of the match *and then run it backwards* to find
+the start of the match. The two scans and the cost of finding the real end of
+the leftmost-first match make this more expensive than `is_match`.
+
+`captures` is the most expensive of them all because it must do what `find`
+does, and then run either the bounded backtracker or the Pike VM to fill in the
+capture group locations. Both of these are simulations of an NFA, which must
+spend a lot of time shuffling states around. The DFA limits the performance hit
+somewhat by restricting the amount of text that must be searched via an NFA
+simulation.
+
+One other method not mentioned is `shortest_match`. This method has precisely
+the same performance characteristics as `is_match`, except it will return the
+end location at which it discovered a match. For example, given the regex `a+`
+and the haystack `aaaaa`, `shortest_match` may return `1` as opposed to `5`,
+the latter of which being the correct end location of the leftmost-first match.
+
+## Literals in your regex may make it faster
+
+**Advice**: Literals can reduce the work that the regex engine needs to do. Use
+them if you can, especially as prefixes.
+
+In particular, if your regex starts with a prefix literal, the prefix is
+quickly searched before entering the (much slower) regex engine. For example,
+given the regex `foo\w+`, the literal `foo` will be searched for using
+Boyer-Moore. If there's no match, then no regex engine is ever used. Only when
+there's a match is the regex engine invoked at the location of the match, which
+effectively permits the regex engine to skip large portions of a haystack.
+If a regex is comprised entirely of literals (possibly more than one), then
+it's possible that the regex engine can be avoided entirely even when there's a
+match.
+
+When one literal is found, Boyer-Moore is used. When multiple literals are
+found, then an optimized version of Aho-Corasick is used.
+
+This optimization is in particular extended quite a bit in this crate. Here are
+a few examples of regexes that get literal prefixes detected:
+
+* `(foo|bar)` detects `foo` and `bar`
+* `(a|b)c` detects `ac` and `bc`
+* `[ab]foo[yz]` detects `afooy`, `afooz`, `bfooy` and `bfooz`
+* `a?b` detects `a` and `b`
+* `a*b` detects `a` and `b`
+* `(ab){3,6}` detects `ababab`
+
+Literals in anchored regexes can also be used for detecting non-matches very
+quickly. For example, `^foo\w+` and `\w+foo$` may be able to detect a non-match
+just by examining the first (or last) three bytes of the haystack.
+
+## Unicode word boundaries may prevent the DFA from being used
+
+**Advice**: In most cases, `\b` should work well. If not, use `(?-u:\b)`
+instead of `\b` if you care about consistent performance more than correctness.
+
+It's a sad state of the current implementation. At the moment, the DFA will try
+to interpret Unicode word boundaries as if they were ASCII word boundaries.
+If the DFA comes across any non-ASCII byte, it will quit and fall back to an
+alternative matching engine that can handle Unicode word boundaries correctly.
+The alternate matching engine is generally quite a bit slower (perhaps by an
+order of magnitude). If necessary, this can be ameliorated in two ways.
+
+The first way is to add some number of literal prefixes to your regular
+expression. Even though the DFA may not be used, specialized routines will
+still kick in to find prefix literals quickly, which limits how much work the
+NFA simulation will need to do.
+
+The second way is to give up on Unicode and use an ASCII word boundary instead.
+One can use an ASCII word boundary by disabling Unicode support. That is,
+instead of using `\b`, use `(?-u:\b)`. Namely, given the regex `\b.+\b`, it
+can be transformed into a regex that uses the DFA with `(?-u:\b).+(?-u:\b)`. It
+is important to limit the scope of disabling the `u` flag, since it might lead
+to a syntax error if the regex could match arbitrary bytes. For example, if one
+wrote `(?-u)\b.+\b`, then a syntax error would be returned because `.` matches
+any *byte* when the Unicode flag is disabled.
+
+The second way isn't appreciably different than just using a Unicode word
+boundary in the first place, since the DFA will speculatively interpret it as
+an ASCII word boundary anyway. The key difference is that if an ASCII word
+boundary is used explicitly, then the DFA won't quit in the presence of
+non-ASCII UTF-8 bytes. This results in giving up correctness in exchange for
+more consistent performance.
+
+N.B. When using `bytes::Regex`, Unicode support is disabled by default, so one
+can simply write `\b` to get an ASCII word boundary.
+
+## Excessive counting can lead to exponential state blow up in the DFA
+
+**Advice**: Don't write regexes that cause DFA state blow up if you care about
+match performance.
+
+Wait, didn't I say that this crate guards against exponential worst cases?
+Well, it turns out that the process of converting an NFA to a DFA can lead to
+an exponential blow up in the number of states. This crate specifically guards
+against exponential blow up by doing two things:
+
+1. The DFA is computed lazily. That is, a state in the DFA only exists in
+ memory if it is visited. In particular, the lazy DFA guarantees that *at
+ most* one state is created for every byte of input. This, on its own,
+ guarantees linear time complexity.
+2. Of course, creating a new state for *every* byte of input means that search
+ will go incredibly slow because of very large constant factors. On top of
+ that, creating a state for every byte in a large haystack could result in
+ exorbitant memory usage. To ameliorate this, the DFA bounds the number of
+ states it can store. Once it reaches its limit, it flushes its cache. This
+ prevents reuse of states that it already computed. If the cache is flushed
+ too frequently, then the DFA will give up and execution will fall back to
+ one of the NFA simulations.
+
+In effect, this crate will detect exponential state blow up and fall back to
+a search routine with fixed memory requirements. This does, however, mean that
+searching will be much slower than one might expect. Regexes that rely on
+counting in particular are strong aggravators of this behavior. For example,
+matching `[01]*1[01]{20}$` against a random sequence of `0`s and `1`s.
+
+In the future, it may be possible to increase the bound that the DFA uses,
+which would allow the caller to choose how much memory they're willing to
+spend.
+
+## Resist the temptation to "optimize" regexes
+
+**Advice**: This ain't a backtracking engine.
+
+An entire book was written on how to optimize Perl-style regular expressions.
+Most of those techniques are not applicable for this library. For example,
+there is no problem with using non-greedy matching or having lots of
+alternations in your regex.
diff --git a/third_party/rust/regex/README.md b/third_party/rust/regex/README.md
new file mode 100644
index 0000000000..861417da65
--- /dev/null
+++ b/third_party/rust/regex/README.md
@@ -0,0 +1,246 @@
+regex
+=====
+A Rust library for parsing, compiling, and executing regular expressions. Its
+syntax is similar to Perl-style regular expressions, but lacks a few features
+like look around and backreferences. In exchange, all searches execute in
+linear time with respect to the size of the regular expression and search text.
+Much of the syntax and implementation is inspired
+by [RE2](https://github.com/google/re2).
+
+[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
+[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex)
+[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
+
+### Documentation
+
+[Module documentation with examples](https://docs.rs/regex).
+The module documentation also includes a comprehensive description of the
+syntax supported.
+
+Documentation with examples for the various matching functions and iterators
+can be found on the
+[`Regex` type](https://docs.rs/regex/*/regex/struct.Regex.html).
+
+### Usage
+
+To bring this crate into your repository, either add `regex` to your
+`Cargo.toml`, or run `cargo add regex`.
+
+Here's a simple example that matches a date in YYYY-MM-DD format and prints the
+year, month and day:
+
+```rust
+use regex::Regex;
+
+fn main() {
+ let re = Regex::new(r"(?x)
+(?P<year>\d{4}) # the year
+-
+(?P<month>\d{2}) # the month
+-
+(?P<day>\d{2}) # the day
+").unwrap();
+ let caps = re.captures("2010-03-14").unwrap();
+
+ assert_eq!("2010", &caps["year"]);
+ assert_eq!("03", &caps["month"]);
+ assert_eq!("14", &caps["day"]);
+}
+```
+
+If you have lots of dates in text that you'd like to iterate over, then it's
+easy to adapt the above example with an iterator:
+
+```rust
+use regex::Regex;
+
+const TO_SEARCH: &'static str = "
+On 2010-03-14, foo happened. On 2014-10-14, bar happened.
+";
+
+fn main() {
+ let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
+
+ for caps in re.captures_iter(TO_SEARCH) {
+ // Note that all of the unwraps are actually OK for this regex
+ // because the only way for the regex to match is if all of the
+ // capture groups match. This is not true in general though!
+ println!("year: {}, month: {}, day: {}",
+ caps.get(1).unwrap().as_str(),
+ caps.get(2).unwrap().as_str(),
+ caps.get(3).unwrap().as_str());
+ }
+}
+```
+
+This example outputs:
+
+```text
+year: 2010, month: 03, day: 14
+year: 2014, month: 10, day: 14
+```
+
+### Usage: Avoid compiling the same regex in a loop
+
+It is an anti-pattern to compile the same regular expression in a loop since
+compilation is typically expensive. (It takes anywhere from a few microseconds
+to a few **milliseconds** depending on the size of the regex.) Not only is
+compilation itself expensive, but this also prevents optimizations that reuse
+allocations internally to the matching engines.
+
+In Rust, it can sometimes be a pain to pass regular expressions around if
+they're used from inside a helper function. Instead, we recommend using the
+[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that
+regular expressions are compiled exactly once.
+
+For example:
+
+```rust,ignore
+use regex::Regex;
+
+fn some_helper_function(text: &str) -> bool {
+ lazy_static! {
+ static ref RE: Regex = Regex::new("...").unwrap();
+ }
+ RE.is_match(text)
+}
+```
+
+Specifically, in this example, the regex will be compiled when it is used for
+the first time. On subsequent uses, it will reuse the previous compilation.
+
+### Usage: match regular expressions on `&[u8]`
+
+The main API of this crate (`regex::Regex`) requires the caller to pass a
+`&str` for searching. In Rust, an `&str` is required to be valid UTF-8, which
+means the main API can't be used for searching arbitrary bytes.
+
+To match on arbitrary bytes, use the `regex::bytes::Regex` API. The API
+is identical to the main API, except that it takes an `&[u8]` to search
+on instead of an `&str`. By default, `.` will match any *byte* using
+`regex::bytes::Regex`, while `.` will match any *UTF-8 encoded Unicode scalar
+value* using the main API.
+
+This example shows how to find all null-terminated strings in a slice of bytes:
+
+```rust
+use regex::bytes::Regex;
+
+let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap();
+let text = b"foo\x00bar\x00baz\x00";
+
+// Extract all of the strings without the null terminator from each match.
+// The unwrap is OK here since a match requires the `cstr` capture to match.
+let cstrs: Vec<&[u8]> =
+ re.captures_iter(text)
+ .map(|c| c.name("cstr").unwrap().as_bytes())
+ .collect();
+assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs);
+```
+
+Notice here that the `[^\x00]+` will match any *byte* except for `NUL`. When
+using the main API, `[^\x00]+` would instead match any valid UTF-8 sequence
+except for `NUL`.
+
+### Usage: match multiple regular expressions simultaneously
+
+This demonstrates how to use a `RegexSet` to match multiple (possibly
+overlapping) regular expressions in a single scan of the search text:
+
+```rust
+use regex::RegexSet;
+
+let set = RegexSet::new(&[
+ r"\w+",
+ r"\d+",
+ r"\pL+",
+ r"foo",
+ r"bar",
+ r"barfoo",
+ r"foobar",
+]).unwrap();
+
+// Iterate over and collect all of the matches.
+let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+
+// You can also test whether a particular regex matched:
+let matches = set.matches("foobar");
+assert!(!matches.matched(5));
+assert!(matches.matched(6));
+```
+
+### Usage: enable SIMD optimizations
+
+SIMD optimizations are enabled automatically on Rust stable 1.27 and newer.
+For nightly versions of Rust, this requires a recent version with the SIMD
+features stabilized.
+
+
+### Usage: a regular expression parser
+
+This repository contains a crate that provides a well tested regular expression
+parser, abstract syntax and a high-level intermediate representation for
+convenient analysis. It provides no facilities for compilation or execution.
+This may be useful if you're implementing your own regex engine or otherwise
+need to do analysis on the syntax of a regular expression. It is otherwise not
+recommended for general use.
+
+[Documentation `regex-syntax`.](https://docs.rs/regex-syntax)
+
+
+### Crate features
+
+This crate comes with several features that permit tweaking the trade off
+between binary size, compilation time and runtime performance. Users of this
+crate can selectively disable Unicode tables, or choose from a variety of
+optimizations performed by this crate to disable.
+
+When all of these features are disabled, runtime match performance may be much
+worse, but if you're matching on short strings, or if high performance isn't
+necessary, then such a configuration is perfectly serviceable. To disable
+all such features, use the following `Cargo.toml` dependency configuration:
+
+```toml
+[dependencies.regex]
+version = "1.3"
+default-features = false
+# regex currently requires the standard library, you must re-enable it.
+features = ["std"]
+```
+
+This will reduce the dependency tree of `regex` down to a single crate
+(`regex-syntax`).
+
+The full set of features one can disable is
+[in the "Crate features" section of the documentation](https://docs.rs/regex/*/#crate-features).
+
+
+### Minimum Rust version policy
+
+This crate's minimum supported `rustc` version is `1.41.1`.
+
+The current **tentative** policy is that the minimum Rust version required
+to use this crate can be increased in minor version updates. For example, if
+regex 1.0 requires Rust 1.20.0, then regex 1.0.z for all values of `z` will
+also require Rust 1.20.0 or newer. However, regex 1.y for `y > 0` may require a
+newer minimum version of Rust.
+
+In general, this crate will be conservative with respect to the minimum
+supported version of Rust.
+
+
+### License
+
+This project is licensed under either of
+
+ * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
+ https://www.apache.org/licenses/LICENSE-2.0)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or
+ https://opensource.org/licenses/MIT)
+
+at your option.
+
+The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode
+License Agreement
+([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)).
diff --git a/third_party/rust/regex/UNICODE.md b/third_party/rust/regex/UNICODE.md
new file mode 100644
index 0000000000..df7d21ed97
--- /dev/null
+++ b/third_party/rust/regex/UNICODE.md
@@ -0,0 +1,259 @@
+# Unicode conformance
+
+This document describes the regex crate's conformance to Unicode's
+[UTS#18](https://unicode.org/reports/tr18/)
+report, which lays out 3 levels of support: Basic, Extended and Tailored.
+
+Full support for Level 1 ("Basic Unicode Support") is provided with two
+exceptions:
+
+1. Line boundaries are not Unicode aware. Namely, only the `\n`
+ (`END OF LINE`) character is recognized as a line boundary.
+2. The compatibility properties specified by
+ [RL1.2a](https://unicode.org/reports/tr18/#RL1.2a)
+ are ASCII-only definitions.
+
+Little to no support is provided for either Level 2 or Level 3. For the most
+part, this is because the features are either complex/hard to implement, or at
+the very least, very difficult to implement without sacrificing performance.
+For example, tackling canonical equivalence such that matching worked as one
+would expect regardless of normalization form would be a significant
+undertaking. This is at least partially a result of the fact that this regex
+engine is based on finite automata, which admit less of the flexibility
+normally associated with backtracking implementations.
+
+
+## RL1.1 Hex Notation
+
+[UTS#18 RL1.1](https://unicode.org/reports/tr18/#Hex_notation)
+
+Hex Notation refers to the ability to specify a Unicode code point in a regular
+expression via its hexadecimal code point representation. This is useful in
+environments that have poor Unicode font rendering or if you need to express a
+code point that is not normally displayable. All forms of hexadecimal notation
+are supported:
+
+ \x7F hex character code (exactly two digits)
+ \x{10FFFF} any hex character code corresponding to a Unicode code point
+ \u007F hex character code (exactly four digits)
+ \u{7F} any hex character code corresponding to a Unicode code point
+ \U0000007F hex character code (exactly eight digits)
+ \U{7F} any hex character code corresponding to a Unicode code point
+
+Briefly, the `\x{...}`, `\u{...}` and `\U{...}` are all exactly equivalent ways
+of expressing hexadecimal code points. Any number of digits can be written
+within the brackets. In contrast, `\xNN`, `\uNNNN`, `\UNNNNNNNN` are all
+fixed-width variants of the same idea.
+
+Note that when Unicode mode is disabled, any non-ASCII Unicode codepoint is
+banned. Additionally, the `\xNN` syntax represents arbitrary bytes when Unicode
+mode is disabled. That is, the regex `\xFF` matches the Unicode codepoint
+U+00FF (encoded as `\xC3\xBF` in UTF-8) while the regex `(?-u)\xFF` matches
+the literal byte `\xFF`.
+
+
+## RL1.2 Properties
+
+[UTS#18 RL1.2](https://unicode.org/reports/tr18/#Categories)
+
+Full support for Unicode property syntax is provided. Unicode properties
+provide a convenient way to construct character classes of groups of code
+points specified by Unicode. The regex crate does not provide exhaustive
+support, but covers a useful subset. In particular:
+
+* [General categories](https://unicode.org/reports/tr18/#General_Category_Property)
+* [Scripts and Script Extensions](https://unicode.org/reports/tr18/#Script_Property)
+* [Age](https://unicode.org/reports/tr18/#Age)
+* A smattering of boolean properties, including all of those specified by
+ [RL1.2](https://unicode.org/reports/tr18/#RL1.2) explicitly.
+
+In all cases, property name and value abbreviations are supported, and all
+names/values are matched loosely without regard for case, whitespace or
+underscores. Property name aliases can be found in Unicode's
+[`PropertyAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt)
+file, while property value aliases can be found in Unicode's
+[`PropertyValueAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
+file.
+
+The syntax supported is also consistent with the UTS#18 recommendation:
+
+* `\p{Greek}` selects the `Greek` script. Equivalent expressions follow:
+ `\p{sc:Greek}`, `\p{Script:Greek}`, `\p{Sc=Greek}`, `\p{script=Greek}`,
+ `\P{sc!=Greek}`. Similarly for `General_Category` (or `gc` for short) and
+ `Script_Extensions` (or `scx` for short).
+* `\p{age:3.2}` selects all code points in Unicode 3.2.
+* `\p{Alphabetic}` selects the "alphabetic" property and can be abbreviated
+ via `\p{alpha}` (for example).
+* Single letter variants for properties with single letter abbreviations.
+ For example, `\p{Letter}` can be equivalently written as `\pL`.
+
+The following is a list of all properties supported by the regex crate (starred
+properties correspond to properties required by RL1.2):
+
+* `General_Category` \* (including `Any`, `ASCII` and `Assigned`)
+* `Script` \*
+* `Script_Extensions` \*
+* `Age`
+* `ASCII_Hex_Digit`
+* `Alphabetic` \*
+* `Bidi_Control`
+* `Case_Ignorable`
+* `Cased`
+* `Changes_When_Casefolded`
+* `Changes_When_Casemapped`
+* `Changes_When_Lowercased`
+* `Changes_When_Titlecased`
+* `Changes_When_Uppercased`
+* `Dash`
+* `Default_Ignorable_Code_Point` \*
+* `Deprecated`
+* `Diacritic`
+* `Emoji`
+* `Emoji_Presentation`
+* `Emoji_Modifier`
+* `Emoji_Modifier_Base`
+* `Emoji_Component`
+* `Extended_Pictographic`
+* `Extender`
+* `Grapheme_Base`
+* `Grapheme_Cluster_Break`
+* `Grapheme_Extend`
+* `Hex_Digit`
+* `IDS_Binary_Operator`
+* `IDS_Trinary_Operator`
+* `ID_Continue`
+* `ID_Start`
+* `Join_Control`
+* `Logical_Order_Exception`
+* `Lowercase` \*
+* `Math`
+* `Noncharacter_Code_Point` \*
+* `Pattern_Syntax`
+* `Pattern_White_Space`
+* `Prepended_Concatenation_Mark`
+* `Quotation_Mark`
+* `Radical`
+* `Regional_Indicator`
+* `Sentence_Break`
+* `Sentence_Terminal`
+* `Soft_Dotted`
+* `Terminal_Punctuation`
+* `Unified_Ideograph`
+* `Uppercase` \*
+* `Variation_Selector`
+* `White_Space` \*
+* `Word_Break`
+* `XID_Continue`
+* `XID_Start`
+
+
+## RL1.2a Compatibility Properties
+
+[UTS#18 RL1.2a](https://unicode.org/reports/tr18/#RL1.2a)
+
+The regex crate only provides ASCII definitions of the
+[compatibility properties documented in UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties)
+(sans the `\X` class, for matching grapheme clusters, which isn't provided
+at all). This is because it seems to be consistent with most other regular
+expression engines, and in particular, because these are often referred to as
+"ASCII" or "POSIX" character classes.
+
+Note that the `\w`, `\s` and `\d` character classes **are** Unicode aware.
+Their traditional ASCII definition can be used by disabling Unicode. That is,
+`[[:word:]]` and `(?-u)\w` are equivalent.
+
+
+## RL1.3 Subtraction and Intersection
+
+[UTS#18 RL1.3](https://unicode.org/reports/tr18/#Subtraction_and_Intersection)
+
+The regex crate provides full support for nested character classes, along with
+union, intersection (`&&`), difference (`--`) and symmetric difference (`~~`)
+operations on arbitrary character classes.
+
+For example, to match all non-ASCII letters, you could use either
+`[\p{Letter}--\p{Ascii}]` (difference) or `[\p{Letter}&&[^\p{Ascii}]]`
+(intersecting the negation).
+
+
+## RL1.4 Simple Word Boundaries
+
+[UTS#18 RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries)
+
+The regex crate provides basic Unicode aware word boundary assertions. A word
+boundary assertion can be written as `\b`, or `\B` as its negation. A word
+boundary assertion corresponds to a zero-width match, where its adjacent
+characters correspond to word and non-word, or non-word and word characters.
+
+Conformance in this case chooses to define word character in the same way that
+the `\w` character class is defined: a code point that is a member of one of
+the following classes:
+
+* `\p{Alphabetic}`
+* `\p{Join_Control}`
+* `\p{gc:Mark}`
+* `\p{gc:Decimal_Number}`
+* `\p{gc:Connector_Punctuation}`
+
+In particular, this differs slightly from the
+[prescription given in RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries)
+but is permissible according to
+[UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
+Namely, it is convenient and simpler to have `\w` and `\b` be in sync with
+one another.
+
+Finally, Unicode word boundaries can be disabled, which will cause ASCII word
+boundaries to be used instead. That is, `\b` is a Unicode word boundary while
+`(?-u)\b` is an ASCII-only word boundary. This can occasionally be beneficial
+if performance is important, since the implementation of Unicode word
+boundaries is currently sub-optimal on non-ASCII text.
+
+
+## RL1.5 Simple Loose Matches
+
+[UTS#18 RL1.5](https://unicode.org/reports/tr18/#Simple_Loose_Matches)
+
+The regex crate provides full support for case insensitive matching in
+accordance with RL1.5. That is, it uses the "simple" case folding mapping. The
+"simple" mapping was chosen because of a key convenient property: every
+"simple" mapping is a mapping from exactly one code point to exactly one other
+code point. This makes case insensitive matching of character classes, for
+example, straight-forward to implement.
+
+When case insensitive mode is enabled (e.g., `(?i)[a]` is equivalent to `a|A`),
+then all character classes are case folded as well.
+
+
+## RL1.6 Line Boundaries
+
+[UTS#18 RL1.6](https://unicode.org/reports/tr18/#Line_Boundaries)
+
+The regex crate only provides support for recognizing the `\n` (`END OF LINE`)
+character as a line boundary. This choice was made mostly for implementation
+convenience, and to avoid performance cliffs that Unicode word boundaries are
+subject to.
+
+Ideally, it would be nice to at least support `\r\n` as a line boundary as
+well, and in theory, this could be done efficiently.
+
+
+## RL1.7 Code Points
+
+[UTS#18 RL1.7](https://unicode.org/reports/tr18/#Supplementary_Characters)
+
+The regex crate provides full support for Unicode code point matching. Namely,
+the fundamental atom of any match is always a single code point.
+
+Given Rust's strong ties to UTF-8, the following guarantees are also provided:
+
+* All matches are reported on valid UTF-8 code unit boundaries. That is, any
+ match range returned by the public regex API is guaranteed to successfully
+ slice the string that was searched.
+* By consequence of the above, it is impossible to match surrogate code points.
+ No support for UTF-16 is provided, so this is never necessary.
+
+Note that when Unicode mode is disabled, the fundamental atom of matching is
+no longer a code point but a single byte. When Unicode mode is disabled, many
+Unicode features are disabled as well. For example, `(?-u)\pL` is not a valid
+regex but `\pL(?-u)\xFF` (matches any Unicode `Letter` followed by the literal
+byte `\xFF`) is.
diff --git a/third_party/rust/regex/examples/regexdna-input.txt b/third_party/rust/regex/examples/regexdna-input.txt
new file mode 100644
index 0000000000..fb23263397
--- /dev/null
+++ b/third_party/rust/regex/examples/regexdna-input.txt
@@ -0,0 +1,1671 @@
+>ONE Homo sapiens alu
+GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA
+TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT
+AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG
+GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG
+CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT
+GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA
+GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA
+TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG
+AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA
+GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT
+AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC
+AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG
+GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC
+CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG
+AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT
+TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA
+TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT
+GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG
+TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT
+CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG
+CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG
+TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA
+CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG
+AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG
+GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC
+TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA
+TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA
+GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT
+GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC
+ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT
+TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC
+CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG
+CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG
+GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC
+CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT
+GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC
+GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA
+GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA
+GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA
+GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG
+AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT
+CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA
+GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA
+AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC
+GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT
+ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG
+GAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATC
+GCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGC
+GGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGG
+TCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAA
+AAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAG
+GAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACT
+CCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCC
+TGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAG
+ACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGC
+GTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGA
+ACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGA
+CAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCA
+CTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCA
+ACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCG
+CCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGG
+AGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTC
+CGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCG
+AGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACC
+CCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAG
+CTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAG
+CCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGG
+CCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATC
+ACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAA
+AAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGC
+TGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCC
+ACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGG
+CTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGG
+AGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATT
+AGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAA
+TCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGC
+CTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAA
+TCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAG
+CCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGT
+GGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCG
+GGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAG
+CGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTG
+GGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATG
+GTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGT
+AATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTT
+GCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCT
+CAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCG
+GGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTC
+TCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACT
+CGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAG
+ATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGG
+CGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTG
+AGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATA
+CAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGG
+CAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGC
+ACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCAC
+GCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTC
+GAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCG
+GGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCT
+TGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGG
+CGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCA
+GCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGG
+CCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGC
+GCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGG
+CGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGA
+CTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGG
+CCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAA
+ACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCC
+CAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGT
+GAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAA
+AGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGG
+ATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTAC
+TAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGA
+GGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGC
+GCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGG
+TGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTC
+AGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAA
+ATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGA
+GAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC
+AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTG
+TAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGAC
+CAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGT
+GGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAAC
+CCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACA
+GAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACT
+TTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAAC
+ATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCC
+TGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAG
+GTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCG
+TCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAG
+GCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCC
+GTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCT
+ACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCC
+GAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCC
+GGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCAC
+CTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAA
+ATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTG
+AGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCAC
+TGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCT
+CACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAG
+TTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAG
+CCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATC
+GCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCT
+GGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATC
+CCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCC
+TGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGG
+CGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG
+AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCG
+AGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGG
+AGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGT
+GAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAA
+TCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGC
+AGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCA
+AAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGG
+CGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTC
+TACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCG
+GGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGAT
+CGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCG
+CGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAG
+GTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACA
+AAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCA
+GGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCAC
+TCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGC
+CTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGA
+GACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGG
+CGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTG
+AACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCG
+ACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGC
+ACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCC
+AACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGC
+GCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCG
+GAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACT
+CCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCC
+GAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAAC
+CCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA
+GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGA
+GCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAG
+GCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGAT
+CACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTA
+AAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGG
+CTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGC
+CACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTG
+GCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAG
+GAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAT
+TAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGA
+ATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAG
+CCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTA
+ATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCA
+GCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGG
+TGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCC
+GGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGA
+GCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTT
+GGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACAT
+GGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTG
+TAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGT
+TGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTC
+TCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGC
+GGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGT
+CTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTAC
+TCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGA
+GATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGG
+GCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCT
+GAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT
+ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAG
+GCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTG
+CACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCA
+CGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTT
+CGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCC
+GGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGC
+TTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGG
+GCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCC
+AGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTG
+GCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCG
+CGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAG
+GCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAG
+ACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAG
+GCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGA
+AACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATC
+CCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAG
+TGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAA
+AAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCG
+GATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTA
+CTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGG
+AGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCG
+CGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCG
+GTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGT
+CAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAA
+AATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGG
+AGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTC
+CAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCT
+GTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA
+CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCG
+TGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAA
+CCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGAC
+AGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCAC
+TTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAA
+CATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGC
+CTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGA
+GGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCC
+GTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGA
+GGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCC
+CGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGC
+TACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGC
+CGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGC
+CGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCA
+CCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAA
+AATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCT
+GAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCA
+CTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGC
+TCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGA
+GTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTA
+GCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAAT
+CGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCC
+TGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAAT
+CCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGC
+CTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTG
+GCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGG
+GAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGC
+GAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG
+GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGG
+TGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTA
+ATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTG
+CAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTC
+AAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGG
+GCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCT
+CTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTC
+GGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGA
+TCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGC
+GCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGA
+GGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATAC
+AAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGC
+AGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCA
+CTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACG
+CCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCG
+AGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGG
+GCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTT
+GAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGC
+GACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAG
+CACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGC
+CAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCG
+CGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGC
+GGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGAC
+TCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGC
+CGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAA
+CCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCC
+AGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTG
+AGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA
+GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA
+TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT
+AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG
+GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG
+CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT
+GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA
+GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA
+TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG
+AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA
+GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT
+AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC
+AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG
+GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC
+CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG
+AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT
+TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA
+TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT
+GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG
+TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT
+CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG
+CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG
+TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA
+CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG
+AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG
+GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC
+TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA
+TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA
+GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT
+GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC
+ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT
+TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC
+CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG
+CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG
+GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC
+CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT
+GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC
+GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA
+GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA
+GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA
+GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG
+AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT
+CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA
+GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA
+AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC
+GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT
+ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG
+GAGGCTGAGGCAGGAGAATC
+>TWO IUB ambiguity codes
+cttBtatcatatgctaKggNcataaaSatgtaaaDcDRtBggDtctttataattcBgtcg
+tactDtDagcctatttSVHtHttKtgtHMaSattgWaHKHttttagacatWatgtRgaaa
+NtactMcSMtYtcMgRtacttctWBacgaaatatagScDtttgaagacacatagtVgYgt
+cattHWtMMWcStgttaggKtSgaYaaccWStcgBttgcgaMttBYatcWtgacaYcaga
+gtaBDtRacttttcWatMttDBcatWtatcttactaBgaYtcttgttttttttYaaScYa
+HgtgttNtSatcMtcVaaaStccRcctDaataataStcYtRDSaMtDttgttSagtRRca
+tttHatSttMtWgtcgtatSSagactYaaattcaMtWatttaSgYttaRgKaRtccactt
+tattRggaMcDaWaWagttttgacatgttctacaaaRaatataataaMttcgDacgaSSt
+acaStYRctVaNMtMgtaggcKatcttttattaaaaagVWaHKYagtttttatttaacct
+tacgtVtcVaattVMBcttaMtttaStgacttagattWWacVtgWYagWVRctDattBYt
+gtttaagaagattattgacVatMaacattVctgtBSgaVtgWWggaKHaatKWcBScSWa
+accRVacacaaactaccScattRatatKVtactatatttHttaagtttSKtRtacaaagt
+RDttcaaaaWgcacatWaDgtDKacgaacaattacaRNWaatHtttStgttattaaMtgt
+tgDcgtMgcatBtgcttcgcgaDWgagctgcgaggggVtaaScNatttacttaatgacag
+cccccacatYScaMgtaggtYaNgttctgaMaacNaMRaacaaacaKctacatagYWctg
+ttWaaataaaataRattagHacacaagcgKatacBttRttaagtatttccgatctHSaat
+actcNttMaagtattMtgRtgaMgcataatHcMtaBSaRattagttgatHtMttaaKagg
+YtaaBataSaVatactWtataVWgKgttaaaacagtgcgRatatacatVtHRtVYataSa
+KtWaStVcNKHKttactatccctcatgWHatWaRcttactaggatctataDtDHBttata
+aaaHgtacVtagaYttYaKcctattcttcttaataNDaaggaaaDYgcggctaaWSctBa
+aNtgctggMBaKctaMVKagBaactaWaDaMaccYVtNtaHtVWtKgRtcaaNtYaNacg
+gtttNattgVtttctgtBaWgtaattcaagtcaVWtactNggattctttaYtaaagccgc
+tcttagHVggaYtgtNcDaVagctctctKgacgtatagYcctRYHDtgBattDaaDgccK
+tcHaaStttMcctagtattgcRgWBaVatHaaaataYtgtttagMDMRtaataaggatMt
+ttctWgtNtgtgaaaaMaatatRtttMtDgHHtgtcattttcWattRSHcVagaagtacg
+ggtaKVattKYagactNaatgtttgKMMgYNtcccgSKttctaStatatNVataYHgtNa
+BKRgNacaactgatttcctttaNcgatttctctataScaHtataRagtcRVttacDSDtt
+aRtSatacHgtSKacYagttMHtWataggatgactNtatSaNctataVtttRNKtgRacc
+tttYtatgttactttttcctttaaacatacaHactMacacggtWataMtBVacRaSaatc
+cgtaBVttccagccBcttaRKtgtgcctttttRtgtcagcRttKtaaacKtaaatctcac
+aattgcaNtSBaaccgggttattaaBcKatDagttactcttcattVtttHaaggctKKga
+tacatcBggScagtVcacattttgaHaDSgHatRMaHWggtatatRgccDttcgtatcga
+aacaHtaagttaRatgaVacttagattVKtaaYttaaatcaNatccRttRRaMScNaaaD
+gttVHWgtcHaaHgacVaWtgttScactaagSgttatcttagggDtaccagWattWtRtg
+ttHWHacgattBtgVcaYatcggttgagKcWtKKcaVtgaYgWctgYggVctgtHgaNcV
+taBtWaaYatcDRaaRtSctgaHaYRttagatMatgcatttNattaDttaattgttctaa
+ccctcccctagaWBtttHtBccttagaVaatMcBHagaVcWcagBVttcBtaYMccagat
+gaaaaHctctaacgttagNWRtcggattNatcRaNHttcagtKttttgWatWttcSaNgg
+gaWtactKKMaacatKatacNattgctWtatctaVgagctatgtRaHtYcWcttagccaa
+tYttWttaWSSttaHcaaaaagVacVgtaVaRMgattaVcDactttcHHggHRtgNcctt
+tYatcatKgctcctctatVcaaaaKaaaagtatatctgMtWtaaaacaStttMtcgactt
+taSatcgDataaactaaacaagtaaVctaggaSccaatMVtaaSKNVattttgHccatca
+cBVctgcaVatVttRtactgtVcaattHgtaaattaaattttYtatattaaRSgYtgBag
+aHSBDgtagcacRHtYcBgtcacttacactaYcgctWtattgSHtSatcataaatataHt
+cgtYaaMNgBaatttaRgaMaatatttBtttaaaHHKaatctgatWatYaacttMctctt
+ttVctagctDaaagtaVaKaKRtaacBgtatccaaccactHHaagaagaaggaNaaatBW
+attccgStaMSaMatBttgcatgRSacgttVVtaaDMtcSgVatWcaSatcttttVatag
+ttactttacgatcaccNtaDVgSRcgVcgtgaacgaNtaNatatagtHtMgtHcMtagaa
+attBgtataRaaaacaYKgtRccYtatgaagtaataKgtaaMttgaaRVatgcagaKStc
+tHNaaatctBBtcttaYaBWHgtVtgacagcaRcataWctcaBcYacYgatDgtDHccta
+aagacYRcaggattHaYgtKtaatgcVcaataMYacccatatcacgWDBtgaatcBaata
+cKcttRaRtgatgaBDacggtaattaaYtataStgVHDtDctgactcaaatKtacaatgc
+gYatBtRaDatHaactgtttatatDttttaaaKVccYcaaccNcBcgHaaVcattHctcg
+attaaatBtatgcaaaaatYMctSactHatacgaWacattacMBgHttcgaatVaaaaca
+BatatVtctgaaaaWtctRacgBMaatSgRgtgtcgactatcRtattaScctaStagKga
+DcWgtYtDDWKRgRtHatRtggtcgaHgggcgtattaMgtcagccaBggWVcWctVaaat
+tcgNaatcKWagcNaHtgaaaSaaagctcYctttRVtaaaatNtataaccKtaRgtttaM
+tgtKaBtRtNaggaSattHatatWactcagtgtactaKctatttgRYYatKatgtccgtR
+tttttatttaatatVgKtttgtatgtNtataRatWYNgtRtHggtaaKaYtKSDcatcKg
+taaYatcSRctaVtSMWtVtRWHatttagataDtVggacagVcgKWagBgatBtaaagNc
+aRtagcataBggactaacacRctKgttaatcctHgDgttKHHagttgttaatgHBtatHc
+DaagtVaBaRccctVgtgDtacRHSctaagagcggWYaBtSaKtHBtaaactYacgNKBa
+VYgtaacttagtVttcttaatgtBtatMtMtttaattaatBWccatRtttcatagVgMMt
+agctStKctaMactacDNYgKYHgaWcgaHgagattacVgtttgtRaSttaWaVgataat
+gtgtYtaStattattMtNgWtgttKaccaatagNYttattcgtatHcWtctaaaNVYKKt
+tWtggcDtcgaagtNcagatacgcattaagaccWctgcagcttggNSgaNcHggatgtVt
+catNtRaaBNcHVagagaaBtaaSggDaatWaatRccaVgggStctDaacataKttKatt
+tggacYtattcSatcttagcaatgaVBMcttDattctYaaRgatgcattttNgVHtKcYR
+aatRKctgtaaacRatVSagctgtWacBtKVatctgttttKcgtctaaDcaagtatcSat
+aWVgcKKataWaYttcccSaatgaaaacccWgcRctWatNcWtBRttYaattataaNgac
+acaatagtttVNtataNaYtaatRaVWKtBatKagtaatataDaNaaaaataMtaagaaS
+tccBcaatNgaataWtHaNactgtcDtRcYaaVaaaaaDgtttRatctatgHtgttKtga
+aNSgatactttcgagWaaatctKaaDaRttgtggKKagcDgataaattgSaacWaVtaNM
+acKtcaDaaatttctRaaVcagNacaScRBatatctRatcctaNatWgRtcDcSaWSgtt
+RtKaRtMtKaatgttBHcYaaBtgatSgaSWaScMgatNtctcctatttctYtatMatMt
+RRtSaattaMtagaaaaStcgVgRttSVaScagtgDtttatcatcatacRcatatDctta
+tcatVRtttataaHtattcYtcaaaatactttgVctagtaaYttagatagtSYacKaaac
+gaaKtaaatagataatSatatgaaatSgKtaatVtttatcctgKHaatHattagaaccgt
+YaaHactRcggSBNgtgctaaBagBttgtRttaaattYtVRaaaattgtaatVatttctc
+ttcatgBcVgtgKgaHaaatattYatagWacNctgaaMcgaattStagWaSgtaaKagtt
+ttaagaDgatKcctgtaHtcatggKttVDatcaaggtYcgccagNgtgcVttttagagat
+gctaccacggggtNttttaSHaNtatNcctcatSaaVgtactgBHtagcaYggYVKNgta
+KBcRttgaWatgaatVtagtcgattYgatgtaatttacDacSctgctaaaStttaWMagD
+aaatcaVYctccgggcgaVtaaWtStaKMgDtttcaaMtVgBaatccagNaaatcYRMBg
+gttWtaaScKttMWtYataRaDBMaDataatHBcacDaaKDactaMgagttDattaHatH
+taYatDtattDcRNStgaatattSDttggtattaaNSYacttcDMgYgBatWtaMagact
+VWttctttgYMaYaacRgHWaattgRtaagcattctMKVStatactacHVtatgatcBtV
+NataaBttYtSttacKgggWgYDtgaVtYgatDaacattYgatggtRDaVDttNactaSa
+MtgNttaacaaSaBStcDctaccacagacgcaHatMataWKYtaYattMcaMtgSttDag
+cHacgatcaHttYaKHggagttccgatYcaatgatRaVRcaagatcagtatggScctata
+ttaNtagcgacgtgKaaWaactSgagtMYtcttccaKtStaacggMtaagNttattatcg
+tctaRcactctctDtaacWYtgaYaSaagaWtNtatttRacatgNaatgttattgWDDcN
+aHcctgaaHacSgaataaRaataMHttatMtgaSDSKatatHHaNtacagtccaYatWtc
+actaactatKDacSaStcggataHgYatagKtaatKagStaNgtatactatggRHacttg
+tattatgtDVagDVaRctacMYattDgtttYgtctatggtKaRSttRccRtaaccttaga
+gRatagSaaMaacgcaNtatgaaatcaRaagataatagatactcHaaYKBctccaagaRa
+BaStNagataggcgaatgaMtagaatgtcaKttaaatgtaWcaBttaatRcggtgNcaca
+aKtttScRtWtgcatagtttWYaagBttDKgcctttatMggNttattBtctagVtacata
+aaYttacacaaRttcYtWttgHcaYYtaMgBaBatctNgcDtNttacgacDcgataaSat
+YaSttWtcctatKaatgcagHaVaacgctgcatDtgttaSataaaaYSNttatagtaNYt
+aDaaaNtggggacttaBggcHgcgtNtaaMcctggtVtaKcgNacNtatVaSWctWtgaW
+cggNaBagctctgaYataMgaagatBSttctatacttgtgtKtaattttRagtDtacata
+tatatgatNHVgBMtKtaKaNttDHaagatactHaccHtcatttaaagttVaMcNgHata
+tKtaNtgYMccttatcaaNagctggacStttcNtggcaVtattactHaSttatgNMVatt
+MMDtMactattattgWMSgtHBttStStgatatRaDaagattttctatMtaaaaaggtac
+taaVttaSacNaatactgMttgacHaHRttgMacaaaatagttaatatWKRgacDgaRta
+tatttattatcYttaWtgtBRtWatgHaaattHataagtVaDtWaVaWtgStcgtMSgaS
+RgMKtaaataVacataatgtaSaatttagtcgaaHtaKaatgcacatcggRaggSKctDc
+agtcSttcccStYtccRtctctYtcaaKcgagtaMttttcRaYDttgttatctaatcata
+NctctgctatcaMatactataggDaHaaSttMtaDtcNatataattctMcStaaBYtaNa
+gatgtaatHagagSttgWHVcttatKaYgDctcttggtgttMcRaVgSgggtagacaata
+aDtaattSaDaNaHaBctattgNtaccaaRgaVtKNtaaYggHtaKKgHcatctWtctDt
+ttctttggSDtNtaStagttataaacaattgcaBaBWggHgcaaaBtYgctaatgaaatW
+cDcttHtcMtWWattBHatcatcaaatctKMagtDNatttWaBtHaaaNgMttaaStagt
+tctctaatDtcRVaYttgttMtRtgtcaSaaYVgSWDRtaatagctcagDgcWWaaaBaa
+RaBctgVgggNgDWStNaNBKcBctaaKtttDcttBaaggBttgaccatgaaaNgttttt
+tttatctatgttataccaaDRaaSagtaVtDtcaWatBtacattaWacttaSgtattggD
+gKaaatScaattacgWcagKHaaccaYcRcaRttaDttRtttHgaHVggcttBaRgtccc
+tDatKaVtKtcRgYtaKttacgtatBtStaagcaattaagaRgBagSaattccSWYttta
+ttVaataNctgHgttaaNBgcVYgtRtcccagWNaaaacaDNaBcaaaaRVtcWMgBagM
+tttattacgDacttBtactatcattggaaatVccggttRttcatagttVYcatYaSHaHc
+ttaaagcNWaHataaaRWtctVtRYtagHtaaaYMataHYtNBctNtKaatattStgaMc
+BtRgctaKtgcScSttDgYatcVtggaaKtaagatWccHccgKYctaNNctacaWctttt
+gcRtgtVcgaKttcMRHgctaHtVaataaDtatgKDcttatBtDttggNtacttttMtga
+acRattaaNagaactcaaaBBVtcDtcgaStaDctgaaaSgttMaDtcgttcaccaaaag
+gWtcKcgSMtcDtatgtttStaaBtatagDcatYatWtaaaBacaKgcaDatgRggaaYc
+taRtccagattDaWtttggacBaVcHtHtaacDacYgtaatataMagaatgHMatcttat
+acgtatttttatattacHactgttataMgStYaattYaccaattgagtcaaattaYtgta
+tcatgMcaDcgggtcttDtKgcatgWRtataatatRacacNRBttcHtBgcRttgtgcgt
+catacMtttBctatctBaatcattMttMYgattaaVYatgDaatVagtattDacaacDMa
+tcMtHcccataagatgBggaccattVWtRtSacatgctcaaggggYtttDtaaNgNtaaB
+atggaatgtctRtaBgBtcNYatatNRtagaacMgagSaSDDSaDcctRagtVWSHtVSR
+ggaacaBVaccgtttaStagaacaMtactccagtttVctaaRaaHttNcttagcaattta
+ttaatRtaaaatctaacDaBttggSagagctacHtaaRWgattcaaBtctRtSHaNtgta
+cattVcaHaNaagtataccacaWtaRtaaVKgMYaWgttaKggKMtKcgWatcaDatYtK
+SttgtacgaccNctSaattcDcatcttcaaaDKttacHtggttHggRRaRcaWacaMtBW
+VHSHgaaMcKattgtaRWttScNattBBatYtaNRgcggaagacHSaattRtttcYgacc
+BRccMacccKgatgaacttcgDgHcaaaaaRtatatDtatYVtttttHgSHaSaatagct
+NYtaHYaVYttattNtttgaaaYtaKttWtctaNtgagaaaNctNDctaaHgttagDcRt
+tatagccBaacgcaRBtRctRtggtaMYYttWtgataatcgaataattattataVaaaaa
+ttacNRVYcaaMacNatRttcKatMctgaagactaattataaYgcKcaSYaatMNctcaa
+cgtgatttttBacNtgatDccaattattKWWcattttatatatgatBcDtaaaagttgaa
+VtaHtaHHtBtataRBgtgDtaataMttRtDgDcttattNtggtctatctaaBcatctaR
+atgNacWtaatgaagtcMNaacNgHttatactaWgcNtaStaRgttaaHacccgaYStac
+aaaatWggaYaWgaattattcMaactcBKaaaRVNcaNRDcYcgaBctKaacaaaaaSgc
+tccYBBHYaVagaatagaaaacagYtctVccaMtcgtttVatcaatttDRtgWctagtac
+RttMctgtDctttcKtWttttataaatgVttgBKtgtKWDaWagMtaaagaaattDVtag
+gttacatcatttatgtcgMHaVcttaBtVRtcgtaYgBRHatttHgaBcKaYWaatcNSc
+tagtaaaaatttacaatcactSWacgtaatgKttWattagttttNaggtctcaagtcact
+attcttctaagKggaataMgtttcataagataaaaatagattatDgcBVHWgaBKttDgc
+atRHaagcaYcRaattattatgtMatatattgHDtcaDtcaaaHctStattaatHaccga
+cNattgatatattttgtgtDtRatagSacaMtcRtcattcccgacacSattgttKaWatt
+NHcaacttccgtttSRtgtctgDcgctcaaMagVtBctBMcMcWtgtaacgactctcttR
+ggRKSttgYtYatDccagttDgaKccacgVatWcataVaaagaataMgtgataaKYaaat
+cHDaacgataYctRtcYatcgcaMgtNttaBttttgatttaRtStgcaacaaaataccVg
+aaDgtVgDcStctatatttattaaaaRKDatagaaagaKaaYYcaYSgKStctccSttac
+agtcNactttDVttagaaagMHttRaNcSaRaMgBttattggtttaRMggatggcKDgWR
+tNaataataWKKacttcKWaaagNaBttaBatMHtccattaacttccccYtcBcYRtaga
+ttaagctaaYBDttaNtgaaaccHcaRMtKtaaHMcNBttaNaNcVcgVttWNtDaBatg
+ataaVtcWKcttRggWatcattgaRagHgaattNtatttctctattaattaatgaDaaMa
+tacgttgggcHaYVaaNaDDttHtcaaHtcVVDgBVagcMacgtgttaaBRNtatRtcag
+taagaggtttaagacaVaaggttaWatctccgtVtaDtcDatttccVatgtacNtttccg
+tHttatKgScBatgtVgHtYcWagcaKtaMYaaHgtaattaSaHcgcagtWNaatNccNN
+YcacgVaagaRacttctcattcccRtgtgtaattagcSttaaStWaMtctNNcSMacatt
+ataaactaDgtatWgtagtttaagaaaattgtagtNagtcaataaatttgatMMYactaa
+tatcggBWDtVcYttcDHtVttatacYaRgaMaacaStaatcRttttVtagaDtcacWat
+ttWtgaaaagaaagNRacDtttStVatBaDNtaactatatcBSMcccaSttccggaMatg
+attaaWatKMaBaBatttgataNctgttKtVaagtcagScgaaaDggaWgtgttttKtWt
+atttHaatgtagttcactaaKMagttSYBtKtaYgaactcagagRtatagtVtatcaaaW
+YagcgNtaDagtacNSaaYDgatBgtcgataacYDtaaactacagWDcYKaagtttatta
+gcatcgagttKcatDaattgattatDtcagRtWSKtcgNtMaaaaacaMttKcaWcaaSV
+MaaaccagMVtaMaDtMaHaBgaacataBBVtaatVYaNSWcSgNtDNaaKacacBttta
+tKtgtttcaaHaMctcagtaacgtcgYtactDcgcctaNgagagcYgatattttaaattt
+ccattttacatttDaaRctattttWctttacgtDatYtttcagacgcaaVttagtaaKaa
+aRtgVtccataBggacttatttgtttaWNtgttVWtaWNVDaattgtatttBaagcBtaa
+BttaaVatcHcaVgacattccNggtcgacKttaaaRtagRtctWagaYggtgMtataatM
+tgaaRttattttgWcttNtDRRgMDKacagaaaaggaaaRStcccagtYccVattaNaaK
+StNWtgacaVtagaagcttSaaDtcacaacgDYacWDYtgtttKatcVtgcMaDaSKStV
+cgtagaaWaKaagtttcHaHgMgMtctataagBtKaaaKKcactggagRRttaagaBaaN
+atVVcgRcKSttDaactagtSttSattgttgaaRYatggttVttaataaHttccaagDtg
+atNWtaagHtgcYtaactRgcaatgMgtgtRaatRaNaacHKtagactactggaatttcg
+ccataacgMctRgatgttaccctaHgtgWaYcactcacYaattcttaBtgacttaaacct
+gYgaWatgBttcttVttcgttWttMcNYgtaaaatctYgMgaaattacNgaHgaacDVVM
+tttggtHtctaaRgtacagacgHtVtaBMNBgattagcttaRcttacaHcRctgttcaaD
+BggttKaacatgKtttYataVaNattccgMcgcgtagtRaVVaattaKaatggttRgaMc
+agtatcWBttNtHagctaatctagaaNaaacaYBctatcgcVctBtgcaaagDgttVtga
+HtactSNYtaaNccatgtgDacgaVtDcgKaRtacDcttgctaagggcagMDagggtBWR
+tttSgccttttttaacgtcHctaVtVDtagatcaNMaVtcVacatHctDWNaataRgcgt
+aVHaggtaaaaSgtttMtattDgBtctgatSgtRagagYtctSaKWaataMgattRKtaa
+catttYcgtaacacattRWtBtcggtaaatMtaaacBatttctKagtcDtttgcBtKYYB
+aKttctVttgttaDtgattttcttccacttgSaaacggaaaNDaattcYNNaWcgaaYat
+tttMgcBtcatRtgtaaagatgaWtgaccaYBHgaatagataVVtHtttVgYBtMctaMt
+cctgaDcYttgtccaaaRNtacagcMctKaaaggatttacatgtttaaWSaYaKttBtag
+DacactagctMtttNaKtctttcNcSattNacttggaacaatDagtattRtgSHaataat
+gccVgacccgatactatccctgtRctttgagaSgatcatatcgDcagWaaHSgctYYWta
+tHttggttctttatVattatcgactaagtgtagcatVgtgHMtttgtttcgttaKattcM
+atttgtttWcaaStNatgtHcaaaDtaagBaKBtRgaBgDtSagtatMtaacYaatYtVc
+KatgtgcaacVaaaatactKcRgtaYtgtNgBBNcKtcttaccttKgaRaYcaNKtactt
+tgagSBtgtRagaNgcaaaNcacagtVtttHWatgttaNatBgtttaatNgVtctgaata
+tcaRtattcttttttttRaaKcRStctcggDgKagattaMaaaKtcaHacttaataataK
+taRgDtKVBttttcgtKaggHHcatgttagHggttNctcgtatKKagVagRaaaggaaBt
+NatttVKcRttaHctaHtcaaatgtaggHccaBataNaNaggttgcWaatctgatYcaaa
+HaatWtaVgaaBttagtaagaKKtaaaKtRHatMaDBtBctagcatWtatttgWttVaaa
+ScMNattRactttgtYtttaaaagtaagtMtaMaSttMBtatgaBtttaKtgaatgagYg
+tNNacMtcNRacMMHcttWtgtRtctttaacaacattattcYaMagBaacYttMatcttK
+cRMtgMNccattaRttNatHaHNaSaaHMacacaVaatacaKaSttHatattMtVatWga
+ttttttaYctttKttHgScWaacgHtttcaVaaMgaacagNatcgttaacaaaaagtaca
+HBNaattgttKtcttVttaaBtctgctacgBgcWtttcaggacacatMgacatcccagcg
+gMgaVKaBattgacttaatgacacacaaaaaatRKaaBctacgtRaDcgtagcVBaacDS
+BHaaaaSacatatacagacRNatcttNaaVtaaaataHattagtaaaaSWccgtatWatg
+gDttaactattgcccatcttHaSgYataBttBaactattBtcHtgatcaataSttaBtat
+KSHYttWggtcYtttBttaataccRgVatStaHaKagaatNtagRMNgtcttYaaSaact
+cagDSgagaaYtMttDtMRVgWKWtgMaKtKaDttttgactatacataatcNtatNaHat
+tVagacgYgatatatttttgtStWaaatctWaMgagaRttRatacgStgattcttaagaD
+taWccaaatRcagcagaaNKagtaaDggcgccBtYtagSBMtactaaataMataBSacRM
+gDgattMMgtcHtcaYDtRaDaacggttDaggcMtttatgttaNctaattaVacgaaMMt
+aatDccSgtattgaRtWWaccaccgagtactMcgVNgctDctaMScatagcgtcaactat
+acRacgHRttgctatttaatgaattataYKttgtaagWgtYttgcHgMtaMattWaWVta
+RgcttgYgttBHtYataSccStBtgtagMgtDtggcVaaSBaatagDttgBgtctttctc
+attttaNagtHKtaMWcYactVcgcgtatMVtttRacVagDaatcttgctBBcRDgcaac
+KttgatSKtYtagBMagaRtcgBattHcBWcaactgatttaatttWDccatttatcgagS
+KaWttataHactaHMttaatHtggaHtHagaatgtKtaaRactgtttMatacgatcaagD
+gatKaDctataMggtHDtggHacctttRtatcttYattttgacttgaaSaataaatYcgB
+aaaaccgNatVBttMacHaKaataagtatKgtcaagactcttaHttcggaattgttDtct
+aaccHttttWaaatgaaatataaaWattccYDtKtaaaacggtgaggWVtctattagtga
+ctattaagtMgtttaagcatttgSgaaatatccHaaggMaaaattttcWtatKctagDtY
+tMcctagagHcactttactatacaaacattaacttaHatcVMYattYgVgtMttaaRtga
+aataaDatcaHgtHHatKcDYaatcttMtNcgatYatgSaMaNtcttKcWataScKggta
+tcttacgcttWaaagNatgMgHtctttNtaacVtgttcMaaRatccggggactcMtttaY
+MtcWRgNctgNccKatcttgYDcMgattNYaRagatHaaHgKctcataRDttacatBatc
+cattgDWttatttaWgtcggagaaaaatacaatacSNtgggtttccttacSMaagBatta
+caMaNcactMttatgaRBacYcYtcaaaWtagctSaacttWgDMHgaggatgBVgcHaDt
+ggaactttggtcNatNgtaKaBcccaNtaagttBaacagtatacDYttcctNgWgcgSMc
+acatStctHatgRcNcgtacacaatRttMggaNKKggataaaSaYcMVcMgtaMaHtgat
+tYMatYcggtcttcctHtcDccgtgRatcattgcgccgatatMaaYaataaYSggatagc
+gcBtNtaaaScaKgttBgagVagttaKagagtatVaactaSacWactSaKatWccaKaaa
+atBKgaaKtDMattttgtaaatcRctMatcaaMagMttDgVatggMaaWgttcgaWatga
+aatttgRtYtattaWHKcRgctacatKttctaccaaHttRatctaYattaaWatVNccat
+NgagtcKttKataStRaatatattcctRWatDctVagttYDgSBaatYgttttgtVaatt
+taatagcagMatRaacttBctattgtMagagattaaactaMatVtHtaaatctRgaaaaa
+aaatttWacaacaYccYDSaattMatgaccKtaBKWBattgtcaagcHKaagttMMtaat
+ttcKcMagNaaKagattggMagaggtaatttYacatcWaaDgatMgKHacMacgcVaaca
+DtaDatatYggttBcgtatgWgaSatttgtagaHYRVacaRtctHaaRtatgaactaata
+tctSSBgggaaHMWtcaagatKgagtDaSatagttgattVRatNtctMtcSaagaSHaat
+aNataataRaaRgattctttaataaagWaRHcYgcatgtWRcttgaaggaMcaataBRaa
+ccagStaaacNtttcaatataYtaatatgHaDgcStcWttaacctaRgtYaRtataKtgM
+ttttatgactaaaatttacYatcccRWtttHRtattaaatgtttatatttgttYaatMca
+RcSVaaDatcgtaYMcatgtagacatgaaattgRtcaaYaaYtRBatKacttataccaNa
+aattVaBtctggacaagKaaYaaatatWtMtatcYaaVNtcgHaactBaagKcHgtctac
+aatWtaDtSgtaHcataHtactgataNctRgttMtDcDttatHtcgtacatcccaggStt
+aBgtcacacWtccNMcNatMVaVgtccDYStatMaccDatggYaRKaaagataRatttHK
+tSaaatDgataaacttaHgttgVBtcttVttHgDacgaKatgtatatNYataactctSat
+atatattgcHRRYttStggaactHgttttYtttaWtatMcttttctatctDtagVHYgMR
+BgtHttcctaatYRttKtaagatggaVRataKDctaMtKBNtMtHNtWtttYcVtattMc
+gRaacMcctNSctcatttaaagDcaHtYccSgatgcaatYaaaaDcttcgtaWtaattct
+cgttttScttggtaatctttYgtctaactKataHacctMctcttacHtKataacacagcN
+RatgKatttttSaaatRYcgDttaMRcgaaattactMtgcgtaagcgttatBtttttaat
+taagtNacatHgttcRgacKcBBtVgatKttcgaBaatactDRgtRtgaNacWtcacYtt
+aaKcgttctHaKttaNaMgWgWaggtctRgaKgWttSttBtDcNtgtttacaaatYcDRt
+gVtgcctattcNtctaaaDMNttttNtggctgagaVctDaacVtWccaagtaacacaNct
+gaScattccDHcVBatcgatgtMtaatBgHaatDctMYgagaatgYWKcctaatNaStHa
+aaKccgHgcgtYaaYtattgtStgtgcaaRtattaKatattagaWVtcaMtBagttatta
+gNaWHcVgcaattttDcMtgtaRHVYtHtctgtaaaaHVtMKacatcgNaatttMatatg
+ttgttactagWYtaRacgataKagYNKcattataNaRtgaacKaYgcaaYYacaNccHat
+MatDcNgtHttRaWttagaaDcaaaaaatagggtKDtStaDaRtaVtHWKNtgtattVct
+SVgRgataDaRaWataBgaagaaKtaataaYgDcaStaNgtaDaaggtattHaRaWMYaY
+aWtggttHYgagVtgtgcttttcaaDKcagVcgttagacNaaWtagtaataDttctggtt
+VcatcataaagtgKaaaNaMtaBBaattaatWaattgctHaVKaSgDaaVKaHtatatat
+HatcatSBagNgHtatcHYMHgttDgtaHtBttWatcgtttaRaattgStKgSKNWKatc
+agDtctcagatttctRtYtBatBgHHtKaWtgYBgacVVWaKtacKcDttKMaKaVcggt
+gttataagaataaHaatattagtataatMHgttYgaRttagtaRtcaaVatacggtcMcg
+agtaaRttacWgactKRYataaaagSattYaWgagatYagKagatgSaagKgttaatMgg
+tataatgttWYttatgagaaacctNVataatHcccKtDctcctaatactggctHggaSag
+gRtKHaWaattcgSatMatttagaggcYtctaMcgctcataSatatgRagacNaaDagga
+VBagaYttKtacNaKgtSYtagttggaWcatcWttaatctatgaVtcgtgtMtatcaYcg
+tRccaaYgDctgcMgtgtWgacWtgataacacgcgctBtgttaKtYDtatDcatcagKaV
+MctaatcttgVcaaRgcRMtDcgattaHttcaNatgaatMtactacVgtRgatggaWttt
+actaaKatgagSaaKggtaNtactVaYtaaKRagaacccacaMtaaMtKtatBcttgtaa
+WBtMctaataaVcDaaYtcRHBtcgttNtaaHatttBNgRStVDattBatVtaagttaYa
+tVattaagaBcacggtSgtVtatttaRattgatgtaHDKgcaatattKtggcctatgaWD
+KRYcggattgRctatNgatacaatMNttctgtcRBYRaaaHctNYattcHtaWcaattct
+BtMKtVgYataatMgYtcagcttMDataVtggRtKtgaatgccNcRttcaMtRgattaac
+attRcagcctHtWMtgtDRagaKaBtgDttYaaaaKatKgatctVaaYaacWcgcatagB
+VtaNtRtYRaggBaaBtgKgttacataagagcatgtRattccacttaccatRaaatgWgD
+aMHaYVgVtaSctatcgKaatatattaDgacccYagtgtaYNaaatKcagtBRgagtcca
+tgKgaaaccBgaagBtgSttWtacgatWHaYatcgatttRaaNRgcaNaKVacaNtDgat
+tgHVaatcDaagcgtatgcNttaDataatcSataaKcaataaHWataBtttatBtcaKtK
+tatagttaDgSaYctacaRatNtaWctSaatatttYaKaKtaccWtatcRagacttaYtt
+VcKgSDcgagaagatccHtaattctSttatggtKYgtMaHagVaBRatttctgtRgtcta
+tgggtaHKgtHacHtSYacgtacacHatacKaaBaVaccaDtatcSaataaHaagagaat
+ScagactataaRttagcaaVcaHataKgDacatWccccaagcaBgagWatctaYttgaaa
+tctVNcYtttWagHcgcgcDcVaaatgttKcHtNtcaatagtgtNRaactttttcaatgg
+WgBcgDtgVgtttctacMtaaataaaRggaaacWaHttaRtNtgctaaRRtVBctYtVta
+tDcattDtgaccYatagatYRKatNYKttNgcctagtaWtgaactaMVaacctgaStttc
+tgaKVtaaVaRKDttVtVctaDNtataaaDtccccaagtWtcgatcactDgYaBcatcct
+MtVtacDaaBtYtMaKNatNtcaNacgDatYcatcgcaRatWBgaacWttKttagYtaat
+tcggttgSWttttDWctttacYtatatWtcatDtMgtBttgRtVDggttaacYtacgtac
+atgaattgaaWcttMStaDgtatattgaDtcRBcattSgaaVBRgagccaaKtttcDgcg
+aSMtatgWattaKttWtgDBMaggBBttBaatWttRtgcNtHcgttttHtKtcWtagHSt
+aacagttgatatBtaWSaWggtaataaMttaKacDaatactcBttcaatatHttcBaaSa
+aatYggtaRtatNtHcaatcaHtagVtgtattataNggaMtcttHtNagctaaaggtaga
+YctMattNaMVNtcKtactBKcaHHcBttaSagaKacataYgctaKaYgttYcgacWVtt
+WtSagcaacatcccHaccKtcttaacgaKttcacKtNtacHtatatRtaaatacactaBt
+ttgaHaRttggttWtatYagcatYDatcggagagcWBataagRtacctataRKgtBgatg
+aDatataSttagBaHtaatNtaDWcWtgtaattacagKttcNtMagtattaNgtctcgtc
+ctcttBaHaKcKccgtRcaaYagSattaagtKataDatatatagtcDtaacaWHcaKttD
+gaaRcgtgYttgtcatatNtatttttatggccHtgDtYHtWgttatYaacaattcaWtat
+NgctcaaaSttRgctaatcaaatNatcgtttaBtNNVtgttataagcaaagattBacgtD
+atttNatttaaaDcBgtaSKgacgtagataatttcHMVNttgttBtDtgtaWKaaRMcKM
+tHtaVtagataWctccNNaSWtVaHatctcMgggDgtNHtDaDttatatVWttgttattt
+aacctttcacaaggaSaDcggttttttatatVtctgVtaacaStDVaKactaMtttaSNa
+gtgaaattaNacttSKctattcctctaSagKcaVttaagNaVcttaVaaRNaHaaHttat
+gtHttgtgatMccaggtaDcgaccgtWgtWMtttaHcRtattgScctatttKtaaccaag
+tYagaHgtWcHaatgccKNRtttagtMYSgaDatctgtgaWDtccMNcgHgcaaacNDaa
+aRaStDWtcaaaaHKtaNBctagBtgtattaactaattttVctagaatggcWSatMaccc
+ttHttaSgSgtgMRcatRVKtatctgaaaccDNatYgaaVHNgatMgHRtacttaaaRta
+tStRtDtatDttYatattHggaBcttHgcgattgaKcKtttcRataMtcgaVttWacatN
+catacctRataDDatVaWNcggttgaHtgtMacVtttaBHtgagVttMaataattatgtt
+cttagtttgtgcDtSatttgBtcaacHattaaBagVWcgcaSYttMgcttacYKtVtatc
+aYaKctgBatgcgggcYcaaaaacgNtctagKBtattatctttKtaVttatagtaYtRag
+NtaYataaVtgaatatcHgcaaRataHtacacatgtaNtgtcgYatWMatttgaactacR
+ctaWtWtatacaatctBatatgYtaagtatgtgtatSttactVatcttYtaBcKgRaSgg
+RaaaaatgcagtaaaWgtaRgcgataatcBaataccgtatttttccatcNHtatWYgatH
+SaaaDHttgctgtccHtggggcctaataatttttctatattYWtcattBtgBRcVttaVM
+RSgctaatMagtYtttaaaaatBRtcBttcaaVtaacagctccSaaSttKNtHtKYcagc
+agaaaccccRtttttaaDcDtaStatccaagcgctHtatcttaDRYgatDHtWcaaaBcW
+gKWHttHataagHacgMNKttMKHccaYcatMVaacgttaKgYcaVaaBtacgcaacttt
+MctaaHaatgtBatgagaSatgtatgSRgHgWaVWgataaatatttccKagVgataattW
+aHNcYggaaatgctHtKtaDtctaaagtMaatVDVactWtSaaWaaMtaHtaSKtcBRaN
+cttStggtBttacNagcatagRgtKtgcgaacaacBcgKaatgataagatgaaaattgta
+ctgcgggtccHHWHaaNacaBttNKtKtcaaBatatgctaHNgtKcDWgtttatNgVDHg
+accaacWctKaaggHttgaRgYaatHcaBacaatgagcaaattactgtaVaaYaDtagat
+tgagNKggtggtgKtWKaatacagDRtatRaMRtgattDggtcaaYRtatttNtagaDtc
+acaaSDctDtataatcgtactaHttatacaatYaacaaHttHatHtgcgatRRttNgcat
+SVtacWWgaaggagtatVMaVaaattScDDKNcaYBYaDatHgtctatBagcaacaagaa
+tgagaaRcataaKNaRtBDatcaaacgcattttttaaBtcSgtacaRggatgtMNaattg
+gatatWtgagtattaaaVctgcaYMtatgatttttYgaHtgtcttaagWBttHttgtctt
+attDtcgtatWtataataSgctaHagcDVcNtaatcaagtaBDaWaDgtttagYctaNcc
+DtaKtaHcttaataacccaRKtacaVaatNgcWRaMgaattatgaBaaagattVYaHMDc
+aDHtcRcgYtcttaaaWaaaVKgatacRtttRRKYgaatacaWVacVcRtatMacaBtac
+tggMataaattttHggNagSctacHgtBagcgtcgtgattNtttgatSaaggMttctttc
+ttNtYNagBtaaacaaatttMgaccttacataattgYtcgacBtVMctgStgMDtagtaR
+ctHtatgttcatatVRNWataDKatWcgaaaaagttaaaagcacgHNacgtaatctttMR
+tgacttttDacctataaacgaaatatgattagaactccSYtaBctttaataacWgaaaYa
+tagatgWttcatKtNgatttttcaagHtaYgaaRaDaagtaggagcttatVtagtctttc
+attaaaatcgKtattaRttacagVaDatgcatVgattgggtctttHVtagKaaRBtaHta
+aggccccaaaaKatggtttaMWgtBtaaacttcactttKHtcgatctccctaYaBacMgt
+cttBaBaNgcgaaacaatctagtHccHtKttcRtRVttccVctttcatacYagMVtMcag
+aMaaacaataBctgYtaatRaaagattaaccatVRatHtaRagcgcaBcgDttStttttc
+VtttaDtKgcaaWaaaaatSccMcVatgtKgtaKgcgatatgtagtSaaaDttatacaaa
+catYaRRcVRHctKtcgacKttaaVctaDaatgttMggRcWaacttttHaDaKaDaBctg
+taggcgtttaHBccatccattcNHtDaYtaataMttacggctNVaacDattgatatttta
+cVttSaattacaaRtataNDgacVtgaacataVRttttaDtcaaacataYDBtttaatBa
+DtttYDaDaMccMttNBttatatgagaaMgaNtattHccNataattcaHagtgaaggDga
+tgtatatatgYatgaStcataaBStWacgtcccataRMaaDattggttaaattcMKtctM
+acaBSactcggaatDDgatDgcWctaacaccgggaVcacWKVacggtaNatatacctMta
+tgatagtgcaKagggVaDtgtaacttggagtcKatatcgMcttRaMagcattaBRaStct
+YSggaHYtacaactMBaagDcaBDRaaacMYacaHaattagcattaaaHgcgctaaggSc
+cKtgaaKtNaBtatDDcKBSaVtgatVYaagVtctSgMctacgttaacWaaattctSgtD
+actaaStaaattgcagBBRVctaatatacctNttMcRggctttMttagacRaHcaBaacV
+KgaataHttttMgYgattcYaNRgttMgcVaaacaVVcDHaatttgKtMYgtatBtVVct
+WgVtatHtacaaHttcacgatagcagtaaNattBatatatttcVgaDagcggttMaagtc
+ScHagaaatgcYNggcgtttttMtStggtRatctacttaaatVVtBacttHNttttaRca
+aatcacagHgagagtMgatcSWaNRacagDtatactaaDKaSRtgattctccatSaaRtt
+aaYctacacNtaRtaactggatgaccYtacactttaattaattgattYgttcagDtNKtt
+agDttaaaaaaaBtttaaNaYWKMBaaaacVcBMtatWtgBatatgaacVtattMtYatM
+NYDKNcKgDttDaVtaaaatgggatttctgtaaatWtctcWgtVVagtcgRgacttcccc
+taDcacagcRcagagtgtWSatgtacatgttaaSttgtaaHcgatgggMagtgaacttat
+RtttaVcaccaWaMgtactaatSSaHtcMgaaYtatcgaaggYgggcgtgaNDtgttMNg
+aNDMtaattcgVttttaacatgVatgtWVMatatcaKgaaattcaBcctccWcttgaaWH
+tWgHtcgNWgaRgctcBgSgaattgcaaHtgattgtgNagtDttHHgBttaaWcaaWagc
+aSaHHtaaaVctRaaMagtaDaatHtDMtcVaWMtagSagcttHSattaacaaagtRacM
+tRtctgttagcMtcaBatVKtKtKacgagaSNatSactgtatatcBctgagVtYactgta
+aattaaaggcYgDHgtaacatSRDatMMccHatKgttaacgactKtgKagtcttcaaHRV
+tccttKgtSataatttacaactggatDNgaacttcaRtVaagDcaWatcBctctHYatHa
+DaaatttagYatSatccaWtttagaaatVaacBatHcatcgtacaatatcgcNYRcaata
+YaRaYtgattVttgaatgaVaactcRcaNStgtgtattMtgaggtNttBaDRcgaaaagc
+tNgBcWaWgtSaDcVtgVaatMKBtttcgtttctaaHctaaagYactgMtatBDtcStga
+ccgtSDattYaataHctgggaYYttcggttaWaatctggtRagWMaDagtaacBccacta
+cgHWMKaatgatWatcctgHcaBaSctVtcMtgtDttacctaVgatYcWaDRaaaaRtag
+atcgaMagtggaRaWctctgMgcWttaagKBRtaaDaaWtctgtaagYMttactaHtaat
+cttcataacggcacBtSgcgttNHtgtHccatgttttaaagtatcgaKtMttVcataYBB
+aKtaMVaVgtattNDSataHcagtWMtaggtaSaaKgttgBtVtttgttatcatKcgHac
+acRtctHatNVagSBgatgHtgaRaSgttRcctaacaaattDNttgacctaaYtBgaaaa
+tagttattactcttttgatgtNNtVtgtatMgtcttRttcatttgatgacacttcHSaaa
+ccaWWDtWagtaRDDVNacVaRatgttBccttaatHtgtaaacStcVNtcacaSRttcYa
+gacagaMMttttgMcNttBcgWBtactgVtaRttctccaaYHBtaaagaBattaYacgat
+ttacatctgtaaMKaRYtttttactaaVatWgctBtttDVttctggcDaHaggDaagtcg
+aWcaagtagtWttHtgKtVataStccaMcWcaagataagatcactctHatgtcYgaKcat
+cagatactaagNSStHcctRRNtattgtccttagttagMVgtatagactaactctVcaat
+MctgtttgtgttgccttatWgtaBVtttctggMcaaKgDWtcgtaaYStgSactatttHg
+atctgKagtagBtVacRaagRtMctatgggcaaaKaaaatacttcHctaRtgtDcttDat
+taggaaatttcYHaRaaBttaatggcacKtgctHVcaDcaaaVDaaaVcgMttgtNagcg
+taDWgtcgttaatDgKgagcSatatcSHtagtagttggtgtHaWtaHKtatagctgtVga
+ttaBVaatgaataagtaatVatSttaHctttKtttgtagttaccttaatcgtagtcctgB
+cgactatttVcMacHaaaggaatgDatggKtaHtgStatattaaSagctWcctccRtata
+BaDYcgttgcNaagaggatRaaaYtaWgNtSMcaatttactaacatttaaWttHtatBat
+tgtcgacaatNgattgcNgtMaaaKaBDattHacttggtRtttaYaacgVactBtaBaKt
+gBttatgVttgtVttcaatcWcNctDBaaBgaDHacBttattNtgtDtatttVSaaacag
+gatgcRatSgtaSaNtgBatagttcHBgcBBaaattaHgtDattatDaKaatBaaYaaMa
+ataaataKtttYtagtBgMatNcatgtttgaNagtgttgtgKaNaSagtttgaSMaYBca
+aaacDStagttVacaaaaactaaWttBaagtctgtgcgtMgtaattctcctacctcaNtt
+taaccaaaaVtBcacataacaccccBcWMtatVtggaatgaWtcaaWaaaaaaaaWtDta
+atatRcctDWtcctaccMtVVatKttaWaaKaaatataaagScHBagaggBaSMtaWaVt
+atattactSaaaKNaactatNatccttgaYctattcaaaVgatttYHcRagattttaSat
+aggttattcVtaaagaKgtattattKtRttNcggcRgtgtgtWYtaacHgKatKgatYta
+cYagDtWcHBDctctgRaYKaYagcactKcacSaRtBttttBHKcMtNtcBatttatttt
+tgSatVgaaagaWtcDtagDatatgMacaacRgatatatgtttgtKtNRaatatNatgYc
+aHtgHataacKtgagtagtaacYttaNccaaatHcacaacaVDtagtaYtccagcattNt
+acKtBtactaaagaBatVtKaaHBctgStgtBgtatgaSNtgDataaccctgtagcaBgt
+gatcttaDataStgaMaccaSBBgWagtacKcgattgaDgNNaaaacacagtSatBacKD
+gcgtataBKcatacactaSaatYtYcDaactHttcatRtttaatcaattataRtttgtaa
+gMcgNttcatcBtYBagtNWNMtSHcattcRctttttRWgaKacKttgggagBcgttcgc
+MaWHtaatactgtctctatttataVgtttaBScttttaBMaNaatMacactYtBMggtHa
+cMagtaRtctgcatttaHtcaaaatttgagKtgNtactBacaHtcgtatttctMaSRagc
+agttaatgtNtaaattgagagWcKtaNttagVtacgatttgaatttcgRtgtWcVatcgt
+taaDVctgtttBWgaccagaaagtcSgtVtatagaBccttttcctaaattgHtatcggRa
+ttttcaaggcYSKaagWaWtRactaaaacccBatMtttBaatYtaagaactSttcgaaSc
+aatagtattgaccaagtgttttctaacatgtttNVaatcaaagagaaaNattaaRtttta
+VaaaccgcaggNMtatattVctcaagaggaacgBgtttaacaagttcKcYaatatactaa
+ccBaaaSggttcNtattctagttRtBacgScVctcaatttaatYtaaaaaaatgSaatga
+tagaMBRatgRcMcgttgaWHtcaVYgaatYtaatctttYttatRaWtctgBtDcgatNa
+tcKaBaDgatgtaNatWKctccgatattaacattNaaacDatgBgttctgtDtaaaMggt
+gaBaSHataacgccSctaBtttaRBtcNHcDatcDcctagagtcRtaBgWttDRVHagat
+tYatgtatcWtaHtttYcattWtaaagtctNgtStggRNcgcggagSSaaagaaaatYcH
+DtcgctttaatgYcKBVSgtattRaYBaDaaatBgtatgaHtaaRaRgcaSWNtagatHa
+acttNctBtcaccatctMcatattccaSatttgcgaDagDgtatYtaaaVDtaagtttWV
+aagtagYatRttaagDcNgacKBcScagHtattatcDaDactaaaaaYgHttBcgaDttg
+gataaaKSRcBMaBcgaBSttcWtgNBatRaccgattcatttataacggHVtaattcaca
+agagVttaaRaatVVRKcgWtVgacctgDgYaaHaWtctttcacMagggatVgactagMa
+aataKaaNWagKatagNaaWtaaaatttgaattttatttgctaaVgaHatBatcaaBWcB
+gttcMatcgBaaNgttcgSNaggSaRtttgHtRtattaNttcDcatSaVttttcgaaaaa
+ttgHatctaRaggSaNatMDaaatDcacgattttagaHgHaWtYgattaatHNSttatMS
+gggNtcKtYatRggtttgtMWVtttaYtagcagBagHaYagttatatggtBacYcattaR
+SataBatMtttaaatctHcaaaSaaaagttNSaaWcWRccRtKaagtBWtcaaattSttM
+tattggaaaccttaacgttBtWatttatatWcDaatagattcctScacctaagggRaaYt
+aNaatgVtBcttaaBaacaMVaaattatStYgRcctgtactatcMcVKatttcgSgatRH
+MaaaHtagtaaHtVgcaaataatatcgKKtgccaatBNgaaWcVttgagttaKatagttc
+aggKDatDtattgaKaVcaKtaataDataataHSaHcattagttaatRVYcNaHtaRcaa
+ggtNHcgtcaaccaBaaagYtHWaaaRcKgaYaaDttgcWYtataRgaatatgtYtgcKt
+aNttWacatYHctRaDtYtattcBttttatcSataYaYgttWaRagcacHMgtttHtYtt
+YaatcggtatStttcgtRSattaaDaKMaatatactaNBaWgctacacYtgaYVgtgHta
+aaRaaRgHtagtWattataaaSDaaWtgMattatcgaaaagtaYRSaWtSgNtBgagcRY
+aMDtactaacttaWgtatctagacaagNtattHggataatYttYatcataDcgHgttBtt
+ctttVttgccgaaWtaaaacgKgtatctaaaaaNtccDtaDatBMaMggaatNKtatBaa
+atVtccRaHtaSacataHattgtttKVYattcataVaattWtcgtgMttcttKtgtctaa
+cVtatctatatBRataactcgKatStatattcatHHRttKtccaacgtgggtgRgtgaMt
+attattggctatcgtgacMtRcBDtcttgtactaatRHttttaagatcgVMDStattatY
+BtttDttgtBtNttgRcMtYtgBacHaWaBaatDKctaagtgaaactaatgRaaKgatcc
+aagNaaaatattaggWNtaagtatacttttKcgtcggSYtcttgRctataYcttatataa
+agtatattaatttataVaacacaDHatctatttttKYVatHRactttaBHccaWagtact
+BtcacgaVgcgttRtttttttSVgtSagtBaaattctgaHgactcttgMcattttagVta
+agaattHctHtcaDaaNtaacRggWatagttcgtSttgaDatcNgNagctagDgatcNtt
+KgttgtaDtctttRaaYStRatDtgMggactSttaDtagSaVtBDttgtDgccatcacaM
+attaaaMtNacaVcgSWcVaaDatcaHaatgaattaMtatccVtctBtaattgtWattat
+BRcWcaatgNNtactWYtDaKttaaatcactcagtRaaRgatggtKgcgccaaHgaggat
+StattYcaNMtcaBttacttatgagDaNtaMgaaWtgtttcttctaHtMNgttatctaWW
+atMtBtaaatagDVatgtBYtatcggcttaagacMRtaHScgatatYgRDtcattatSDa
+HggaaataNgaWSRRaaaBaatagBattaDctttgHWNttacaataaaaaaatacggttt
+gHgVtaHtWMttNtBtctagtMcgKMgHgYtataHaNagWtcaacYattaataYRgtaWK
+gaBctataaccgatttaHaNBRaRaMtccggtNgacMtctcatttgcaattcWgMactta
+caaDaaNtactWatVtttagccttMaatcagVaagtctVaaDaBtattaattaYtNaYtg
+gattaKtaKctYaMtattYgatattataatKtVgDcttatatNBtcgttgtStttttMag
+aggttaHYSttcKgtcKtDNtataagttataagSgttatDtRttattgttttSNggRtca
+aKMNatgaatattgtBWtaMacctgggYgaSgaagYataagattacgagaatBtggtRcV
+HtgYggaDgaYaKagWagctatagacgaaHgtWaNgacttHRatVaWacKYtgRVNgVcS
+gRWctacatcKSactctgWYtBggtataagcttNRttVtgRcaWaaatDMatYattaact
+ttcgaagRatSctgccttgcRKaccHtttSNVagtagHagBagttagaccaRtataBcca
+taatSHatRtcHagacBWatagcaMtacaRtgtgaaBatctKRtScttccaNaatcNgta
+atatWtcaMgactctBtWtaaNactHaaaaRctcgcatggctMcaaNtcagaaaaacaca
+gtggggWttRttagtaagaVctVMtcgaatcttcMaaaHcaHBttcgattatgtcaDagc
+YRtBtYcgacMgtDcagcgaNgttaataatagcagKYYtcgtaBtYctMaRtaRtDagaa
+aacacatgYaBttgattattcgaaNttBctSataaMataWRgaHtttccgtDgaYtatgg
+tDgHKgMtatttVtMtVagttaRatMattRagataaccctKctMtSttgaHagtcStcta
+tttccSagatgttccacgaggYNttHRacgattcDatatDcataaaatBBttatcgaHtN
+HaaatatDNaggctgaNcaaggagttBttMgRagVatBcRtaWgatgBtSgaKtcgHttt
+gaatcaaDaHttcSBgHcagtVaaSttDcagccgttNBtgttHagYtattctttRWaaVt
+SttcatatKaaRaaaNacaVtVctMtSDtDtRHRcgtaatgctcttaaatSacacaatcg
+HattcaWcttaaaatHaaatcNctWttaNMcMtaKctVtcctaagYgatgatcYaaaRac
+tctaRDaYagtaacgtDgaggaaatctcaaacatcaScttcKttNtaccatNtaNataca
+tttHaaDHgcaDatMWaaBttcRggctMaagctVYcacgatcaDttatYtaatcKatWat
+caatVYtNagatttgattgaYttttYgacttVtcKaRagaaaHVgDtaMatKYagagttN
+atWttaccNtYtcDWgSatgaRgtMatgKtcgacaagWtacttaagtcgKtgatccttNc
+ttatagMatHVggtagcgHctatagccctYttggtaattKNaacgaaYatatVctaataM
+aaaYtgVtcKaYtaataacagaatHcacVagatYWHttagaaSMaatWtYtgtaaagNaa
+acaVgaWtcacNWgataNttcaSagctMDaRttgNactaccgataMaaatgtttattDtc
+aagacgctDHYYatggttcaagccNctccttcMctttagacBtaaWtaWVHggaaaaNat
+ttaDtDtgctaaHHtMtatNtMtagtcatttgcaaaRatacagRHtatDNtgtDgaatVg
+tVNtcaaatYBMaaaagcaKgtgatgatMgWWMaHttttMgMagatDtataaattaacca
+actMtacataaattgRataatacgBtKtaataattRgtatDagDtcRDacctatRcagag
+cSHatNtcaScNtttggacNtaaggaccgtgKNttgttNcttgaaRgYgRtNtcagttBc
+ttttcHtKtgcttYaaNgYagtaaatgaatggWaMattBHtatctatSgtcYtgcHtaat
+tHgaaMtHcagaaSatggtatgccaHBtYtcNattWtgtNgctttaggtttgtWatNtgH
+tgcDttactttttttgcNtactKtWRaVcttcatagtgSNKaNccgaataaBttataata
+YtSagctttaaatSttggctaaKSaatRccgWHgagDttaaatcatgagMtcgagtVtaD
+ggaBtatttgDacataaacgtagYRagBWtgDStKDgatgaagttcattatttaKWcata
+aatWRgatataRgttRacaaNKttNtKagaaYaStaactScattattaacgatttaaatg
+DtaattagatHgaYataaactatggggatVHtgccgtNgatNYcaStRtagaccacWcaM
+tatRagHgVactYtWHtcttcatgatWgagaKggagtatgaWtDtVtNaNtcgYYgtaaa
+ctttaDtBactagtaDctatagtaatatttatatataacgHaaaRagKattSagttYtSt
+>THREE Homo sapiens frequency
+agagagacgatgaaaattaatcgtcaatacgctggcgaacactgagggggacccaatgct
+cttctcggtctaaaaaggaatgtgtcagaaattggtcagttcaaaagtagaccggatctt
+tgcggagaacaattcacggaacgtagcgttgggaaatatcctttctaccacacatcggat
+tttcgccctctcccattatttattgtgttctcacatagaattattgtttagacatccctc
+gttgtatggagagttgcccgagcgtaaaggcataatccatataccgccgggtgagtgacc
+tgaaattgtttttagttgggatttcgctatggattagcttacacgaagagattctaatgg
+tactataggataattataatgctgcgtggcgcagtacaccgttacaaacgtcgttcgcat
+atgtggctaacacggtgaaaatacctacatcgtatttgcaatttcggtcgtttcatagag
+cgcattgaattactcaaaaattatatatgttgattatttgattagactgcgtggaaagaa
+ggggtactcaagccatttgtaaaagctgcatctcgcttaagtttgagagcttacattagt
+ctatttcagtcttctaggaaatgtctgtgtgagtggttgtcgtccataggtcactggcat
+atgcgattcatgacatgctaaactaagaaagtagattactattaccggcatgcctaatgc
+gattgcactgctatgaaggtgcggacgtcgcgcccatgtagccctgataataccaatact
+tacatttggtcagcaattctgacattatacctagcacccataaatttactcagacttgag
+gacaggctcttggagtcgatcttctgtttgtatgcatgtgatcatatagatgaataagcg
+atgcgactagttagggcatagtatagatctgtgtatacagttcagctgaacgtccgcgag
+tggaagtacagctgagatctatcctaaaatgcaaccatatcgttcacacatgatatgaac
+ccagggggaaacattgagttcagttaaattggcagcgaatcccccaagaagaaggcggag
+tgacgttgaacgggcttatggtttttcagtacttcctccgtataagttgagcgaaatgta
+aacagaataatcgttgtgttaacaacattaaaatcgcggaatatgatgagaatacacagt
+gtgagcatttcacttgtaaaatatctttggtagaacttactttgctttaaatatgttaaa
+ccgatctaataatctacaaaacggtagattttgcctagcacattgcgtccttctctattc
+agatagaggcaatactcagaaggttttatccaaagcactgtgttgactaacctaagtttt
+agtctaataatcatgattgattataggtgccgtggactacatgactcgtccacaaataat
+acttagcagatcagcaattggccaagcacccgacttttatttaatggttgtgcaatagtc
+cagattcgtattcgggactctttcaaataatagtttcctggcatctaagtaagaaaagct
+cataaggaagcgatattatgacacgctcttccgccgctgttttgaaacttgagtattgct
+cgtccgaaattgagggtcacttcaaaatttactgagaagacgaagatcgactaaagttaa
+aatgctagtccacagttggtcaagttgaattcatccacgagttatatagctattttaatt
+tatagtcgagtgtacaaaaaacatccacaataagatttatcttagaataacaacccccgt
+atcatcgaaatcctccgttatggcctgactcctcgagcttatagcatttgtgctggcgct
+cttgccaggaacttgctcgcgaggtggtgacgagtgagatgatcagtttcattatgatga
+tacgattttatcgcgactagttaatcatcatagcaagtaaaatttgaattatgtcattat
+catgctccattaacaggttatttaattgatactgacgaaattttttcacaatgggttttc
+tagaatttaatatcagtaattgaagccttcataggggtcctactagtatcctacacgacg
+caggtccgcagtatcctggagggacgtgttactgattaaaagggtcaaaggaatgaaggc
+tcacaatgttacctgcttcaccatagtgagccgatgagttttacattagtactaaatccc
+aaatcatactttacgatgaggcttgctagcgctaaagagaatacatacaccaccacatag
+aattgttagcgatgatatcaaatagactcctggaagtgtcagggggaaactgttcaatat
+ttcgtccacaggactgaccaggcatggaaaagactgacgttggaaactataccatctcac
+gcccgacgcttcactaattgatgatccaaaaaatatagcccggattcctgattagcaaag
+ggttcacagagaaagatattatcgacgtatatcccaaaaaacagacgtaatgtgcatctt
+cgaatcgggatgaatacttgtatcataaaaatgtgacctctagtatacaggttaatgtta
+gtgatacacaatactcgtgggccatgggttctcaaataaaatgtaatattgcgtcgatca
+ctcacccacgtatttggtctaattatgttttatttagtgacaatccaatagataaccggt
+cctattaagggctatatttttagcgaccacgcgtttaaacaaaggattgtatgtagatgg
+taccagtttaattgccagtgggcaatcctaagcaaaatgagattctatcctaaagtttgg
+gcttgatataagatttcggatgtatgggttttataatcgttggagagctcaatcatgagc
+taatacatggatttcgctacctcaccgagagaccttgcatgaagaattctaaccaaaagt
+ttaataggccggattggattgagttaattaagaccttgttcagtcatagtaaaaaccctt
+aaattttaccgattgacaaagtgagcagtcgcaataccctatgcgaaacgcctcgatagt
+gactaggtatacaaggtttttgagttcctttgaaatagttaactaatttaaaattaatta
+acgacatggaaatcacagaacctaatgctttgtaggagttatttatgctgtttactgcct
+ctacaaccctaataaagcagtcctaagaatgaaacgcatcttttagttcagaaagtggta
+tccagggtggtcaatttaataaattcaacatcgggtctcaggatattcggtcatataatt
+tattaagggctcttcgagtcttactctgagtgaaattggaaacagtcatccttttcgttg
+tgaggcatcttacaccgctatcgatatacaatgcattccaccgcggtgtcccgtacacaa
+ggaaacttgttaccttggggatataagaaaactcacacgtctcattattaaactgagtac
+aatttttgcacgagaaagtaatgcaatacaatatgatgaaagccagctaatgaaaaggga
+tggaacgcacctcggatctgttgcactggattaaaatccgattatttttaaaaatattca
+gtgctagagcatatcaggtctacttttttatctggtatgtaaagcccacggagcgatagt
+gagatccttacgactcaacgaaaagttataacataactcccgttagccaaagcccaatcc
+cgattactgccctaccctaacgtctgccatctaaatatcgaacttgttatgatcaatgtg
+actacctcccaccctttccccttcatttgttccactggggataagctagcgttttcagaa
+tcaatgcaataagaatagccaattgtctcacttcatcagagctcttggcaattccaggcg
+ctacgtggttctggaatatattcatttttcaaatagtaatacgtttagtgttgctattgt
+ctacacgtttggatattacgttatgtgagcggacatcaatagttgtctaactctttagta
+agccagagatagcactcttagcgaatggataccatcttccataagtttagttaatagtcc
+gaaacaactgcttcgagcatatttgaacctccttgtaggcaaatagcctcttcaaagcaa
+tcttactaatagatagagtttgttttaagggactactagaaatgggacaatcttaatagt
+atgacctaaactgacatttaaagatatatccaggtggcaagcataaagatcattgcgcca
+cctccaccgtgggattacttatcagtcgatatcctatatgctaagtttgcgacggcagaa
+tacaaactaagctgagttgatgctaaccttacctatgataccccattggaccggttaaca
+gccctacttattccaaataaaagaacttttatgctgtagaagctattatagtgatgcctg
+gtaacttcagtatattaaaatgacacacatacgccatatagagctcctggaactttgaat
+aatgagcgaacttcgaagttgaagagcaagaaaccatatgtcacggttgcctaaagcccg
+gtaaccagacatgtgctatcattgatcattatcgaggttttcataaccttgacccattat
+cggctgtgcgcggacaagtacttaaatcactagtttcttcacctgcttatcggtaagaaa
+taaggttggcaaagaatcgcataagacggacgtagagccgcagcgttgtgcgagtccagg
+tgcatgcgcagcaataggattttaaattttgttccatttttaatttagccgtaaggatgt
+ccgtaaatgattgaaaattggattcaatctttgggcctatgctactggaacctgatcgac
+aaaatttcaaacatacgttaactccgaaagaccgtatttttgcggctagaatagtcagtc
+gcttggagccatataccttaccacttaaacgacgtgctcctgtagttgaaatataaacag
+aacacaaagactaccgatcatatcaactgaagatctttgtaactttgaggcgaagcaccc
+tcttcgagacaactaagagtaaagtaccgggcgccgcaaggagtcgattgggaccctaaa
+tcttgacgaattgctaagaggctcagagctaccactgtaatttctctagagcccataata
+aatgaacgatacatccgtaggtagcacctaagggattataatggaagccaaatgcagtta
+ataatattatatactggcgtacacgattcgacggatctctcacatagtgattcacgaccc
+ccccctttgattgacacagcgtcagcattttgcaagaacgatcttctgcatagggtgcgc
+caccgtaaggatgacgtcgaagctacaactgggtataatttaccatgcttccctgatgct
+gagtgcaatacactaagaatgagtttttaccccatatcaccagtatttgttctgttattg
+cgaagaaatggctatgctgagttggcgactaaagtcacccatcctttttattaggtaacc
+ccctcccttaaactaactgatttgctggagctgccctgcatacatatactttatcattta
+tggacgtccgtgacgcttattatccaccatagtcgatatgctacacggattcattaatgg
+atcgtaggagtttaagttatatttactaagatcggtctcggctactatcccgccttaccc
+ggcgctatttacggccatttttaatatattgacggtaattattcctatggtttcgaccgc
+acgtccttggacaagaaagaatggcaaaaaaaatgtaaaagaaaaaaaatattgagtccc
+taccatcatataaaaaatatgtgatgagtaacttgacgaaatgttagtggttattaaaga
+ctatctattacaccttttgttttctgtcgtagtatattaaagtctagaagccttacagga
+aaatcagggttatacagccgatactccgcagcatgaatcatcgaggaggtgtcctaccat
+cgcgccttgtaatcttgtctgtgtatactgtatttagaccttttatacaaagtaaatatc
+tcggctttatgtgattgggaggggcctactcaaacatgatgacttgacctaataatcact
+gtgcgggcgtcttatgactagctattccttgaaatccaccaccaaatggttaatatgtaa
+aaactttgacgatgaaacaaggtgaatgtgtagttactttgtgtaattagctgcgtcgag
+cattgcttgtaaaaccgtcaatcgcacacgttacttccataaaatttctacgaatacacc
+cttcttaaaaaaaacgtaggaattcacgagtttaacaaacgataactgtataaagtggaa
+gtccgaagaaagcagatgcccgaactactcgaagatgtttcgttttcttaaccatagggg
+cttcttaatggcccactacgcacattttgttcaagcccgagagggacatccccattacgg
+gagtattactaaaactgttccgtaatacgttcagcaagggatgaaaaaggccactgctca
+agttattgacgtgggagtattacatcggaagcctgaatcccacactatgatggtctgtac
+aggcctagggactgcgtctagacggtattaccggcttctaatcatacgatcgtgagtctt
+aacgggaagtaaggctcacacctaccccaaaccatttatctatgtaagtataaaattgtg
+cgtaagtgttcaaagtggacaataaagacgtggcaaaaacccccgcacataagccgcttt
+agatttcacaaataccaatgcggttaaaaacatccttgagtcgtacatacaccatactcg
+cgttaaacggatataacagaagataataaatccggatgtggagtcggtgtaactatagaa
+agccaagtgaaataatgcttaccagtcatttagctatacggctttcatttcatgtcaaga
+gggtggagtttgacctgtacagttgatatatcaccgatacttagaactcacctaaagcta
+aaattgctcgcagcgtgtaatccgcatattacaaacaatagatgggattcattatacata
+agacacgatgatctgctttttcaggttgcgagatgttgcctatcgtcaatcgagtcctgc
+cttacaccacttaaacaaaagtattgacagggaacctattttcgaggtattatatagtcc
+agcttgaatatcaatttgacagttaacctagtgaaaatcagtaagaggaaatacgccaca
+ttctccagtgaaattctacgggttatcgtctagtccaactatcaattataactcacgaga
+tataagtaaattctcgtacttggcctgatttttattatactttggatccttagtaaacag
+gaagggagaaaccttcaacgaaaaacactggattttgttttactctcaaagctcttatat
+gacggaaataccctgtcaagtcttaactttattactagactaatgaaatgggcttggggt
+ggccagaatcatagtacaatttagcggatacactattcggactttcctatcggctgtctg
+gttggataagtatggggactaataggctagacatacctatacttaaactatacaggcgtc
+atctatctctgcaactttggagttccctgatgttctcccgccctttgggttcacatcttc
+tataccgacacccctaataacgattagtttgtgggttagagtaaattaatacggttaata
+ttaatgtatcgttgaaaagctggtgtcgccaataaggtaaccggctaggcagagtatatg
+tcacgaagtataactaccctaatgataagctgtaggaataaaattaatgctgtctctaag
+cgaagagatatttccgactctgttttaatgacgaatctcattacttctgacttgcaaatg
+ttcaatatggcacggtttcacggcacctttgtgacgcatataatgaacttagaagattat
+aacgacggaactttatatgataatccgttacgattaaagaatctgttaaatatcataatg
+gcattcagttctagaccgtgcatcatggtaaacttactttctctgcatggcgacatacat
+ttcgctattcaaattcgcgtgtggttacacccactcgcacctttggaatattaagagaag
+atgatcagaaaatccattcgctcaatttttctgacgtacgtctaatttatcctaggagac
+aaatcgttttatgtctctcacatttttgaagaaaggttcgagagacaatactcaggtcct
+gaactgctagaagatactcggtggagcgtggcaacaatgaaaaactcgtgacataaatga
+atgatacttttccaagttcagttaagtgaatatgtttaacatacccggcttttcgatctt
+aagctgacgctggacgtgcgagtaatgtcagtctcttacatacactagtgactccaagtt
+tcgtcaaaaacgccccctcccttctcgagcccactcacgctatgtattgacgcgaacttg
+ttcgggatcagacttttcaggagttcggtcgcgtgtccctatgtgctaatatataagtta
+gatcgcattagatgctaatctgaatacttatagacgaccttcaacgagaacgggtaccac
+cttgaggctagagttaggtgtgaaacgacaggtagggacatataaaatttgagtgcggct
+ttagttaagggtttaattacctactcaaacatcacgctcgcgcccttcgtacgtaatcga
+ccatctagaggctaaggggactgtactaggtagtgattaatgatatcctagacgcacgtg
+ccttagatcttcagactctgatggtccgcgatcaccgtaattgtagtcctccaactcgat
+cactttgttggcgtcaaagaaattacgatatctaaatacttataatacaataaccaagga
+tgagaatgactcatcgcgttggagttatattgcttgaagttctatggaatgaaagcacgt
+tatctgccgtcccaatatctccagtgagctaattcattggacggtccactttgatcaatc
+cccgaggagatgttcggacactttagtctgtaacacttagcgttgagaccacgaacaatt
+gattactcagtcttgaaggtgttttccaaagttcattttaaataagactacgataggcct
+ttcctattgatataaactacccggctctgttgttcgtgtgagtcgtacttctctgtgttt
+ttctgattatagcaagattcgattcttagtgtaaacagcgatttttatttgacccgtcaa
+tgagaagcgcataggatctaagcaaaattatcaagttgtgccacaaggtaagatctttcc
+agttattgcaggtaggatgtatcccacgttgatagtatgaggtctgacgtcaactgtcta
+ggagagttgaccgcgtgcgggtacaccggatttgcatcgatgttgagaacgcagaactcc
+cactgtcgtggcggcgttcctgatatttagcaagaggcgttgataaagccctcatcatct
+agatctcgacctcatctgccctcttgctccatcattttctacacagactactttcctatc
+tacgttagtataattgctttctatcttagtatcatttagagcttctccgtcaacaggttc
+gtgctattaaagttagtacgaaagggacaacttgtagcaacgcatttaatcggttttcga
+ctacttcgcacaaaatcagataaagaagtttgtcattctattagacattgaattgcgcaa
+ttgacttgtaccacttatgatcgaacactgaatcaagactgtgattaactaaaatagaca
+agccactatatcaactaataaaaacgcccctggtggtcgaacatagttgactacaggata
+attaattggactggagccattacattctctacaatcgtatcacttcccaagtagacaact
+ttgaccttgtagtttcatgtacaaaaaaatgctttcgcaggagcacattggtagttcaat
+agtttcatgggaacctcttgagccgtcttctgtgggtgtgttcggatagtaggtactgat
+aaagtcgtgtcgctttcgatgagagggaattcaccggaaaacaccttggttaacaggata
+gtctatgtaaacttcgagacatgtttaagagttaccagcttaatccacggtgctctacta
+gtatcatcagctgtcttgcctcgcctagaaatatgcattctatcgttatcctatcaacgg
+ttgccgtactgagcagccttattgtggaagagtaatatataaatgtagtcttgtctttac
+gaagcagacgtaagtaataatgacttggaataccaaaactaaacatagtggattatcata
+ctcaagaactctccagataaataacagtttttacgatacgtcaccaatgagcttaaagat
+taggatcctcaaaactgatacaaacgctaattcatttgttattggatccagtatcagtta
+aactgaatggagtgaagattgtagaatgttgttctggcctcgcatggggtctaggtgata
+tacaatttctcatacttacacggtagtggaaatctgattctagcttcgtagctgactata
+ctcaaggaaccactgctcaaggtaggagactagttccgaccctacagtcaaagtggccga
+agcttaaactatagactagttgttaaatgctgatttcaagatatcatctatatacagttt
+ggacaattatgtgtgcgaaactaaaattcatgctattcagatggatttcacttatgcctt
+agaaacagatattgcccgagctcaatcaacagttttagccggaaacaatcgaagcatagg
+gacaatgtatcttttcctaaattgccatgtgcagatttctgagtgtcacgaagcgcataa
+tagaatcttgtgttgcctcaactcgttgaaaagtttaaaacaatcgcagcagtctttttg
+gggtctactgtgtgtttgcaaaataactgaaagaaacgcttgaacaactctgaagtagct
+cgagtactcattaaagtgtaacacattagtgaatatcggccaatgaaccaaacgcttccc
+ggtacgctatctctctcatcgggaggcgatgtgcaggttatctacgaaagcatcccttta
+cgttgagagtgtcgatgcatgaacctcattgtaacaatagcccagcaaattctcatacgt
+gcctcagggtccgggcgtactcctccatggaagggcgcgcatctagtgttataccaactc
+gctttttaactactatgctgtagttctacaggcatagtggccagtattttctaacttctc
+tggatagatgctctcactcctcatccatcacggcttcagtttacgtcttacttgcttgtt
+cagcaacggatggaggcattaagtatcttcactgttccctaaaattgctgttcaatatca
+aagtaaggacgatacagggaaagctcaagcacactcattgaatactgccccagttgcaac
+ctcacttaatctgacaaaaataatgactactctaagtgttgcggaagcagtctcttccac
+gagcttgtctgtatcacttcgtataggcatgtaactcgatagacacgaacaccgagtgag
+aaactatattcttgcttccgtgtgtgtgacaccaggtaattgatgcggatataagctgga
+gatcactcacgcccacacaaggcgctgctacctctttattccaatgtgtaagaatttgct
+aacttcatttctagaccgcagctttgcggtcataatttcacggtacggacccttgggtta
+gagacttgataacacacttcgcagtttccaccgcgcacatgttttagtggcttctaacat
+agaatttttgttgtgacataaagagtgcgtgggagacttgcccgaccgttaagccataat
+caattgaaagccccgtgagtcacatctaattggttgtactgcgcatttagctatccttta
+gctgactcgaagagattcgattcctaatataggttaattagatggctgccgcgcgaagta
+aaacgtgaaaaacgtagtgcgcagatctgcataactcgcgcttaattacttatgagtagt
+tccaagttcgctacgttatgagagagattggaattaagcaaatatgttttatggtgattt
+tgggatgagaaggactgctaagtacggctactaaacaaatttctaaaaccgccatctacc
+ttatcttggagacatttaagttgtatatgtcactagtctagcttttgtctgtgggacgcg
+ttctcggaatgagggaaatgcaagagccgattcatcaaatgcttatctaagaaagtagtg
+gactattacaccaagcacgaatgccagggaactgctttcttgctcaggacctcgcgacaa
+ggtaccccgcataagtcctagaattacatttggtcagcaatgctgacatttgaccgtgaa
+aacataattttaatcagaaggcagctcacccgcttgctctagatcttatctttgtatgaa
+tgtcagaatttactgcaatatccgttccgaatagtgagggcttagtatagttctctgtat
+acaggtcacatcaaactccccctgtcctagtacagctctgagctttaattaattgcatac
+atttccttcaatcatcagatgaaaacaccgcgaatcatgctcttctcgtatagggcaaga
+gaagcaacaaacaactagcccgactcacgttcatccgccgtatccttgttcagttcttac
+tccgtattaggtcagcgaaatctaatcagaataatcggtcgcgtatcaaaattaaaatcc
+cgcttgaggttgacaattaaaacgctgagcagttatcggctattagatagtggggtgaaa
+gtaattggctggaattatgttaaaacgtgatattaagctaaaatacgctacttgttgccg
+acctaattcagtcattcgatattcagttagagccaagaataacaagcttgtataaattga
+acggggtgcactaaacgatgtgttactctaatattcagcttggagtatacctgaaggcga
+attcatgtatcggccaataataagacgttgaagatcacaatttggactagcaaaagaagg
+tgatttatgcgtggggattgagtccactgtacgagtacggtctctggaaaattataggtt
+cagggaatataaggaagtaaagataattaccaagagatttttggtatcgctatgacccag
+aggtgttctaacgtctgttttgatccgcagaatttctgcctcaatgcatatttgacggac
+ttgaactagagcctctaaagttaaatggcgacgcaactgttcctaaacttcaattattac
+tactctttttttcctagggtattgtagaggccagtggacaaaataaatcaaatttaagat
+gtttcggacattaacatcccccgtagcatagaaatcatcagttatccaatctctcatcga
+gcttttacaatttctgctggcgctatggacagcatatgccgcgagacctccgcaagactc
+acttgatcactgtaagtatcttcattagaggttagagcctatagttaagctgctgaccta
+gtaaaattggtattttctaattttattgctcaagttaaaggttagtgaagggataatgac
+gttatttttgaacaatgggttgtattcaattttatatcacgaatggaacccttcattccc
+ggcataatactagacgacacgaacaagctccgatctatcagccaggcacgtgttaaggtt
+taattccggcaaaccaatgaagcatcaaaaggtgacctgatgcaacttagggtcacgatg
+agtttttcaggactacttattacctattaataagttaacatgagccttcataccccgtaa
+gacaatacatactccaccaattagaattctgagccatcttatctttttgtatcatcgaag
+ggtatggccgaataggttaattagttactcctaacgtctctacaggcatgcatttgacgc
+accttcgaaaatagtcaatctctcgccacacgcgtctagtatgcagcatcaaaaatatag
+tccacggtttccggattaccaaacgcggcaaagagaaacattgtatcgacggagataact
+taatacagaaggaaggggcatcttcgaatacggatgaataattctatctgtttattctga
+catcttgttttcaggttaatcttacgcattcaaatgacgcctgccccatgcgtgcgcaat
+tattttctaatattgacgagagcaatctcactccttttgggtctatttatgttttattga
+ggcacaagcctatacagaacaggtactattaaggccgtgagtgtgagactcaaaccgtgg
+aaacaaaggatgggttgttcttggtacaagttttagtgcatgtgggcaatccttaccaaa
+atcagatgctatccttaactttgggctgcatttaagatggcggttggaggcctgtgagaa
+tcctgcgtgtcatctttaatgaccgaattcatccatgtagattcagatcacacactcatt
+ccttgatgttgtctaaacaaaagttgttgtggacgcattggagggagttaagtaacaact
+tgggatcgcatacttataaaaattatatgttaaactttcacaaacgctgaagtccaaagt
+aactagcccaaacgcctcgagagtcactaggtattaatggtgtttgagttcctgtgaaat
+agtgttcgaaggtaaaatttatgtaccaaatcgaaagaacacttaataaggcttgcttgc
+acggaggtatgatgtttactgactctacaaccctaattttccagtacgtacattcattcc
+aataggttagttctcaaagtgctatacaggctcctcaattgatgatatgcttcagccgct
+ctatggatattagctcattttatttaggaagcccgcttagaggcttactatgagggaaat
+gccaaaatgtcatacttttcggtgtgtcccatatgacaccgctttacatagaatttgaat
+taaaacgcgctctcccgttcactaccatacttggtaccgtgcgcatattacatatagata
+taggatcattttttaaagctgtactaggtttgatcgacaatcttatgctatactatatga
+tgtaaccctcataatcaataccgatcgtacgatcctagcataggtggcaagcgattttat
+gccgattattgtgttaaatagtctgtgagtgtgattatcagggctacgttggtagagggg
+ttgtatagacctcgcacacattgtgacatacttaacaatatacgaaaactgatataataa
+atccccttacccaaacaccaatcccgttgaatcaactaccataacgtctcccatataaat
+tgcctacttgtttgcataaatctgaatacataacaccattgcaccttcttgtgttccaat
+cccgttaagattgccttgtcagatgatatgcaagaacaatagcatttgctagcaattatt
+aacagctcttcgaattgcctccacataacgcgggagggtatattttaatttggcaaatac
+taagtactgttggcgtcatatgctattaacggttggatattaagttatgtcagccgtaag
+caagagtgggcgaaatattttgttacccagtgagagcactcttagagtttggatacaata
+ggccatatgttgacttaagaggacgtaactacgccgtacaccattgttcaaccgacttct
+tggcaaatagaatcgtattagcaatcttaagaatagagacacgttcgtgttagggtatac
+tacaaatccgaaaatcttaagaggatcacctaaactgaaatttatacatatttcaacgtg
+gatagatttaacataattcagccacctccaacctgggagtaattttcagtagatttacta
+gatgattagtggcccaacgcacttgactatataagatctggggatcctaacctgacctat
+gagacaaaattggaaacgttaacagcccttatgtgtacaaagaaaagtaagttgttgctg
+ttcaacagatgatagtcatgacgcgtaacttcactatagtaaattgaaacaaatacgcaa
+tttagacagaatggtacggtcatgaatgacagtaattcgaagtgctagaccaacttaaaa
+taggtaaacgtgcccgaaaccccccttaacagaaagctgctatcatggtgcagtatcgac
+gtgttcagaaacttgtaacttttgagcaggtccgagcacatggaagtatatcacgtgttt
+ctgaaccggcttatccctaagatatatccgtcgcaaactttcgatttagtcccacgtaga
+gcccaagcgttgtgcgactccacgtgcatgcccagaaatacgagtttaaatttggttaca
+tggttaattttgaccgaagcatcgcactttatgattgataattggattcaatatgtcgcc
+ctatgcgaatgcaacatgatccacaatttggctataagacgtttaatccgtatcacactt
+tgtttgcggctagtatagtaacgcccgtgcaccaagagtcagtaacaattataagtactc
+cgcaggtacttcaaatataaaaactaatcaaacacgacccatatgatcatctgaagatat
+ttggaactttctcgacaaccaccctcgtactcaatacttacactaatcgacaggcacacg
+caacgtgtacagtcgcaccatattgagtcaagatttgcttagtggcgatgagcgtacacg
+cttatttctctagtcacaattagttatctacgagacatcacgagggagcaaataagcgat
+gttatggctacacataggcacgtatgaatatgatataagccagttaaacagtcgaaccat
+cgagcaaattctcatgcaccaacccacacgttgaggcacaaagagtaagctgtttgaatg
+taacttcttctgctgagcgggccccaacgtaaggatcaactagaagagaaaactcggtat
+tagtttaaatgcgtcacggagcatgagtgcatttcactaagaatgtctgtgtaaccaata
+taacatctatttgttatctgattgcctacttatggctttgcggtcgtggcgactaatgtc
+tccaatccttttgaggtcggtaccaactccctttaaattacgctgtgcaggctcatgcac
+tgcatacatatacggtagcaggtagggacctcacgcacccttattataatcaatagtagt
+tatcagtcaacgaggcaggaatgctgaggtcgaggtgttggtatattttctatgtgccgt
+ctaggcgactatcacgcattaccaggcgagatttaagccaattttgaatatagtcaacgt
+aatttttactatgggttccaccgaaacgccttgcacaactaagaatcccataaaatatcg
+atatcaaataaaagattgtgtcaataccttcatatatattttttcggttgactaacgtga
+actaaggttaggggttttgtatgtctatataggaaacagtttcttttctgtcctacttta
+gtaaagtcttcaagccttactccaaaatcacggtgattaagccgttactcagcagcatga
+ttctgcctgctcgggtcctaaaatccagccttgtaagagtcgctgtgtattagctaggga
+gacctttgttaaaaaggatatatcgcggcgggatgtgagtgcgtggcgcatactcaatct
+tcagctcgtgtcattataatatctctcccccacgcttttcactagatatgccgtgtaagc
+aaacaccttatgcttaatttcgaaaatattggtacttgaaaaaagctgtaggggtactta
+atgtctggtaggagatcaggagagaattgagtgtaaaaccgtaaagccctcacctgactt
+catgtaaatggcttagaagactccatgatttaataaatactacgaaggaaagactggatc
+taaagataactctagtaaggccaactcccttcaatgctgttgccagttataatccaagag
+ctgtccttttctgaaccatagcggcttctgaagcgaactagaagcaaagttggttctagc
+cagacagccacataccctgtacgggtgtattactaaaactggtccggtattagttcacca
+agggaggaattaggcaaaggatctaggtatgcaagtcggagtattacatccctaccctga
+atccatcaataggttcctctgtactggccttcgcaatgagtattcaaggttgtacagccg
+tataataataagatagtgactatgaacgggaagtaacccgctcaccttccccaaaacatt
+gttatatctaagtattaaagtctgccgtagtgttaatactcgaaaataaacaactggcaa
+attacaccgcacttaagccgcttttgatttatatttttccaatgcgcttttaaaaataat
+tcagtcctacatactaattaagacccttaaacggagatatcacaagttaagttttaacca
+tctcgactaggtggaactatagatacccaactcaatttatcattacctgtaatgttccta
+gaaggattgcatttcatgtcaagacggtggagtttcacagcgaaacttcagtgtgaacag
+attctgagaaatcacctaaacctattagtcagagcacccggttagaaccagttgtcaaaa
+aatagagcggttgcatgagacagaagtaacgatgagatccgttgtaacgttgagacatct
+ggcctatcgtcaatacagtcctcccttaaaaatatttttaaatactaggcaaacccaaca
+taggttagtcctatgtgatacgccacatggtatatcattttgtaacgttacctagggata
+atcaggaagtggaattacgcaaaagtagacagtgaaatgcttagggttatagtctagtcc
+aaagataaaggataaagcacgtcagagaactatattagccgaatgggaatcattgttagg
+agactgtggatcatgtctaaaaagcaacgcagaaacagtcatcgaaaaaatctcgttttt
+gtttgaatctaaaagagctttgatgaccgatagtacctgtatactagttactgtattacg
+tgtctaatgatttcggattggggtccccagaatcagacgtcattgtagacgattcaagtt
+taccaatttaatttcccagctctccttggagaactatcgccaataattgcagtcactttc
+cttttctgaaacgataaagccgtcagagttctctgcaacgttggacttacctgaggttct
+aacccactttcggttctaatagtagttaacgacacaacgaataacctttactgtggggct
+ttcacgatattttttcgcttattattaatggttacgtcataagctggtgtccaaattaag
+gttaccggcttcgcagagtagttgtatccaagtataacttccctaatcataagatcgagg
+tagaaaattaatgctgtctctaaccgaacagatatgtcccactatgtggtatggacgttg
+ctaattacttctgaagggaaattggtcattatggatacgtgtctaccatcaggtcggacg
+cagatatggttctgtcttcagttgatccaccgttctttataggataataactgacgatta
+aagattatggtaaatagattaagccaattctcttcttgtcagtgaagcatccttaactga
+cttgctctgcagcccctcatacatttagctattcaaagtaccggctcgtttcaaactctc
+ccacctttggaagaggttgtcaacttgataagtatatcatttacagcattttttcggacg
+tacctctaatgtttcattgcagaaaattagttttttctatcgcacattttgcaagtaacg
+ttagagacacaattatctgcgaatgaactgctagatctgacgaccgggagcctcgcaaat
+atcaaaaaagactgacatatatcaaggagtcgttgacaagtgctggtaagtcaattggtt
+tatctgtcccggcgtttcgatcttaagctgaccatgcacggcagagtaatgtcactctcg
+ttcttacaagtctgtctccaagggtcggcaaaaaagacccctccattctcgagcccactc
+acgatatgtagggacgacaacttgtgcggcttatgaattgtctggactgcgggcgagggt
+ccatatctccgaagttagaagggacatacctttagatgataagatcaattcttattgacg
+aaattcatccacaacggggaacaacttcaccctagacttacgtctgaaaagacacctagc
+gtcttataaaaggtcagtgccccgtttcgtaaggctggaattacctacgcaaacttaaac
+ctcgcgcccttccttacgtatcgacaagatagaggctatcgcgaatgtactacggaggca
+tgaatcatatactagaaccaagtgcctgtgatattaacaagatgatccgacgcgagcacc
+gtaattctaggcataaaactccagcaatttgggggccgaaaacaaatgacgttagctaat
+taattatatgacatgatcaaaggaggtcaatcacgcatcgagttcgacgtatattcattg
+aacttcgtgcgtttgaaagaaacttttatgaaggcaaaattgatcctgtctcctatttca
+tgcgtacctcctagttgataattccccgagcagtggttaggacacttttgtcggtatcaa
+gttccggtctcaaaacgtaaaattctgtaatctgtatggatggtctgtgaattagttaat
+ttttatgaagtcgtcgagacgcagttcctattgatttattctaaacggagatgtgcttcg
+tgggactcggaagtagatctgtgtttatgattattgctactttagatgctgactgttaac
+tccgtgttgtttttcaaccgtatatcacaaccgaattggatagaacctatagtttcaagt
+tctgccacaaggtatcatatttacagttagtgctggttgcttctttcaaacgtggtgagt
+ttgtgctatcacgtcaacggtagagctcagtggaccgagtgcgcgttcaaccctgttcca
+gagagggtgtgatagcacatataccacgctcgtcgaggcgttcatgatagtttgcaagag
+ccggtgttaaacacatattattattgttatccaactaatcggacctatgcataaagcatt
+gtctaaacagaataattgcctatatacggtagttttagtgatttatatcttagtatcagt
+tagagcttcgaactcttcaggttcctcatatttaacgttcttcgaaagcgaaaacttcta
+caaacgaatgtaagcggttttccaagtagtacctataaatcacagaaagatctgtctcag
+tatagttgaaatggtattcagctagtgacgtgtaccaattatcatagttcactcaagcaa
+gacgctcattaacgaatatagacaagacactatatcatataataaaaaagaacatggtgc
+tcgaacatagttgaattcaccatattgaaggggaatgctgacatgtaattcgctactaga
+cgatcaattccctacttgtcaaagttgaactggtacgttcttggaattaaatatgattgc
+gctggaccaaattgcgacttcttgagtttcagggcaaacgattgagccggaggatgtccg
+tctcttacctttcttgcttatgataaacgacggtccctgtacatcactgggaattctcag
+caaaaataattgggtaaatcgagactcgatgtattcggccacaaaggtgttagacgttaa
+agattattcaacggggcgataataggatcataaccggtatgcaagcgcattgaaagagcc
+atgagatccttatccgataaacgctgcacggtatgtgcagccttattgtcgatcacgaat
+ttataaatgtagtctgggctgtaagttgaagacctaagttataatgaagtgcaataccaa
+atcgattcatagtggattatcagactcaagatatctcctgataaattacagttgttaaga
+tacggataaaatgagatttaagattagcagcctctaatctgtttcaatcccgttggaatg
+tggtatgcgatcaaggttaagttaaaatcaagcctgtcttcagtcttgattcttgttctg
+ccatcgcatgcggtctacgtgagttaatatgtagcttacgttctagcttgtgctaatctg
+agtatagattcgtagaggaatattatcaagcttccacgcctcaacgtacgtgtattggtc
+acacaagacactaaaagtggaagtagcgtaaactatagtctagttgttaaatgctcagtt
+cttgttatattcgatatactcttggctaatttatgtctgagtatataaaattaatgatat
+taacttgcatttcacggatcccttagaaaaagattttgaccgagcgcattataaacggtt
+acaccgaatcaatagaagcatacccaatagctttctttgaatttattgcctgcgcaactt
+ggctgactctctagatccgaataattctatatggtcgtgacgaaactagttcattactgt
+ttaaaatgccaacatgtcttttgggccgataatggctctttgcaaaattactcaatgata
+cgattgatcaaagcggtagttgctagtggtagcatgtaagtctatcaaatgtctgattat
+ccgaaaatcttccaaaagagtccacgtaccatatctatctcatagcgacgcgaggggaac
+cttatctaactatcattccatttaccgggtgactctcgatgcaggatccgattgggataa
+attgcccagaaatggctcattcctgactaagggtaaggccgttctcagcaagggaacccc
+gcgaatctaggcttataccatctagattgttaactacttgcctgtagttctacagccata
+ctggacagttgtttctaaatgatcgggattcatgctagcactcctctgaatgcaccgcgt
+aagtttaactattacgtccgtgggcagataaggatggaggctgtatgtatcttaactgtt
+acctaatatggctggtaattatcaaagtaaggaccttaatgccatagcgctagcaatcgc
+tttgtatactgaccatgtgccaacctctcttaatctgtaaaatataatgtcttagctaac
+tgtggacgatcatgtctctgcctagagcttcgctgtatcaattcctatagccagcgtact
+agtgacacaacaacaccgtgtgagaaaagatattagtccttacgtctgtctctctacagc
+ttattgatgaggattgaacatggacatatagctccccctcaaaagcagatgctacctctt
+tattccattctcgaacatttgccgaacttaatttcgacaaacctgaggtcacgtcttaat
+ttatcggtaacgtcacgtccctttgagactggataaatatattaccaggggccaacgagc
+aattgttggaggcgcttctataatacaaggtgtcttgtcaaagaaagacggcgtgcgtct
+cgtgcaactcacttaaccaatattaatgtgaaacccccctctctcacatcttatgcggtg
+tactgccctggtacatttcctgtacaggactccaacagtgtagattcctaagatagctgt
+tggagttgcctcacgccagatcgaaaaactgaataaactagtgagctgagctgcagaaat
+accgcttaattacttatgactagttcaaagggacctacgtgatgtcagacattgcaagga
+agaaattaggtttgtgcgtcattttggctggactagcactccttacttcccctactattc
+aaatgtcgtaaacagcatgagacaggatcgtgctgacatttaaggtctattgggaacgag
+gctacctttggtcgcgcgctcgcgttctccgaatgaccgaaatgcatgagcacagtatgc
+aattgcttatagatctaaggtctggtcgttgaaaccaagcacgtaggcctgggaaatcag
+ttcttcctcagcaactacacaaaagcgtccaagcattagtacttgtagtaaatgtccgaa
+cctatgcgctcatttgaaagtcaaaaaatatttttaagcagtaggcacctaacccgattc
+ctctacttagtagctttctttgattctcagaattgactgcaatatcactgcacaattctg
+tgccattactagacttctctgtattaacgtctcatcttactaacactcgcctaggacaca
+tctgagagtgaagtatttcaatacatttactgaaatcttcagttctaaaatccccgaata
+aggctcttatcggtttggccaacacaagaaaaaaacttcttgcaccactcaccttcatac
+gcaggagcctggggaacttagtaataactatttcggcagacaaagcttataacaagttgc
+cggcgcgtataatatttaaaagaccccttgagctgctcaattaaaacgctcacctggtat
+aggctattagatagtgccgtcttagtaaggggcgggaattatcggataaactgatatttt
+gataaaataaccgacttgttcacgacataagtcactaaggagattttatctttctccaaa
+gtatatcttccttggataatttcaaagcgctgcaatttaagttctgttactagtttatgc
+tgctgggaggtgaccggaaggcgtagtaatctagaggcaaattataagaagttcatcata
+tcattttcgactacaaaaacaaggtgttgtatgccggcgcattgtgtaaactggacgagt
+accctagatggaaaattatacgttaagccaagatttcgatgtaatgataattacctacac
+atttttgctatccataggaacaagagctgttctataggctcgtggcatacgaacatttgc
+tgccgctatgaatattggaagctcttcaactacagactctattcttaattgccgtcgaaa
+atgggccgaatcggctattattaatactcggtttttccgaggggattgttgtcgacagtc
+gtaattattattaatattgatgttggtgaggtcatttaaatacaaccttgcagacaatga
+ataagggatccaatctctcatactccttttacaattgctcatgcccctatgcaaacctta
+tgccgccacacctccgcaactctctcttctgaactgtaagtagcttcattactggtttga
+gactatactgaagctgatgacattctaaaatggctattttcgaatgtgattcataatgtt
+tatcgtttgggatggcagaatcacgttatttttgatatagcccgggtattctattgtata
+gaacgtatgctacaagtcattccccgaagaagactagaagtaaacaacatgcgaccatcg
+ttaagccacgcaaggctgtagctttatttcccgataacctatcttccataaatagcggac
+agcaggatactgacgctcaacatcagtggttatggtctaatttttaacttttaataaggt
+aacttcagcaggcatacacagtaactctttaatttataatcaaattagaagtctgacact
+tcttatatttttctatcatccaacgcgatcgcccattagcttattgtgttactaataacg
+tatctaaaccaatccttttcaagctactgcctatattgtcaatatatacaaacaacagga
+tagtaggctgcttaaaaaatattgtcaaccgtgtacgctttacaatacccggaaatcaca
+aactttgtagacaacgagtgaaatttatacactacgaagggccagcgtacaagacccatg
+aattaggcgatatgtttattctgacatattggtttatccttaatctgtcgctgtaaaatg
+aagccgcccccatccctgcgaattttttttcgaagattcacgactgaaatataaatacgt
+ttggctatatttatgttggagggaggcaatagcctttactgttaaccgaagatttagcca
+gtgagtgtgacactaaaacactggaataaatgcaggcgttcttctgggtaaaaggtttag
+tcaatctcgcctataagttcatatagctctggatataattatctggcccatgcatttatc
+atggcgcttggtgccctgtgtgaagccggcctctcatattgaaggtccgaagtattccat
+gtacattaagatcactctctcattcatgcatcttggcttaacaaatctggttgtccaagc
+tttccaggcacgtatggtacaaattcggatcgaatacttataaaaatgatatgttaaact
+gtctaaaacgctcatctacaaagtaaagtgcactaaccaatagagtctcaagaccgtgta
+atgctggtgcactgaatgtgtaatacggttagaagggattagttatgttacaaatccatt
+gaaaacttaagaagcattgcgtgctcggagggtgcatcttttatcaagagactaacatta
+ttttcaacgacgtacatgctttacaatagggtacttatcaaacgccgagaaacgcgccta
+tagtgatgttatgattatgacccgatatccattggaccgaattttatgtaggttcccagc
+gtactcgcgtaatatctcggtattgccataatgtaatacttgtcggtctctcccagatga
+aaaagcgttacagagtatttcaatgaaaaacagcgcgcaacgtcaatacctttaggggta
+acggccgctgatttcatatagatatacgataagttggtatagctctactaggtggcatcc
+acaatcgttgcatttactatagctggttacaatcataatctataccgttccttacatact
+accatagcgggatagcgtttttttgccgttgattgggtttaagaggatgtcagtctcatt
+atatccgattcggtgggagagccgttgttttcaaatcgcacactttgtgacataatgtac
+aagataacaaaactgatataagatataaactgtcaatatcaccttgacacttgaatcaaa
+gtaaattaactcgcaaatataatttgactaattgggtgcagatttctcaattaataaaaa
+aatggcaccggatgggcttacaagccccttatcattcacttgtatcatgatttccaagaa
+caatagaatttgctagcaagtatgaacagagattcgaattgcatccacagtacgccggag
+cgtttattttaatgtggatatgacgatgtactgttggcggcatttgctagtaaccggtcc
+ttatttacgtagcgcacacgtaagcatgtctgggagaaatatggtggtacaatctcagag
+aaagattacagtttggtttaaataggacttatcgggtcggaagtggaacttaataagcag
+tacacaattgggcaacagacgtcttgcctattacaataggattacaatgcgttagatttc
+agacacgttcgtgtttggctattcgtcaattccctaaatagttagacgatcaactattat
+caaagtgattctttgttcatcctccattcatgtaacagatggcacactacgcataacgcc
+gaggaattttaacgagatttaagagagcagttcgggcacaacccacttgactttataaca
+gctcggcagcataaacggtaatatgtgacaaatttccaaacgttataagaacgtatgtgt
+acttagaaaactaagtggttcatgttcaacagatgtgacgcagcaagcctaacttatcta
+ttggttttgctataaaagaacaaagttacacagaatcctaagggcttgtttcacacttat
+gcctagtgcttcaccatcttaaaatagcgaaaccggcacgaatcaaaccttaaaacaatg
+cgcagatattggtgatggtgactccgggtatgataatggtaactgttgaccagcgcccac
+ctcatcgaagtatagaaagtggttaggataaggatgagaccgaacttatttccggccata
+actttagattttctacctagtacacaacatcagggcggacacgaaaccgccatcacatca
+tataccaggtttaatttgcttaatgggggaagtgtcaacgaaccttcgaactttagcagg
+catatggccattatatatggccccagagcagaatgctacagcagacaaaatttggattta
+tgtagtttaatacctatcaaacttggtgtgaccatacttgtctaacgacagtgcacaaag
+tgtaagttacaattattactactcagcagcttctgcaatgataaaatcttatcatacacg
+tcacatatgataatatctacttagggggaacgggctccacaacctacatagtactcaata
+cttacactattcgacaggcacaccaaacctgtacagtcccaaaagattgagtcaactttg
+cagtactgcagatcacagtaatagcttagttagcgagtcaaaattagttttctacgagac
+tgcacgaccgtgcaaatttccgatgtgttggctacaaatagcaacgtatgaatttgtttg
+aagccacgtaaactgtacaaccttagagataagtctcaggctactaaaaacacgttgtgg
+cactaacaggatcatggttgattcttacttattcggctgaccggcccaataagtaacctt
+caactagaacagaataatcgggagtagtttaattcagtcaaggtgcaggtctcattgtaa
+ctaacaagctctgtgtaaccaagttaaaatcgttttcttagcggattccctacttatgga
+tttgagctcgtccacaatattcgatacaagaagtttgtggtccgtaacaacgaaatttta
+attacgctgtgcagcctcatccaaggaattaatagaaggttgatggtaggctccgaacgc
+tccatgattataatcaagtggactgtgcagtaaacgaggaaggtatcctgacgtcgtggt
+gttcgtttttgttatttgtgccctatacgagtagataaaccatgaacagcacagtgtgaa
+cccatggttgattttaggctaccttatttttaatttccgttacacagaaacgaattccac
+aactaacatgccattaatttttcgatatcttataaaagatggtcgaaattcattcattta
+ttttttttcggttctcgaaagtcaactaagctgtcgcgttttgtttctctttagaggtaa
+aagtggctttgatctcctacgtttggatactagtcaaccattactccatttgatccgtga
+gtatcacctgtctaacatccagcattatgactcctcggcgaagaaaagacacacttctta
+gagtcgatgtgtattagctagggacacagttgtttaatacgatagtgagcccagggaggg
+cagtgcgtcccccagtagatttattcagctagtgtaagtataagatatctcacccacgag
+gttcaagtgatatgcagtcttagaataatacttatcctgaatttcgatattatgggtact
+tcaataatccgctagcgctactttatgtctcgttggacagcaggacacatggcagtctta
+aacactaaagacatcacctgaatgaatgtaatgggattacaagaatcaatgaggtattat
+atacgacgtaggaaactctggatatatacagtaatctagttacgccatcgcacttcattc
+ctctggaaacttagaagacatcagctgtacgtggaggaaccagacccccgtatgtagcca
+aatagaaccaaagttgcttatacaaacacacccaatgacaatggaccgctggagttcgta
+aactcggaacgtagtactgcacaaacccagcatttagcaataggagctacgtatgcaact
+cccacgtggtaataccttcaagctatcaatatataggtgcctagctaatcgcattcgcaa
+gcagtattcaagcttgtaaaccagtataataattacagaggctctatgaaacccaacttt
+ccagctaaaagtcccaattaaatggttatttcgtacttttaaagtcgcccgttctgttat
+tacgcgaattgattctactccaaaattaaacacaaattatcaaccgtttcatttatattt
+gtcaatgcagctgtttaaaataaggctctactaaattataattaagacacttattaccag
+atttctctagttaagtttgaaccagctcgactaccgcgaaagatacattcccttctctat
+ttttcagttcatctatgggtcagagaagcattgaatttattctattcaccctcgtcgttc
+acagcgaatcgtcagtgtgatcagtgtatgagaaatatcctaaaccgtttagtcagacca
+cacgcttagaacaagtggtctaaaaagactgccctggaaggagtaagaagtatacagctg
+atccggtgtatccttcagtcatctgccctatactaattacacgacgcaaggaaaaatagg
+tttattttctaggcaaacccttcataggtgactccgatgtgttacgaatcatgcttgaga
+atgtgctatcgttaccgacggataataacgatctccaatgaaccaaatgtagaatgtcta
+ttgattacccttttactattcgacttagagataggagatagaacctcagtgtactttttt
+agccgaatgggaatctttgggaggtgaatggccataaggtcgtaaatccaaccctcttaa
+agtcttccatattatatcgttgttcgtggaatcgataacagatttgttgacccatagtaa
+atgtatactagtttatgttgtaagtgtagattgttttccgattgccgtccaaactttatg
+tcgtaattgtagaccagtaaagttgaccaaggtaagtgcccagcgatcctgcgagatcga
+tcgccaatttttccagtcactgtaagtgtaggtttagataaagccgtatgagttatatca
+taagggcctcggaaagcagcttcgaaccaaagttcccttataatagtagtttaactataa
+aagtatatactggtctgtcgccctttcacgatttgttttaccggtttatgaagcgttacg
+tcattagagcggctccaatttaaggttaacggcttccatgtgtagttgtatacaaggata
+acttaaagtatctgttcagcgagctagttaagttatcctcgatagaacacaactcagagg
+tcccaagatcgggtttgcaacttgctaatttattctcaaggcaaattgggaattatcgat
+acctgtataccataaggtcgctcgatgtgatgcttatgtcttctggtgatcctaccttag
+ttagtgctgattaacggaacattaatgtttatcgttttgagatttagccaattctctgat
+tctaactcaagatgccttatctgacgtgctatgcagcccctaagtattttacattgtaat
+aggacacgctcctttaaaactcgccaaaaggtcgttgtggttctctactggttaactata
+taatttacagctttgttgagctagttcctctttggtttaagtcctcaatattagttggtt
+cgagcgataagttggctagttaccttagtcactatattagatccgaatgttatgcttcat
+ctgaagaccgccaccctccaaaatttcttttaagactcacttattgcaaggtgtaggtga
+attcggctcgtttctcaagtggtgtatctgtacacgagtttccatattttcatcaacagc
+caccgcacacttatgtcactctaggtattaaaagtcgctctacaaggggacgcaattaag
+aaacagacatgctagtcaaaaataaacatagcgaggcaccactaattcggccgcttatca
+atgggatgctctgcgcgagacgcgccagagctcagtagttagttcggacatacatttact
+tcagatgatcaattagttttctacaaatgcttactctaccccgaaaaaagtcaccagact
+cttacgtctctttagtatccttccgtcttatataaggtcagtcccccgtttcggtaccct
+ggaatttactaagaataatgaaacagcccccaaggacgtacgtttacaaatgatagacca
+gatcgcctagcttattccgacgcatgttgcatagaattgaaccaacggaatgtgagagta
+actagatgagccgaccacagcacccgtttgcgtcgcagaatacgcctgatagttcggcca
+cgaaatcatatgtcctttgagtattaagtatttgtaatgatcaatcgagctcaagcaagc
+ttacacttcctcggatattcagggaacttagtgcctttgaaagatacgttgatcaacgaa
+aaattgataatggctcatatggaatgcctacctcatagtgctgaattaacacagcactgc
+ggacctaacttttcgaggtttcaagttcacgtctcaaaacctaataggctggaatatgta
+gggatcctcggtgaatttgtgattgggtttgttgtagtactgaccaagtgaatattcttt
+ttttctaaaagcagatctgctgccgggcactacgaaggagatctctgtgtatcattattg
+cttcttgacatgatgactcttaaatcactgtgggtgtgcaaaacgatagcacaacccaat
+tcgatagtacatattgttgatacttcgcactaaaccgttcatatttaaaggttgtgctcc
+ttccttcgttaaatactggtgacttggtcctatctactattagctagacctctggggaac
+cacgcccccgtaaaacctgtgcaagagagggggtcatacatcttagacatcgcgcctcca
+ccagggaagcattgggtgattgaccaggtgtgtaacaaatatgattattcttatactaat
+attagcaaagatgcataatgatttgtattaaatgtataattgaattgataagggtctttt
+agtcagtgatagagtagtataaggtagacattagaactcttaaccggacgcagatttttc
+ggtcttagtaagccaattagtcgacaaaacaaggtaagagcggttactagtagtacctat
+aatgcactgaatcttcggtcgaagtatagttctaatgctatgcagattgtgacggcgaca
+aatgttcagacttatatcatgaaacaagctcttgtaagtattgacaaatgaaaagattga
+atatttttaaatacaaaatgcgcctacttattaggggaattaaccagattgaaggccaat
+cctcacatgtaatgagataatagacgataaatgaaattcttgtaatagttgaactgctac
+gtgatgggtattatatatgattgagatcctccaattgccgacgtcttgtcttgatgccca
+aaagattgtcaacgaggagctccctcgcgtacctgtcgtccgtatcataaacgacgcgac
+atgtacagcactccgaagtataagcaataataatgcgggtaatccagactagatcttttc
+ggactcaatgcggtttcacggtaaacatgattaataccggagagtagtcgagcttatcag
+cgatgcaagcgaattcattgtgccaggagatacgttgcagataaaaccggcaacgtatgt
+caacaagttttggcgatctcgttgtttgtattcgacgaggcgcgggaacttcaagaacta
+tcgtatattcaagtccattaccttttagtttcagactggtggagctgactaaagttatat
+catcattttgtacactggtttagttaacgataatttcagatttaacatgaccagacgata
+atcgctgtatatccagttggaatgtggtttgccagaaaggttaacttataatcaagcctc
+tcttcagtcttgattcgtcgtatcccatccattgcgctatacctcagtgtatttggagct
+gtagttataccgtgtgctaagatcagtagacatgacgagagcaatattatctaccttaca
+agcatcaacggacgtctagtcggaacaaaagactctaaaactcgaacttcaggttaatat
+actatagttctgtattcagcagttattcttatattcgatattatcttgcctattggatgt
+ctgactttagtatattaatcatagtatctgccatgtaaaggtgccagtactaaatctgtt
+tcacagtgcgaattataaacggttacaaccattaaagacaacaagaccctatagctttat
+ttgaattttgtcaatgcgcaacttggagctcgcgatacatcccaattagtctatagggtc
+gggacgattctacggcatttctggttataatgacaacatggattgtggcccgagaatcgc
+tctttcattaattaagcaatcattacagtcttataagcgctacttccgagtggtagcagg
+taactcgatataaggtcgcatgagccgaatagcttaaaaaacaggccaccgaacattgat
+agagaataccgaccacagcgcaacctttgattactttcattaaattgtacggctcactcg
+acatcaagcttaagattgcgataatgtgaactcaaatggatcagtactgaagaaccgtaa
+cccacttcgcagaaagcgtacccagagaagatacgctgttacaatatacagggtgaaatt
+attgcctgttcttcgtaaccatttcgccaaacttggttagaaatgatagccattcatgat
+agaaataagctgaatgataccagtatctttaactatgtagtcagggggaagataacgatg
+gtccatgtatgtttctgatatgtgacagtattggccgcgtaatttgctaacgaagctact
+taatgcctttgagcttcatatagatttctttaatcaaaatcggcaaaaagatagtatgag
+ctataatatatgctagtagagaactctggaccatcatctatatgaatactgattcgagcg
+tgcaattactttagcctgcgtactactgactctacaaaacactctgagataagtttgtag
+tcagtaagtcgctctctataaaccttttggatgaccattgtacagccacttatagatccc
+aataaatagcacaggagacagagtttttcaatgctcgatcatttgccgatagtattttcg
+tctaacctcagggcacctattatttgatacctaacctaacggccctttcacaatggagaa
+atatatgacatcgggacaaacacaaatggtgggtggccaggagatatgacatggtggcgt
+ctctaagaaacacggactccctctaggcaaactcacgtaaccaattttaatgtcaaacaa
+aacgctcgaaaagattttgccgtgtaatgacctggtacattgactggtcaggaatacatc
+actgtagttgccgtagtgtcctgttggtgttccatcaagacacatcgtataacgcaattt
+acgacggacatcagatcaagttatacagattatttaagtatcacgtgtgcattgggacat
+aagggatctcacacatgccttggaacatttttgctttgtgccgctttttcgctgcactac
+caatccttacttaccagtatattcaaaggtcgttaacagaatgagaaaggttagggctct
+aagttatcgtcgattgggatagacgagacatttgcgagcgccctccacggatacgaatct
+cccatatcaatgtgaactggatgctatgcagtttagttcttacgtctcctagtggtaaaa
+atcaaagtagcactcgcatagcagttattcagaacctaatacacaaaaccgtcaaacatt
+ttctaattctaggtatgggccgatcataggagctaaggtgaaactcataaatgttttgtt
+agatctagcatcctaaaaagatgcatatactgagtagctggcgtgcattctctcaattgt
+atcctttttaactgaactagtcggtcccatttcgtgactgagatctattaaccgataaga
+ttaataacactcgcattcgtatcagctcagagtgaagtttttcaataatttgactgatat
+attaacttctaaaataaccctttaagcctcggatccgtttcccaatcacatcaaaaattc
+ttattccaactatctacggattaacaacgtgcatggggatcgtagtaagaacttgttccg
+atcactttgagtatatcaagttgacggcccggttattattgaatagaaacattcacctgc
+taaattaaataccgcacatcggatacccgatttcagagggccgtcttactaagggcaggc
+tttgttcggtttaactgagatgttcattattttacagtatgcttcaactaatatgtaacg
+aaggacagtggatctgtctccatagtagatcttcagtcgtgaatttcataccgctcctat
+ttaagttcgcgttcgagttgttgatcatggcacgtgaaagcaacccctagtattctagac
+gaaaattttttctagttcatctgataatttgccaattcaaaaacaaccgctggtttcccg
+gcgcattctctaaaatggaagtcgaacctagagccattatttgtcggtaacccatgagtt
+ccttcttttcagaagttaatacactgtggtcctatacagaggaaaaacagcggttatata
+cgatcgtggcataacaacattggatcaagatagcaatttggctacctattctaattctca
+ctagattcggtattccactacaatatcggcagattaggattggatgaataatcggtgttt
+aagtccggttgcgtctccaatctcctaatttttattaatattgatcttggtgacctattg
+taaataaaaacttcaagactttgaataacggtgaaaagatagaagactcatttgaaaatg
+gatcatccacagatccaaacattagcaagacactaatccccaactagctattctgatcgc
+gatcgtgctgcagtactcctgtcacaatagtctgttcatgatctaattctttttgggctt
+tgttcgatggtgattcagaatctttatccggtcgcttccctgtagctactttgtggggat
+attgcccggggattatagggttgagatcgtttcctaaaagtatttaaaccaagtagactt
+caactaaactacatcagaacatcgtgaagacaccatacgcggtacctttatttaccgata
+acatttcttcaagaaataccggtaagcagcataatgaccctaaacagctcggggtatcgt
+cgtagttttaaattttatttaggttactgctcaaggaataaaaactaactatttaattta
+taataatattacaaggctcacactgattagatttgtctataagacttcgcgatcccccat
+taccggattgtcttaagaataaactagataaaccatgcattttctagataaggcctttag
+tctaattagatacaaaaaacacgatagttgcatccttaatttattgtgtcaaacctggaa
+ccttttaattacccgcaaatcactttatgtcgagactacctctgaaatttattatctacc
+taccgcatgaggacttgaaccatcttgtaggagttatgtttattagctaagattcgttta
+tcctgtagcggtccatgtatattcaacaagcaaaaagcactcagaattgtttttagttga
+gtcaagactgatatataaataagtttccctagttttttcgtggtgggacgatattgaatt
+gaatcttaaccgaagagtttcccactctgtcgcacaataatacacgccaatatttccagc
+cctgcttatgccttaatcggttactcaatctcccattgaagttcattttgatctgcatag
+aagtttcgggcccagccttttttctgccaccttcctccaagctctgtagacgcactctaa
+gattgatgctcacatgtattaattctacattaacataaatatataagtcatgcatcttcg
+agtaaaatatctggttctccaacatgtcctggcacgtatcgttataatgcccatacatgt
+agtattaaaatgattgggttaactggatattaagatcatcgaaattgtaaagtcaaatta
+acaatactgtctcaagaccgtgtattcctcgtgctcggaagggctattacgcttacttcc
+gttttggtatcttaatatgactttcaaaaattaagttgcagtgagtcctacctgcgtgca
+tcggttagcaagagtataaaagttgtttaaacgaactacttgctttacaataccggtcgt
+atatatcgccgtgaatccagaagattgtcttctttggattatcaaccgagatcctgtgga
+ccgatgttttgggaccttcacagaggactccaggtagagctcgcttttgcattaatctaa
+gaattgtacctctctaaaagatctaaaacagtgaatgtgtatttcatggaaaaacacaga
+gaaacgtaaattactttaggccgaaaggcacatgagttattatacatatacgagatggtg
+gtatacatcgaattcggggcatacactatagttgcattgtatttagctgctttaaataat
+atgatattaccttccttacataagacattaccggcataccctggttttcaacttgtgggg
+ctttttgacgatcgcactctcatttgatccgagtagggcggtgacccctgcttttcaaat
+acaaaaatttcgctatgaaggtaatagattacttttcgctgttatgatagaaacggtaaa
+tttaaaattgaaacttctagaaaagtaaagtaacgagaaatgattttgtgaataatgcgg
+tcatgattgcgcaagtaagaaaaaaaggcaaaaggatgcgcggaatagaaacttatcagt
+cacgggtatcttgatttcattcttcttgtcaattgccgacataggatgaaatcagattcc
+aatgcaatacacagtaacccccacccttgattgtaatgtcgatttgaagttgtacgcgtc
+gacgaagtggatagtatacgggccttttgtacggtgcgatcaactatgaatctcggcgag
+ttagatggtcgtacaatctcacacatagaggtcacttgcctgtaatgacgaattttcggc
+taggtactcgaactttattagaagtaaaaatgtgggcaaaagaaggattccattttacaa
+gacgattacaatgagttacatgtctctcaacgtagtctttccctagtagtctttgaacta
+tttaggtactccagaaaattttagcaaagggtttctgtgtgaatccgccattcatgttta
+tgatggaacaataagaataacgccctcgtatgttatcgacagtgaagtcagcagttcggc
+caaaaacatattcaatttagtacagatccccagaagttaagctaagtgctctaaaatggc
+ctaaacggttatcaaagtaggtctaattactatactaacgggtgcatcgtaataactgct
+gtcgatgcaacactatatgatagtgtcgttttgctatatatgtacaatgtgacaaagaag
+ccttagcgattcttgcaaacttaggacttcggattctcaatcttaaatgtccgaaaacgc
+aaagattcaaaaatttaatctatgagcagatatgcctgatggtgactacgcgtatgttaa
+ggctaaatgttgacaaccgcacacataatcgaactattgatagtcgggagcataaccagg
+tgaacgtactttgttcacgacatttattgacatgttctaaatacgtctcaaaatcacggc
+gcactagaaaacgcaatcaaatcattgtcctggtttaagggccgtaatgccggtagtgtc
+aaacttcatgagaactttagctggcttttggccagtatttagggaccaagagcactagcc
+ttaagctgaatattttgccatttatctactgttataactttaaaacttggtggcaccaga
+cttgtcgatacacacgcatcaatctgtaacgtaaaaggtttactaagaacaagcgtagga
+attgagtttatattatatttaaactaaaagatgatattagcttctgagggcgatagggct
+ccaaatcataaagaggaatatattattacacgattagaaacccacaacatacctcgaatc
+gcccaaaagtttgacgaaacttggcagtactccacatctcagtaatacagttgggagagt
+ctcaaatgttgttttattactcaatgaaccaccctcataatttcactgctgttccattaa
+atttgcaaacgatcatttgctttgaagaaacgtaaaatcgacaaaattacagataagtag
+atgcataataaaaaaaactgctcgctataacacgatcatcgtgcattcttacttaggagc
+atcacccgcacaataacgtaccttaaactacaacactattagaccgagtactgtaattca
+cgaaagctcaagctcgcattgtaaagaacttgctctctcgtaaaatgtgataatagtttg
+cggagaggattcaattattttccattgcacctactccactagattcgataaaagaaggtg
+gtcctcccttaaaaagaaatgttaagtaacatcggaaccataagcaaagcatgtaagtga
+accgtcatccttccctaagaaacataaaggtttttaataatgtcgactgtgaactataac
+tgcatcctttcctgacctactccggttccttgttgttatttctgaacgagaccagtagat
+aaacaatgtaaaccacagtgggtaccaatggtgcatgtgacgctaccgttgttttaagtg
+cccgtacaaacataagaagtcataatcttacttgaaattaattttgccttttattttttt
+tcaggctcgaaattaatgatttgttttttttgaccttctagttacgctaatatgcggtcg
+cctgtggtttctattgagtcctataacgggatgggatctaatacgtttggttactagtaa
+acaaggtataaatttgataccggagtatcaactgtataacatcaagctttatgactcata
+cgcgaagtaatgacacaaggctttcaggagatcgcgagtacagagccactaaggggtgta
+ttacgatagtgacaccaccgagcgcactcactccccaagtagatttatgatcctacgcta
+agtattagatatataaccaaagaggttctagtcagtgcaactcttagaataataattagc
+cggttttgcctttttaggcctaatgcaatattcagctagcccttatgtatctcgcgttcc
+acagcaccactcatggcacgcgtttaaactaatcaaatataatctatgaatgttatgcca
+gtacttgaataaatcaggttttttataagtccttgcatactctcgttatatactgttaga
+gtcttaccccatagaaattctttcatctgcaaacttagaagaattctcagctacggggag
+cataaagtccccaggatgttgacaaatacaacaaatgtggcttatacaaacactccatat
+gaaaatcgaaccctcgtggtagttttagccgaaccttgtacggataaatccctccatttt
+ccaatagcagatacctatcctactacctcgtggtattaaattaaagcttgaaatatagag
+ctgcatagcttatccaattcccaagcacgagtctaccgtcgtaaccacgatttgatttac
+agacgctagagcaaacccatctttaaacatataagtaaaaattaaagggtgagtgcgtac
+gtgtttactagcaacttcgcttattaagacaattgtttataagccataattaaaaacata
+tgttcaacaggttcattgatatttgtaattgcacaggtttttaataaggatctacgtaag
+tataatgaacaaactttttaccagagttatattctgtactttgaaaatgctcctctaccg
+ccttagagactttcaattagattttttgcagttaatctatgcgtaagtgaaccatgcaag
+ggatgcgattcaaccgcctcgtgctaaccctatcgtctgtctcataactgtaggtctaat
+ataattttcagttttcgaacacataaccctttgaaaatctgctatttaatgtctcacctg
+catgcactatcttctatactgctcagaacggctatacgtcactatgctccaagtgacgat
+ttaaacgaagcaaggaataataggtttattttagtgcaaaacaattaagtgcggactacg
+tgctctttacaataagccttgtgattgggctataggttaagtcccatattaacgatctcc
+aatgtacaaaatcgacaatcgctttgcattacccggttactagtcgaattacagatagct
+gttagatactcactctaattttggacaacaatcccaatcttggggtcgtctatcgcctga
+agctcgtaaatccttccatcttaaacgattacatattatagacttgttcggggtagagat
+atcacagttgtgcaaacattgtaaatcgatactagtttatgttggtagtctagttgcttt
+taccattccccgaaaaacttgatctactatttcgacaacagtaaacttgaactaggtaag
+tgaaaacagagaatgcctcatagtgccactatttgtccactatatgtaagtgtagcttta
+cataatccactatgactgagatcattacggcctaggaaagcagcgtagaaaaaaagggcc
+cggatattacgactgtaactataaaactagttactggtagcgcgccatgtatagatttgt
+tttaccggttgtggttgcgttaacgaatttcagccgcgaaaattgatccgttaaccagtc
+catctcgacttctataaaacgataaagtaaagttgatgttcagcctccttcttatggttg
+catcgagagtacactactcagtgggaaatagatcggggttcctacttcagattgtattat
+ctaggcaattgccgattgtgccatacctggataaaataagctacctacatgtgatgctta
+tctattatcgtcatactaccttagggtgtcctgttgaacgctacattaatctttagccgt
+ttgagatgttccaatggataggagtctaacgcatgatgaagtttaggaaggcagagcatc
+ccactaagtatgtgacagtgtatttcgaaacgagacgttataaatagaaaaaaggtcctt
+ctggttctattctgctgaactattgaatggaaagattggttgacctacgtactatttgct
+tgaagtcatcaatttgacggggtgagagacatatggtgcatactttacggactctatatt
+ttagatcagaagcttagcagtcttctctacaccccctcacgacataattgcttttaagaa
+tctatgtttgattcctctacgggaattcggatccgttcgcatgtgcggtttatctaaacc
+aggggacatatgttcagctaaagcatacgaacactttgctaactagacgtatgtatagta
+gctataaatcccgacgatatttacaaaaagaaatgagactcaaatatatacatagcgacc
+ctacacttattcgcaccctgatctaggcgatcctagcacccacacccgaaagtgagcact
+agtgtcttccgtattaaatttactgcagttgagattttagttgtctactaaggattactc
+taacccgtaataaggatcaagactcggtactagctttactatcattccctatgtgttttc
+ctaactcacaagggtacgtaccagcctatgtaattacaataatgataaagacacaaagga
+agtaactttacaaatgagtctccagttacactagcttagtccctcccatcttgctttgaa
+gtctaaatacgcaatctctgaggatatacagcagaagaacactcataacgttggagtcca
+agaattagactcatagggcccccaacatttaatatgtactgtgagtttgaaggtgttcta
+ttgttaattcctgctcttgatacatgacacgtactccgtgtttaaggcttcggactgact
+ttctttcataagttgagcaacgaaaatttcagaatcgataagttggattcactaactaat
+acggctgattgaaaactccactccggacctatatggtcgacctttatacgtaaccgatat
+aaaacttataggctggtatatcgagccttcctagcgcaatttcggatggggtttcttcta
+ctactcaacaacggaatagtctttgtttagtaaaccagagctcaggacgcccaatacgta
+ggagagcgctgtggagcatgtgtcattatggactggagcactcttaaatcactctgcgtg
+tgctaaacgatagatcataacatgtcctgagtaaattttcttgatacgtcgcaatatacc
+gttattagttaaacgttctcatccgtcatgcgtgaaatacggctgtcgtgctcagatata
+ctattagcgactcatctcgcctaacacgcacacgtataaactcggaatgactgccgctct
+tacatattagaaatacagactacaccacggaagcattgggtcattctcaaccgctgtata
+aaagatgattagtcttataataagattaccaaagaggcagaatcatgggtagtaaatcta
+ttattcaagtgattaccgtcgtgtaggcagggagtgaggacgagatggtactcaggacaa
+atattaaccggacgaagtggtttacgtcgtactttcactattagtagtaaatacaaggta
+acaccggggaatagtactaaatataatgatatctatcttcgggagaacgagtcgtctatt
+gctttgaacattctcaaggcgtaaaatgtgctgacttatagcatgatacaaccgattgtt
+acttttgtctattcaaaagattgaatagttttttatacaaaagccgcatacttatgacgg
+ctagtatacagtttcatcccctagcatcaatgctatggacagtattgaacttataggaaa
+ttcttctaatagggcaaatccgtcgtgatgcctattttttttcagtcacatcctcaaatg
+gcactagtattgtcgggatcccattaacaggctcaaccacgagctcacgcgaggacatgt
+agtccgtatctttaacgaagcgacagcgacagaactcccatggataaccaattataaggc
+ccgtaatcctctagacatcgtttaccaataaatccgctttctccgtaatcatgttgaata
+ccccagagtagtccagatgataaccgatgaaacacaagtctttctcaatgcacttacggt
+gaacttattaccgccaacgtagctcatcaaggttgcgacatctagttgtgtgtttgcgac
+gagcccagcgaacttcatcaactttcgtatattcaacgccttgtaattttactttaagac
+gcctggtgatgtagattcttagataatcagtttgttatcggctgtactttaccataattt
+cacaggtttcaggtcaagaagattatagctgtatatacagttccatgctcggtgcacaga
+aacgtgatcggataataatcaatcgcttatgtcgtctttaggcgtatccaatacatgccc
+cgataccgcagtgtatttcgacatgtaggtataccgtcgcatttgagctcgagtcaggac
+gtcagctagattagattccttaatagaatataccgacctctagtccgaactaaactatag
+ataacgccaacttcaggttaattgtctagtcgtctgtttgcagatgggattcttagatga
+gtgagtatcggccatattggttcgagcactttagtttttgatgcataggatatgcaatgt
+atagctgaaagtactttatctgtttcaaactcacattgattaaaccggtaaacctttaaa
+gactacaagaaaatattcagtgagggcaattttgtcaatcacaatcttccagctagagat
+acttcacaatttgtcttgaggctacgcaacattagacggattttcgcgttttattgaaat
+aatcgaggggcccaagagtatccatagttcattttgtaagatttctttacaggcttatta
+cagcttcttcagactcctacatgcttacgagttatatgctagcatgtgaacaatagatta
+atatacaggaaaacgtacattgagagagatgaccctacacagcgcaaccgttgagtactt
+tcattaaagggtaacgctctcgagacagcatccttaagatggccttattgtcaaatcatt
+tgcagaagtacgcaagatccctaaccaacgtagaagaatccctacaaacacatgagacgc
+ggtgaaaatagacagggtgttagtattcaatcttcggagtatcaatttcgccaatcttgg
+tgagaaagcataccctttcttcagagaaagaagatcaatcataacactatctttaacgag
+gtacgcacgcgcatcattacctgcctccatggatctttaggatagcggaaagtattggca
+gcgtattgtgatttcgttcctactttatcaatttcacattcatatacatgtcttttatca
+aaatcgccaataagataggatgagctatattagatgctagtagagttcgcgccaacatca
+tcgataggaatactcaggacagcgtgataggacttttcaatccctaatactctctataat
+tataactctctcttaagtttggaggcagtaacgcgctctatataatcagtttgctgcacc
+attcttcagcctctgatacatacaaataaattccacagcagtaagagggtttaattgaga
+catcttgggaacttaggattttactctaacatcaccgaaacgattattggataccgtacc
+taaacgaactttctcaaggcagtaatataggacatccgcaataacacaaatgctgcctcc
+ccaggagttatgtcttcctggaggctatatcttacacccactcactataggcaaactaaa
+gtttaaatgttgattgtctaaaaaaaagatagataagagttggccggcgtagcacatgcg
+aaagtgaatcgtaagctataattctctggacttgaagttctgtcctgttcctctgcaaga
+aacaaacttcctttaaagctatttacgacgcacatctcagcaagttataaacatgttgga
+agtttctagtcggaattcccaaagaacggatctatctaatgcattcctacatttttcctg
+tctgccgatggtgccatcctattcaaagaatttcttaaaagtagattaaatgggactttt
+aacaatgagtaaccttacgcctctaagggttcctcgagtgccatacaccagtcaggtccg
+agccacatacacggagaacattctaacatagcattctcaactcgatcatttgcaggttac
+ttctttcctatcctagtgctaaaaatcatacttgcaatcccatagcacggattaagaacc
+taagaaacaattcagtaaaacatgttcgaattcttggtatgggaacatcattgcagctat
+ggtctaacgcattaatgtttgggtacatcttccatcatataaacaggaagagtctgacga
+cagggagtgcttgcgatcatgtctatcattgtgaaatcaaattgtagctcacatgtcgtc
+tatgagagcgtgtatccgataagatttagaaaaatagaagtcgtataagatctcactgaa
+cttttgaatgaatgtgaagcatatatgatctgctttaataaaactttatccataggatac
+gtttccaaatcaattcaataattattagtcaaaatagataaggatgaacaacctgaaggc
+cgatcggacgtagaaagtggtcccatcactttgagttgatattgttgaaccacacgttat
+tatggttttcaaacagtctcaggatattgtatatacagataatccgataccagttgtctg
+acgcccctcttacgtaccccaccctttgtgacgtttaaagcagttgttcagtattttaaa
+ctaggcggcaactaatttggaaagaagcacagtggatatgtctaaattcttgttattcag
+gcctgaatttaatacaccgcatagttaacttcgcggtagagttgttcatcatgcctcctc
+taagctaccacttctatgatacaccaatagttgttctacggaatctgataattggccaag
+tcataaacttccgctgcgttcaacccccttgctcgaatatccaactcgaaaagacagcct
+tttggtgtccggaacaaatcagttacttcttttctgatgttaattctctgtggtcagata
+cagaccaaaaactccgcggatttaccatcctccaagaacaaatttgcatcaacatagcat
+tttggctacatattctaagtctcaatagtttaggttttcaactacattatcccaacatta
+ggattggaggaataatagctgggtaagtccccttgcgtctacaatcgactattttttatg
+aatatgcttctgccgcacctatggttattaaaaaagtcatgactttgaagaaccctgaaa
+agatagatgaatcaggtgtaatggcagcagccaaagagcatataattagcaacactctaa
+gaacattatagatatgatgatagcgatcgtcatgatgttatccggtcacaatagtagctt
+catcagctaattcgttttgccagtggtgacttgcgctggaagaatcgttatacggtccct
+tccctcttgatacggtgggggcttattcaaccgcgtggattgggttgtcatacttgcatt
+aaacgatgtaaaccatctagtagtcaactatactaaatcacaaaatagtgatcaatacat
+acccgcttcatggttttaaccatttaattgattaaagatattccgctaagaaccattatc
+tacctaaactgatcgccgtatcctagtagtttgaaatttgatgtaccgtaatgatcaacg
+aagtaaaacgttatattgtatgtagaataataggtcttggagctaaatgatgtgattggt
+agtgaagacttacccttacaactttaccggtttctcggaagaatatactagagaatcaat
+gcatgggctacataagcactttagtctaatgagataaaaaatacacgagtcttccatcat
+gaattttttgtcgaaaaactcgaacctggtaatttaaaccatatatctttatgtcgtcaa
+taactctcatatgttttatataacttcccaatcacgacttgtaactgcttgttcgactga
+gctgtttgagctatgaggccgggatccggttgagctacatctatttgctacaagaaaaat
+gaaagcacatttgttgggagttctggctacactcatagagaaataagtggcccgagtggg
+tgcggcctgcctccatattcaagtgtatcttaaaccaagtggttccaacgctcgcgctaa
+agaattaaagcctttatttcctccacggagtagcccgtaatccggttcgaaagagaccat
+tgaagttaattttcatatccagtgaagtttaggcacaagcatgtgttctgccacatgcct
+caaagcgctcttcaaccaagatatgattcatcctaacttcgatgaatgcgtctgtaacat
+aaatatagaaggaatgattcggcgagttaattttcgccttctccaacatggcatccctac
+gttcgttataaggaccatacatgtaggttttaaaggtttgcggttaatcgatatttacat
+catagaaattctatagtcaaatttacaagactctagatactcactcgttgcagccggcta
+ggaagcgctttgtaccttacttcccttttcgttgcgtaatatgaatttcatatagtaagt
+tcaaggcactcatacctccgtgaagagggtagatagactattaaagttgtttaatagtac
+gtattgatggaaatgacccgtaggagatttaccactcaatccacaagattcgctgctgtg
+cattatcaaaacagtgcatgtcgaaacatgggttgggtccttcaaacacgaatccaggta
+gagatacctttgcaattttt
diff --git a/third_party/rust/regex/examples/regexdna-output.txt b/third_party/rust/regex/examples/regexdna-output.txt
new file mode 100644
index 0000000000..d36baa5be8
--- /dev/null
+++ b/third_party/rust/regex/examples/regexdna-output.txt
@@ -0,0 +1,13 @@
+agggtaaa|tttaccct 0
+[cgt]gggtaaa|tttaccc[acg] 3
+a[act]ggtaaa|tttacc[agt]t 9
+ag[act]gtaaa|tttac[agt]ct 8
+agg[act]taaa|ttta[agt]cct 10
+aggg[acg]aaa|ttt[cgt]ccct 3
+agggt[cgt]aa|tt[acg]accct 4
+agggta[cgt]a|t[acg]taccct 3
+agggtaa[cgt]|[acg]ttaccct 5
+
+101745
+100000
+133640
diff --git a/third_party/rust/regex/examples/shootout-regex-dna-bytes.rs b/third_party/rust/regex/examples/shootout-regex-dna-bytes.rs
new file mode 100644
index 0000000000..773fd9ba8d
--- /dev/null
+++ b/third_party/rust/regex/examples/shootout-regex-dna-bytes.rs
@@ -0,0 +1,68 @@
+// The Computer Language Benchmarks Game
+// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
+//
+// contributed by the Rust Project Developers
+// contributed by TeXitoi
+// contributed by BurntSushi
+
+use std::io::{self, Read};
+use std::sync::Arc;
+use std::thread;
+
+// Compiles `$pattern` into a `regex::bytes::Regex` for matching on `&[u8]`.
+// Panics (via `unwrap`) if the pattern does not compile.
+macro_rules! regex {
+    ($pattern:expr) => { ::regex::bytes::Regex::new($pattern).unwrap() };
+}
+
+fn main() {
+    // Buffer all of stdin up front and note its raw length.
+    let mut seq = Vec::with_capacity(51 * (1 << 20));
+    io::stdin().read_to_end(&mut seq).unwrap();
+    let ilen = seq.len();
+    // Remove each `>`-through-newline span and every other newline.
+    seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned();
+    let clen = seq.len();
+    let shared = Arc::new(seq.clone());
+    // Count the matches of each variant pattern on its own thread.
+    let variants = vec![
+        regex!("agggtaaa|tttaccct"),
+        regex!("[cgt]gggtaaa|tttaccc[acg]"),
+        regex!("a[act]ggtaaa|tttacc[agt]t"),
+        regex!("ag[act]gtaaa|tttac[agt]ct"),
+        regex!("agg[act]taaa|ttta[agt]cct"),
+        regex!("aggg[acg]aaa|ttt[cgt]ccct"),
+        regex!("agggt[cgt]aa|tt[acg]accct"),
+        regex!("agggta[cgt]a|t[acg]taccct"),
+        regex!("agggtaa[cgt]|[acg]ttaccct"),
+    ];
+    let counts: Vec<_> = variants
+        .into_iter()
+        .map(|re| {
+            let hay = Arc::clone(&shared);
+            (re.to_string(), thread::spawn(move || re.find_iter(&hay).count()))
+        })
+        .collect();
+    // Apply the substitutions one at a time, each as a full pass over `seq`.
+    let substs = vec![
+        (regex!("B"), &b"(c|g|t)"[..]),
+        (regex!("D"), &b"(a|g|t)"[..]),
+        (regex!("H"), &b"(a|c|t)"[..]),
+        (regex!("K"), &b"(g|t)"[..]),
+        (regex!("M"), &b"(a|c)"[..]),
+        (regex!("N"), &b"(a|c|g|t)"[..]),
+        (regex!("R"), &b"(a|g)"[..]),
+        (regex!("S"), &b"(c|g)"[..]),
+        (regex!("V"), &b"(a|c|g)"[..]),
+        (regex!("W"), &b"(a|t)"[..]),
+        (regex!("Y"), &b"(c|t)"[..]),
+    ];
+    let seq = substs.into_iter().fold(seq, |acc, (re, replacement)| {
+        re.replace_all(&acc, replacement).into_owned()
+    });
+
+    for (pattern, handle) in counts {
+        println!("{} {}", pattern, handle.join().unwrap());
+    }
+    println!("\n{}\n{}\n{}", ilen, clen, seq.len());
+}
diff --git a/third_party/rust/regex/examples/shootout-regex-dna-cheat.rs b/third_party/rust/regex/examples/shootout-regex-dna-cheat.rs
new file mode 100644
index 0000000000..1bde7ab1ff
--- /dev/null
+++ b/third_party/rust/regex/examples/shootout-regex-dna-cheat.rs
@@ -0,0 +1,90 @@
+// The Computer Language Benchmarks Game
+// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
+//
+// contributed by the Rust Project Developers
+// contributed by TeXitoi
+// contributed by BurntSushi
+
+// This technically solves the problem posed in the `regex-dna` benchmark, but
+// it cheats by combining all of the replacements into a single regex and
+// replacing them with a single linear scan. i.e., it re-implements
+// `replace_all`. As a result, this is around 25% faster. ---AG
+
+use std::io::{self, Read};
+use std::sync::Arc;
+use std::thread;
+
+// Compiles `$pattern` into a `regex::Regex` for matching on `&str`.
+// Panics (via `unwrap`) if the pattern does not compile.
+macro_rules! regex {
+    ($pattern:expr) => { ::regex::Regex::new($pattern).unwrap() };
+}
+
+fn main() {
+    // Buffer all of stdin up front and note its raw length.
+    let mut seq = String::with_capacity(50 * (1 << 20));
+    io::stdin().read_to_string(&mut seq).unwrap();
+    let ilen = seq.len();
+    // Remove each `>`-through-newline span and every other newline.
+    seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned();
+    let clen = seq.len();
+    let shared = Arc::new(seq.clone());
+    // Count the matches of each variant pattern on its own thread.
+    let variants = vec![
+        regex!("agggtaaa|tttaccct"),
+        regex!("[cgt]gggtaaa|tttaccc[acg]"),
+        regex!("a[act]ggtaaa|tttacc[agt]t"),
+        regex!("ag[act]gtaaa|tttac[agt]ct"),
+        regex!("agg[act]taaa|ttta[agt]cct"),
+        regex!("aggg[acg]aaa|ttt[cgt]ccct"),
+        regex!("agggt[cgt]aa|tt[acg]accct"),
+        regex!("agggta[cgt]a|t[acg]taccct"),
+        regex!("agggtaa[cgt]|[acg]ttaccct"),
+    ];
+    let counts: Vec<_> = variants
+        .into_iter()
+        .map(|re| {
+            let hay = Arc::clone(&shared);
+            (re.to_string(), thread::spawn(move || re.find_iter(&hay).count()))
+        })
+        .collect();
+
+    let substs = vec![
+        (b'B', "(c|g|t)"),
+        (b'D', "(a|g|t)"),
+        (b'H', "(a|c|t)"),
+        (b'K', "(g|t)"),
+        (b'M', "(a|c)"),
+        (b'N', "(a|c|g|t)"),
+        (b'R', "(a|g)"),
+        (b'S', "(c|g)"),
+        (b'V', "(a|c|g)"),
+        (b'W', "(a|t)"),
+        (b'Y', "(c|t)"),
+    ]; // combined into one regex in `replace_all`
+    let seq = replace_all(&seq, substs);
+    for (pattern, handle) in counts {
+        println!("{} {}", pattern, handle.join().unwrap());
+    }
+    println!("\n{}\n{}\n{}", ilen, clen, seq.len());
+}
+
+/// Rewrites `text` in one scan, replacing each listed byte with its string.
+fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String {
+    let alternates: Vec<String> =
+        substs.iter().map(|&(byte, _)| (byte as char).to_string()).collect();
+    let mut table = vec![""; 256];
+    for (byte, replacement) in substs {
+        table[byte as usize] = replacement;
+    }
+    // One regex for all bytes; `table` maps a matched byte to its replacement.
+    let re = regex!(&alternates.join("|"));
+    let mut out = String::with_capacity(text.len());
+    let tail = re.find_iter(text).fold(0, |copied, m| {
+        out.push_str(&text[copied..m.start()]);
+        out.push_str(table[text.as_bytes()[m.start()] as usize]);
+        m.end()
+    });
+    out.push_str(&text[tail..]);
+    out
+}
diff --git a/third_party/rust/regex/examples/shootout-regex-dna-replace.rs b/third_party/rust/regex/examples/shootout-regex-dna-replace.rs
new file mode 100644
index 0000000000..20694e06f3
--- /dev/null
+++ b/third_party/rust/regex/examples/shootout-regex-dna-replace.rs
@@ -0,0 +1,17 @@
+use std::io::{self, Read};
+
+// Builds via the internal `ExecBuilder`; panics if the pattern fails to build.
+macro_rules! regex {
+    ($re:expr) => {
+        regex::internal::ExecBuilder::new($re).build().unwrap().into_regex()
+    };
+}
+
+fn main() {
+    // Slurp stdin, then delete each `>`-through-newline span and all newlines.
+    let mut raw = String::with_capacity(50 * (1 << 20));
+    io::stdin().read_to_string(&mut raw).unwrap();
+    let pattern = regex!(">[^\n]*\n|\n");
+    let stripped = pattern.replace_all(&raw, "").into_owned();
+    println!("original: {}, replaced: {}", raw.len(), stripped.len());
+}
diff --git a/third_party/rust/regex/examples/shootout-regex-dna-single-cheat.rs b/third_party/rust/regex/examples/shootout-regex-dna-single-cheat.rs
new file mode 100644
index 0000000000..70a979c6d4
--- /dev/null
+++ b/third_party/rust/regex/examples/shootout-regex-dna-single-cheat.rs
@@ -0,0 +1,75 @@
+// The Computer Language Benchmarks Game
+// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
+//
+// contributed by the Rust Project Developers
+// contributed by TeXitoi
+// contributed by BurntSushi
+
+use std::io::{self, Read};
+
+// Compiles `$pattern` into a `regex::Regex` for matching on `&str`.
+// Panics (via `unwrap`) if the pattern does not compile.
+macro_rules! regex {
+    ($pattern:expr) => { ::regex::Regex::new($pattern).unwrap() };
+}
+
+fn main() {
+    // Buffer all of stdin up front and note its raw length.
+    let mut seq = String::with_capacity(50 * (1 << 20));
+    io::stdin().read_to_string(&mut seq).unwrap();
+    let ilen = seq.len();
+    // Remove each `>`-through-newline span and every other newline.
+    seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned();
+    let clen = seq.len();
+    // Print each variant pattern followed by its match count, in order.
+    let variants = vec![
+        regex!("agggtaaa|tttaccct"),
+        regex!("[cgt]gggtaaa|tttaccc[acg]"),
+        regex!("a[act]ggtaaa|tttacc[agt]t"),
+        regex!("ag[act]gtaaa|tttac[agt]ct"),
+        regex!("agg[act]taaa|ttta[agt]cct"),
+        regex!("aggg[acg]aaa|ttt[cgt]ccct"),
+        regex!("agggt[cgt]aa|tt[acg]accct"),
+        regex!("agggta[cgt]a|t[acg]taccct"),
+        regex!("agggtaa[cgt]|[acg]ttaccct"),
+    ];
+    for re in &variants {
+        println!("{} {}", re, re.find_iter(&seq).count());
+    }
+
+    let substs = vec![
+        (b'B', "(c|g|t)"),
+        (b'D', "(a|g|t)"),
+        (b'H', "(a|c|t)"),
+        (b'K', "(g|t)"),
+        (b'M', "(a|c)"),
+        (b'N', "(a|c|g|t)"),
+        (b'R', "(a|g)"),
+        (b'S', "(c|g)"),
+        (b'V', "(a|c|g)"),
+        (b'W', "(a|t)"),
+        (b'Y', "(c|t)"),
+    ]; // combined into one regex in `replace_all`
+    let seq = replace_all(&seq, substs);
+    println!("\n{}\n{}\n{}", ilen, clen, seq.len());
+}
+
+/// Rewrites `text` in one scan, replacing each listed byte with its string.
+fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String {
+    let alternates: Vec<String> =
+        substs.iter().map(|&(byte, _)| (byte as char).to_string()).collect();
+    let mut table = vec![""; 256];
+    for (byte, replacement) in substs {
+        table[byte as usize] = replacement;
+    }
+    // One regex for all bytes; `table` maps a matched byte to its replacement.
+    let re = regex!(&alternates.join("|"));
+    let mut out = String::with_capacity(text.len());
+    let tail = re.find_iter(text).fold(0, |copied, m| {
+        out.push_str(&text[copied..m.start()]);
+        out.push_str(table[text.as_bytes()[m.start()] as usize]);
+        m.end()
+    });
+    out.push_str(&text[tail..]);
+    out
+}
diff --git a/third_party/rust/regex/examples/shootout-regex-dna-single.rs b/third_party/rust/regex/examples/shootout-regex-dna-single.rs
new file mode 100644
index 0000000000..b474059600
--- /dev/null
+++ b/third_party/rust/regex/examples/shootout-regex-dna-single.rs
@@ -0,0 +1,57 @@
+// The Computer Language Benchmarks Game
+// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
+//
+// contributed by the Rust Project Developers
+// contributed by TeXitoi
+// contributed by BurntSushi
+
+use std::io::{self, Read};
+
+// Compiles `$pattern` into a `regex::Regex` for matching on `&str`.
+// Panics (via `unwrap`) if the pattern does not compile.
+macro_rules! regex {
+    ($pattern:expr) => { ::regex::Regex::new($pattern).unwrap() };
+}
+
+fn main() {
+    // Buffer all of stdin up front and note its raw length.
+    let mut seq = String::with_capacity(50 * (1 << 20));
+    io::stdin().read_to_string(&mut seq).unwrap();
+    let ilen = seq.len();
+    // Remove each `>`-through-newline span and every other newline.
+    seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned();
+    let clen = seq.len();
+    // Print each variant pattern followed by its match count, in order.
+    let variants = vec![
+        regex!("agggtaaa|tttaccct"),
+        regex!("[cgt]gggtaaa|tttaccc[acg]"),
+        regex!("a[act]ggtaaa|tttacc[agt]t"),
+        regex!("ag[act]gtaaa|tttac[agt]ct"),
+        regex!("agg[act]taaa|ttta[agt]cct"),
+        regex!("aggg[acg]aaa|ttt[cgt]ccct"),
+        regex!("agggt[cgt]aa|tt[acg]accct"),
+        regex!("agggta[cgt]a|t[acg]taccct"),
+        regex!("agggtaa[cgt]|[acg]ttaccct"),
+    ];
+    for re in &variants {
+        println!("{} {}", re, re.find_iter(&seq).count());
+    }
+    // Apply the substitutions one at a time, each as a full pass over `seq`.
+    let substs = vec![
+        (regex!("B"), "(c|g|t)"),
+        (regex!("D"), "(a|g|t)"),
+        (regex!("H"), "(a|c|t)"),
+        (regex!("K"), "(g|t)"),
+        (regex!("M"), "(a|c)"),
+        (regex!("N"), "(a|c|g|t)"),
+        (regex!("R"), "(a|g)"),
+        (regex!("S"), "(c|g)"),
+        (regex!("V"), "(a|c|g)"),
+        (regex!("W"), "(a|t)"),
+        (regex!("Y"), "(c|t)"),
+    ];
+    let seq = substs.into_iter().fold(seq, |acc, (re, replacement)| {
+        re.replace_all(&acc, replacement).into_owned()
+    });
+    println!("\n{}\n{}\n{}", ilen, clen, seq.len());
+}
diff --git a/third_party/rust/regex/examples/shootout-regex-dna.rs b/third_party/rust/regex/examples/shootout-regex-dna.rs
new file mode 100644
index 0000000000..b96518e4c4
--- /dev/null
+++ b/third_party/rust/regex/examples/shootout-regex-dna.rs
@@ -0,0 +1,68 @@
+// The Computer Language Benchmarks Game
+// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
+//
+// contributed by the Rust Project Developers
+// contributed by TeXitoi
+// contributed by BurntSushi
+
+use std::io::{self, Read};
+use std::sync::Arc;
+use std::thread;
+
+// Compiles `$pattern` into a `regex::Regex` for matching on `&str`.
+// Panics (via `unwrap`) if the pattern does not compile.
+macro_rules! regex {
+    ($pattern:expr) => { ::regex::Regex::new($pattern).unwrap() };
+}
+
+fn main() {
+    // Buffer all of stdin up front and note its raw length.
+    let mut seq = String::with_capacity(51 * (1 << 20));
+    io::stdin().read_to_string(&mut seq).unwrap();
+    let ilen = seq.len();
+    // Remove each `>`-through-newline span and every other newline.
+    seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned();
+    let clen = seq.len();
+    let shared = Arc::new(seq.clone());
+    // Count the matches of each variant pattern on its own thread.
+    let variants = vec![
+        regex!("agggtaaa|tttaccct"),
+        regex!("[cgt]gggtaaa|tttaccc[acg]"),
+        regex!("a[act]ggtaaa|tttacc[agt]t"),
+        regex!("ag[act]gtaaa|tttac[agt]ct"),
+        regex!("agg[act]taaa|ttta[agt]cct"),
+        regex!("aggg[acg]aaa|ttt[cgt]ccct"),
+        regex!("agggt[cgt]aa|tt[acg]accct"),
+        regex!("agggta[cgt]a|t[acg]taccct"),
+        regex!("agggtaa[cgt]|[acg]ttaccct"),
+    ];
+    let counts: Vec<_> = variants
+        .into_iter()
+        .map(|re| {
+            let hay = Arc::clone(&shared);
+            (re.to_string(), thread::spawn(move || re.find_iter(&hay).count()))
+        })
+        .collect();
+    // Apply the substitutions one at a time, each as a full pass over `seq`.
+    let substs = vec![
+        (regex!("B"), "(c|g|t)"),
+        (regex!("D"), "(a|g|t)"),
+        (regex!("H"), "(a|c|t)"),
+        (regex!("K"), "(g|t)"),
+        (regex!("M"), "(a|c)"),
+        (regex!("N"), "(a|c|g|t)"),
+        (regex!("R"), "(a|g)"),
+        (regex!("S"), "(c|g)"),
+        (regex!("V"), "(a|c|g)"),
+        (regex!("W"), "(a|t)"),
+        (regex!("Y"), "(c|t)"),
+    ];
+    let seq = substs.into_iter().fold(seq, |acc, (re, replacement)| {
+        re.replace_all(&acc, replacement).into_owned()
+    });
+
+    for (pattern, handle) in counts {
+        println!("{} {}", pattern, handle.join().unwrap());
+    }
+    println!("\n{}\n{}\n{}", ilen, clen, seq.len());
+}
diff --git a/third_party/rust/regex/rustfmt.toml b/third_party/rust/regex/rustfmt.toml
new file mode 100644
index 0000000000..aa37a218b9
--- /dev/null
+++ b/third_party/rust/regex/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/third_party/rust/regex/src/backtrack.rs b/third_party/rust/regex/src/backtrack.rs
new file mode 100644
index 0000000000..4d83856ca0
--- /dev/null
+++ b/third_party/rust/regex/src/backtrack.rs
@@ -0,0 +1,282 @@
+// This is the backtracking matching engine. It has the same exact capability
+// as the full NFA simulation, except it is artificially restricted to small
+// regexes on small inputs because of its memory requirements.
+//
+// In particular, this is a *bounded* backtracking engine. It retains worst
+// case linear time by keeping track of the states that it has visited (using a
+// bitmap). Namely, once a state is visited, it is never visited again. Since a
+// state is keyed by `(instruction index, input index)`, we have that its time
+// complexity is `O(mn)` (i.e., linear in the size of the search text).
+//
+// The backtracking engine can beat out the NFA simulation on small
+// regexes/inputs because it doesn't have to keep track of multiple copies of
+// the capture groups. In benchmarks, the backtracking engine is roughly twice
+// as fast as the full NFA simulation. Note though that its performance doesn't
+// scale, even if you're willing to live with the memory requirements. Namely,
+// the bitset has to be zeroed on each execution, which becomes quite expensive
+// on large bitsets.
+
+use crate::exec::ProgramCache;
+use crate::input::{Input, InputAt};
+use crate::prog::{InstPtr, Program};
+use crate::re_trait::Slot;
+
+type Bits = u32; // element type of the `visited` bitmap in `Cache`
+
+const BIT_SIZE: usize = 32; // number of bits in one `Bits` word
+const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
+
+/// Returns true iff the given regex and input should be executed by this
+/// engine with reasonable memory usage (at most `MAX_SIZE_BYTES`).
+pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
+ // Total memory usage in bytes is determined by:
+ //
+ // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
+ //
+ // The actual limit picked is pretty much a heuristic.
+ // See: https://github.com/rust-lang/regex/issues/215
+ let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; // 4 == size_of::<u32>(); NOTE(review): arithmetic is unchecked - confirm callers cannot overflow usize here
+ size <= MAX_SIZE_BYTES
+}
+
+/// A backtracking matching engine.
+#[derive(Debug)]
+pub struct Bounded<'a, 'm, 'r, 's, I> {
+ prog: &'r Program, // program whose instructions are executed
+ input: I, // text being searched
+ matches: &'m mut [bool], // entry `i` is set when `Match(i)` is reached
+ slots: &'s mut [Slot], // capture positions written by `Save` instructions
+ m: &'a mut Cache, // reusable job stack and visited bitmap
+}
+
+/// Shared cached state between multiple invocations of a backtracking engine
+/// in the same thread.
+#[derive(Clone, Debug)]
+pub struct Cache {
+ jobs: Vec<Job>, // explicit stack of pending backtracking work
+ visited: Vec<Bits>, // bitmap with one bit per (instruction, input position) state
+}
+
+impl Cache {
+ /// Create a new, empty cache for the backtracking engine (`_prog` is unused).
+ pub fn new(_prog: &Program) -> Self {
+ Cache { jobs: vec![], visited: vec![] }
+ }
+}
+
+/// A job is an explicit unit of stack space in the backtracking engine.
+///
+/// The "normal" representation is a single state transition, which corresponds
+/// to an NFA state and a character in the input. However, the backtracking
+/// engine must keep track of old capture group values. We use the explicit
+/// stack to do it.
+#[derive(Clone, Copy, Debug)]
+enum Job {
+ Inst { ip: InstPtr, at: InputAt }, // resume matching at instruction `ip`, input position `at`
+ SaveRestore { slot: usize, old_pos: Option<usize> }, // put `old_pos` back into `slots[slot]` when popped
+}
+
+impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
+ /// Execute the backtracking matching engine.
+ ///
+ /// If there's a match, `exec` returns `true` and populates the given
+ /// `matches` and `slots` accordingly.
+ pub fn exec(
+ prog: &'r Program,
+ cache: &ProgramCache,
+ matches: &'m mut [bool],
+ slots: &'s mut [Slot],
+ input: I,
+ start: usize,
+ end: usize,
+ ) -> bool {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.backtrack; // only the backtracker's part of the shared cache
+ let start = input.at(start);
+ let mut b = Bounded { prog, input, matches, slots, m: cache };
+ b.exec_(start, end)
+ }
+
+ /// Clears the cache such that the backtracking engine can be executed
+ /// on some input of fixed length.
+ fn clear(&mut self) {
+ // Reset the job memory so that we start fresh.
+ self.m.jobs.clear();
+
+ // Now we need to clear the bit state set.
+ // We do this by figuring out how much space we need to keep track
+ // of the states we've visited.
+ // Then we reset all existing allocated space to 0.
+ // Finally, we request more space if we need it.
+ //
+ // This is all a little circuitous, but doing this using unchecked
+ // operations doesn't seem to have a measurable impact on performance.
+ // (Probably because backtracking is limited to such small
+ // inputs/regexes in the first place.)
+ let visited_len =
+ (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
+ / BIT_SIZE;
+ self.m.visited.truncate(visited_len); // drop surplus words left from a larger earlier run
+ for v in &mut self.m.visited {
+ *v = 0;
+ }
+ if visited_len > self.m.visited.len() {
+ let len = self.m.visited.len();
+ self.m.visited.reserve_exact(visited_len - len);
+ for _ in 0..(visited_len - len) {
+ self.m.visited.push(0);
+ }
+ }
+ }
+
+ /// Start backtracking at the given position in the input, but also look
+ /// for literal prefixes. Returns `true` if any attempt matched.
+ fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
+ self.clear();
+ // If this is an anchored regex at the beginning of the input, then
+ // we're either already done or we only need to try backtracking once.
+ if self.prog.is_anchored_start {
+ return if !at.is_start() { false } else { self.backtrack(at) };
+ }
+ let mut matched = false;
+ loop {
+ if !self.prog.prefixes.is_empty() {
+ at = match self.input.prefix_at(&self.prog.prefixes, at) {
+ None => break,
+ Some(at) => at,
+ };
+ }
+ matched = self.backtrack(at) || matched; // `backtrack` is on the left so it always runs
+ if matched && self.prog.matches.len() == 1 {
+ return true;
+ }
+ if at.pos() >= end {
+ break;
+ }
+ at = self.input.at(at.next_pos());
+ }
+ matched
+ }
+
+ /// The main backtracking loop starting at the given input position.
+ fn backtrack(&mut self, start: InputAt) -> bool {
+ // N.B. We use an explicit stack to avoid recursion.
+ // To avoid excessive pushing and popping, most transitions are handled
+ // in the `step` helper function, which only pushes to the stack when
+ // there's a capture or a branch.
+ let mut matched = false;
+ self.m.jobs.push(Job::Inst { ip: 0, at: start });
+ while let Some(job) = self.m.jobs.pop() {
+ match job {
+ Job::Inst { ip, at } => {
+ if self.step(ip, at) {
+ // Only quit if we're matching one regex.
+ // If we're matching a regex set, then press on and
+ // try to find other matches (if we want them).
+ if self.prog.matches.len() == 1 {
+ return true;
+ }
+ matched = true;
+ }
+ }
+ Job::SaveRestore { slot, old_pos } => {
+ if slot < self.slots.len() {
+ self.slots[slot] = old_pos;
+ }
+ }
+ }
+ }
+ matched
+ }
+
+ fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { // true iff a `Match` instruction is reached
+ use crate::prog::Inst::*;
+ loop {
+ // This loop is an optimization to avoid constantly pushing/popping
+ // from the stack. Namely, if we're pushing a job only to run it
+ // next, avoid the push and just mutate `ip` (and possibly `at`)
+ // in place.
+ if self.has_visited(ip, at) {
+ return false;
+ }
+ match self.prog[ip] {
+ Match(slot) => {
+ if slot < self.matches.len() {
+ self.matches[slot] = true;
+ }
+ return true;
+ }
+ Save(ref inst) => {
+ if let Some(&old_pos) = self.slots.get(inst.slot) {
+ // If this path doesn't work out, then we save the old
+ // capture index (if one exists) in an alternate
+ // job. If the next path fails, then the alternate
+ // job is popped and the old capture index is restored.
+ self.m.jobs.push(Job::SaveRestore {
+ slot: inst.slot,
+ old_pos,
+ });
+ self.slots[inst.slot] = Some(at.pos());
+ }
+ ip = inst.goto;
+ }
+ Split(ref inst) => {
+ self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); // queue `goto2` for later
+ ip = inst.goto1; // and follow `goto1` right away
+ }
+ EmptyLook(ref inst) => {
+ if self.input.is_empty_match(at, inst) {
+ ip = inst.goto;
+ } else {
+ return false;
+ }
+ }
+ Char(ref inst) => {
+ if inst.c == at.char() {
+ ip = inst.goto;
+ at = self.input.at(at.next_pos());
+ } else {
+ return false;
+ }
+ }
+ Ranges(ref inst) => {
+ if inst.matches(at.char()) {
+ ip = inst.goto;
+ at = self.input.at(at.next_pos());
+ } else {
+ return false;
+ }
+ }
+ Bytes(ref inst) => {
+ if let Some(b) = at.byte() {
+ if inst.matches(b) {
+ ip = inst.goto;
+ at = self.input.at(at.next_pos());
+ continue;
+ }
+ }
+ return false;
+ }
+ }
+ }
+ }
+
+ fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { // also marks the state as visited
+ let k = ip * (self.input.len() + 1) + at.pos(); // flat index of the state (ip, at)
+ let k1 = k / BIT_SIZE; // word of `visited` holding the bit
+ let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); // mask selecting the bit inside that word
+ if self.m.visited[k1] & k2 == 0 {
+ self.m.visited[k1] |= k2;
+ false
+ } else {
+ true
+ }
+ }
+}
+
+fn usize_to_u32(n: usize) -> u32 { // converts `n`, panicking instead of truncating
+ if (n as u64) > (::std::u32::MAX as u64) {
+ panic!("BUG: {} is too big to fit into u32", n)
+ }
+ n as u32 // lossless: the range was checked above
+}
diff --git a/third_party/rust/regex/src/compile.rs b/third_party/rust/regex/src/compile.rs
new file mode 100644
index 0000000000..90ca25015f
--- /dev/null
+++ b/third_party/rust/regex/src/compile.rs
@@ -0,0 +1,1264 @@
+use std::collections::HashMap;
+use std::fmt;
+use std::iter;
+use std::result;
+use std::sync::Arc;
+
+use regex_syntax::hir::{self, Hir};
+use regex_syntax::is_word_byte;
+use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
+
+use crate::prog::{
+ EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
+ InstSave, InstSplit, Program,
+};
+
+use crate::Error;
+
+type Result = result::Result<Patch, Error>; // a fragment that always emits instructions
+type ResultOrEmpty = result::Result<Option<Patch>, Error>; // `None` when no instruction was emitted
+
+#[derive(Debug)]
+struct Patch {
+ hole: Hole, // jumps of this fragment that still need a target
+ entry: InstPtr, // first instruction of this fragment
+}
+
+/// A compiler translates a parsed regular expression (`Hir`) to a sequence
+/// of instructions. The sequence of instructions represents an NFA.
+// `Compiler` is only public via the `internal` module, so avoid deriving
+// `Debug`.
+#[allow(missing_debug_implementations)]
+pub struct Compiler {
+ insts: Vec<MaybeInst>, // instructions emitted so far; some may still have unfilled holes
+ compiled: Program, // the program under construction, returned by `compile`
+ capture_name_idx: HashMap<String, usize>, // capture group name -> capture index
+ num_exprs: usize, // number of expressions passed to `compile`
+ size_limit: usize, // approximate program size limit, in bytes
+ suffix_cache: SuffixCache,
+ utf8_seqs: Option<Utf8Sequences>,
+ byte_classes: ByteClassSet,
+ // This keeps track of extra bytes allocated while compiling the regex
+ // program. Currently, this corresponds to two things. First is the heap
+ // memory allocated by Unicode character classes ('InstRanges'). Second is
+ // a "fake" amount of memory used by empty sub-expressions, so that enough
+ // empty sub-expressions will ultimately trigger the compiler to bail
+ // because of a size limit restriction. (That empty sub-expressions don't
+ // add to heap memory usage is more-or-less an implementation detail.) In
+ // the second case, if we don't bail, then an excessively large repetition
+ // on an empty sub-expression can result in the compiler using a very large
+ // amount of CPU time.
+ extra_inst_bytes: usize,
+}
+
+impl Compiler {
+ /// Create a new regular expression compiler.
+ ///
+ /// Various options can be set before calling `compile` on an expression.
+ pub fn new() -> Self {
+ Compiler {
+ insts: vec![],
+ compiled: Program::new(),
+ capture_name_idx: HashMap::new(),
+ num_exprs: 0,
+ size_limit: 10 * (1 << 20), // default limit: 10 MiB
+ suffix_cache: SuffixCache::new(1000),
+ utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
+ byte_classes: ByteClassSet::new(),
+ extra_inst_bytes: 0,
+ }
+ }
+
+ /// The size of the resulting program is limited by `size_limit`. If
+ /// the program approximately exceeds the given size (in bytes), then
+ /// compilation will stop and return an error. Returns `self` for chaining.
+ pub fn size_limit(mut self, size_limit: usize) -> Self {
+ self.size_limit = size_limit;
+ self
+ }
+
+ /// If bytes is true, then the program is compiled as a byte based
+ /// automaton, which incorporates UTF-8 decoding into the machine. If it's
+ /// false, then the automaton is Unicode scalar value based, e.g., an
+ /// engine utilizing such an automaton is responsible for UTF-8 decoding.
+ ///
+ /// The specific invariant is that when returning a byte based machine,
+ /// neither the `Char` nor the `Ranges` instruction is produced.
+ /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
+ /// instruction is never produced.
+ ///
+ /// Note that `dfa(true)` implies `bytes(true)`.
+ pub fn bytes(mut self, yes: bool) -> Self {
+ self.compiled.is_bytes = yes;
+ self
+ }
+
+ /// When disabled, the compiled program may match arbitrary bytes.
+ ///
+ /// When enabled (the default), all compiled programs exclusively match
+ /// valid UTF-8 bytes.
+ pub fn only_utf8(mut self, yes: bool) -> Self {
+ self.compiled.only_utf8 = yes;
+ self
+ }
+
+ /// When set, the machine returned is suitable for use in the DFA matching
+ /// engine.
+ ///
+ /// In particular, this ensures that if the regex is not anchored in the
+ /// beginning, then a preceding `.*?` is included in the program. (The NFA
+ /// based engines handle the preceding `.*?` explicitly, which is difficult
+ /// or impossible in the DFA engine.)
+ pub fn dfa(mut self, yes: bool) -> Self {
+ self.compiled.is_dfa = yes; // also makes `c_capture` skip `Save` instructions
+ self
+ }
+
+ /// When set, the machine returned is suitable for matching text in
+ /// reverse. In particular, all concatenations are flipped.
+ pub fn reverse(mut self, yes: bool) -> Self {
+ self.compiled.is_reverse = yes; // `c` also swaps start/end anchors when this is set
+ self
+ }
+
+ /// Compile one or more regular expressions given as `Hir` values.
+ ///
+ /// The compiler is guaranteed to succeed unless the program exceeds the
+ /// specified size limit. If the size limit is exceeded, then compilation
+ /// stops and returns an error.
+ pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
+ debug_assert!(!exprs.is_empty()); // callers must supply at least one expression
+ self.num_exprs = exprs.len();
+ if exprs.len() == 1 {
+ self.compile_one(&exprs[0])
+ } else {
+ self.compile_many(exprs)
+ }
+ }
+
+ fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+ // If we're compiling a forward DFA and we aren't anchored, then
+ // add a `.*?` before the first capture group.
+ // Other matching engines handle this by baking the logic into the
+ // matching engine itself.
+ let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
+ self.compiled.is_anchored_start = expr.is_anchored_start();
+ self.compiled.is_anchored_end = expr.is_anchored_end();
+ if self.compiled.needs_dotstar() {
+ dotstar_patch = self.c_dotstar()?;
+ self.compiled.start = dotstar_patch.entry;
+ }
+ self.compiled.captures = vec![None]; // entry 0 (the whole-expression capture) has no name
+ let patch =
+ self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
+ if self.compiled.needs_dotstar() {
+ self.fill(dotstar_patch.hole, patch.entry); // the `.*?` prefix falls through into the expression
+ } else {
+ self.compiled.start = patch.entry;
+ }
+ self.fill_to_next(patch.hole);
+ self.compiled.matches = vec![self.insts.len()]; // index of the `Match` pushed on the next line
+ self.push_compiled(Inst::Match(0));
+ self.compile_finish()
+ }
+
+ fn compile_many(
+ mut self,
+ exprs: &[Hir],
+ ) -> result::Result<Program, Error> {
+ debug_assert!(exprs.len() > 1);
+
+ self.compiled.is_anchored_start =
+ exprs.iter().all(|e| e.is_anchored_start());
+ self.compiled.is_anchored_end =
+ exprs.iter().all(|e| e.is_anchored_end());
+ let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
+ if self.compiled.needs_dotstar() {
+ dotstar_patch = self.c_dotstar()?;
+ self.compiled.start = dotstar_patch.entry;
+ } else {
+ self.compiled.start = 0; // first instruction is always split
+ }
+ self.fill_to_next(dotstar_patch.hole);
+
+ let mut prev_hole = Hole::None;
+ for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { // every expression except the last gets a leading split
+ self.fill_to_next(prev_hole);
+ let split = self.push_split_hole();
+ let Patch { hole, entry } =
+ self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
+ self.fill_to_next(hole);
+ self.compiled.matches.push(self.insts.len());
+ self.push_compiled(Inst::Match(i));
+ prev_hole = self.fill_split(split, Some(entry), None); // second branch is filled on the next iteration
+ }
+ let i = exprs.len() - 1; // the last expression needs no split of its own
+ let Patch { hole, entry } =
+ self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
+ self.fill(prev_hole, entry);
+ self.fill_to_next(hole);
+ self.compiled.matches.push(self.insts.len());
+ self.push_compiled(Inst::Match(i));
+ self.compile_finish()
+ }
+
+ fn compile_finish(mut self) -> result::Result<Program, Error> { // moves the builder state into the final `Program`
+ self.compiled.insts =
+ self.insts.into_iter().map(|inst| inst.unwrap()).collect(); // NOTE(review): presumably panics on an unfilled hole - confirm in `MaybeInst::unwrap`
+ self.compiled.byte_classes = self.byte_classes.byte_classes();
+ self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
+ Ok(self.compiled)
+ }
+
+ /// Compile expr into self.insts, returning a patch on success,
+ /// or an error if we run out of memory.
+ ///
+ /// All of the c_* methods of the compiler share the contract outlined
+ /// here.
+ ///
+ /// The main thing that a c_* method does is mutate `self.insts`
+ /// to add a list of mostly compiled instructions required to execute
+ /// the given expression. `self.insts` contains MaybeInsts rather than
+ /// Insts because there is some backpatching required.
+ ///
+ /// The `Patch` value returned by each c_* method provides metadata
+ /// about the compiled instructions emitted to `self.insts`. The
+ /// `entry` member of the patch refers to the first instruction
+ /// (the entry point), while the `hole` member contains zero or
+ /// more offsets to partial instructions that need to be backpatched.
+ /// The c_* routine can't know where its list of instructions are going to
+ /// jump to after execution, so it is up to the caller to patch
+ /// these jumps to point to the right place. So compiling some
+ /// expression, e, we would end up with a situation that looked like:
+ ///
+ /// ```text
+ /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
+ ///                     ^               ^            ^
+ ///                     |                \          /
+ ///                   entry               \        /
+ ///                                          hole
+ /// ```
+ ///
+ /// To compile two expressions, e1 and e2, concatenated together we
+ /// would do:
+ ///
+ /// ```ignore
+ /// let patch1 = self.c(e1);
+ /// let patch2 = self.c(e2);
+ /// ```
+ ///
+ /// which leaves us with a situation that looks like
+ ///
+ /// ```text
+ /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
+ ///                     ^          ^          ^          ^
+ ///                     |          |          |          |
+ ///                   entry1     hole1      entry2     hole2
+ /// ```
+ ///
+ /// Then to merge the two patches together into one we would backpatch
+ /// hole1 with entry2 and return a new patch that enters at entry1
+ /// and has hole2 for a hole. In fact, if you look at the c_concat
+ /// method you will see that it does exactly this, though it handles
+ /// a list of expressions rather than just the two that we use for
+ /// an example.
+ ///
+ /// Ok(None) is returned when an expression is compiled to no
+ /// instruction, and so no patch.entry value makes sense.
+ fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
+ use crate::prog;
+ use regex_syntax::hir::HirKind::*;
+
+ self.check_size()?; // propagate an error before emitting anything more
+ match *expr.kind() {
+ Empty => self.c_empty(),
+ Literal(hir::Literal::Unicode(c)) => self.c_char(c),
+ Literal(hir::Literal::Byte(b)) => {
+ assert!(self.compiled.uses_bytes());
+ self.c_byte(b)
+ }
+ Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
+ Class(hir::Class::Bytes(ref cls)) => {
+ if self.compiled.uses_bytes() {
+ self.c_class_bytes(cls.ranges())
+ } else {
+ assert!(cls.is_all_ascii());
+ let mut char_ranges = vec![]; // the same ASCII ranges, re-expressed as `char` ranges
+ for r in cls.iter() {
+ let (s, e) = (r.start() as char, r.end() as char);
+ char_ranges.push(hir::ClassUnicodeRange::new(s, e));
+ }
+ self.c_class(&char_ranges)
+ }
+ }
+ Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { // reverse programs swap each start/end anchor
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ Anchor(hir::Anchor::StartLine) => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ Anchor(hir::Anchor::EndLine) => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::EndText)
+ }
+ Anchor(hir::Anchor::StartText) => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ Anchor(hir::Anchor::EndText) => {
+ self.c_empty_look(prog::EmptyLook::EndText)
+ }
+ WordBoundary(hir::WordBoundary::Unicode) => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
+ }
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // We also make sure that all ASCII bytes are in a different
+ // class from non-ASCII bytes. Otherwise, it's possible for
+ // ASCII bytes to get lumped into the same class as non-ASCII
+ // bytes. This in turn may cause the lazy DFA to falsely start
+ // when it sees an ASCII byte that maps to a byte class with
+ // non-ASCII bytes. This ensures that never happens.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::WordBoundary)
+ }
+ WordBoundary(hir::WordBoundary::UnicodeNegate) => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
+ }
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // See comments above for why we set the ASCII range here.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::NotWordBoundary)
+ }
+ WordBoundary(hir::WordBoundary::Ascii) => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
+ }
+ WordBoundary(hir::WordBoundary::AsciiNegate) => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
+ }
+ Group(ref g) => match g.kind {
+ hir::GroupKind::NonCapturing => self.c(&g.hir),
+ hir::GroupKind::CaptureIndex(index) => {
+ if index as usize >= self.compiled.captures.len() {
+ self.compiled.captures.push(None); // unnamed group
+ }
+ self.c_capture(2 * index as usize, &g.hir) // each group owns slots 2*index and 2*index + 1
+ }
+ hir::GroupKind::CaptureName { index, ref name } => {
+ if index as usize >= self.compiled.captures.len() {
+ let n = name.to_string();
+ self.compiled.captures.push(Some(n.clone()));
+ self.capture_name_idx.insert(n, index as usize);
+ }
+ self.c_capture(2 * index as usize, &g.hir)
+ }
+ },
+ Concat(ref es) => {
+ if self.compiled.is_reverse {
+ self.c_concat(es.iter().rev())
+ } else {
+ self.c_concat(es)
+ }
+ }
+ Alternation(ref es) => self.c_alternate(&**es),
+ Repetition(ref rep) => self.c_repeat(rep),
+ }
+ }
+
+ fn c_empty(&mut self) -> ResultOrEmpty {
+ // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+ // See: CVE-2022-24713
+ //
+ // Since 'empty' sub-expressions don't increase the size of
+ // the actual compiled object, we "fake" an increase in its
+ // size so that our 'check_size' routine will eventually
+ // stop compilation if there are too many empty sub-expressions
+ // (e.g., via a large repetition).
+ self.extra_inst_bytes += std::mem::size_of::<Inst>();
+ Ok(None)
+ }
+
+ fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
+ if self.num_exprs > 1 || self.compiled.is_dfa {
+ // Don't ever compile Save instructions for regex sets because
+ // they are never used. They are also never used in DFA programs
+ // because DFAs can't handle captures.
+ self.c(expr)
+ } else {
+ let entry = self.insts.len(); // the opening `Save` is the fragment's entry point
+ let hole = self.push_hole(InstHole::Save { slot: first_slot });
+ let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
+ self.fill(hole, patch.entry);
+ self.fill_to_next(patch.hole);
+ let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); // closing `Save`; its jump is left for the caller
+ Ok(Some(Patch { hole, entry }))
+ }
+ }
+
+ fn c_dotstar(&mut self) -> Result { // compiles a non-greedy zero-or-more repetition of `Hir::any`
+ Ok(if !self.compiled.only_utf8() {
+ self.c(&Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy: false,
+ hir: Box::new(Hir::any(true)), // NOTE(review): `true` presumably selects the byte-oriented `any` - confirm in regex-syntax
+ }))?
+ .unwrap()
+ } else {
+ self.c(&Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy: false,
+ hir: Box::new(Hir::any(false)), // the two branches differ only in this argument
+ }))?
+ .unwrap()
+ })
+ }
+
+ fn c_char(&mut self, c: char) -> ResultOrEmpty {
+ if self.compiled.uses_bytes() {
+ if c.is_ascii() {
+ let b = c as u8; // safe narrowing: `c` is ASCII
+ let hole =
+ self.push_hole(InstHole::Bytes { start: b, end: b });
+ self.byte_classes.set_range(b, b);
+ Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
+ } else {
+ self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) // non-ASCII: compile as the one-char class [c, c]
+ }
+ } else {
+ let hole = self.push_hole(InstHole::Char { c });
+ Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) // entry is the hole just pushed
+ }
+ }
+
+ fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
+ use std::mem::size_of;
+
+ assert!(!ranges.is_empty());
+ if self.compiled.uses_bytes() {
+ Ok(Some(CompileClass { c: self, ranges }.compile()?))
+ } else {
+ let ranges: Vec<(char, char)> =
+ ranges.iter().map(|r| (r.start(), r.end())).collect();
+ let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { // a single one-char range becomes a plain `Char`
+ self.push_hole(InstHole::Char { c: ranges[0].0 })
+ } else {
+ self.extra_inst_bytes +=
+ ranges.len() * (size_of::<char>() * 2); // account for the heap memory held by `ranges`
+ self.push_hole(InstHole::Ranges { ranges })
+ };
+ Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
+ }
+ }
+
+ fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
+ self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) // a byte is the one-element range [b, b]
+ }
+
+ fn c_class_bytes(
+ &mut self,
+ ranges: &[hir::ClassBytesRange],
+ ) -> ResultOrEmpty {
+ debug_assert!(!ranges.is_empty());
+
+ let first_split_entry = self.insts.len();
+ let mut holes = vec![]; // one exit per `Bytes` instruction, all patched to the same target
+ let mut prev_hole = Hole::None;
+ for r in &ranges[0..ranges.len() - 1] { // every range except the last gets a split in front of it
+ self.fill_to_next(prev_hole);
+ let split = self.push_split_hole();
+ let next = self.insts.len();
+ self.byte_classes.set_range(r.start(), r.end());
+ holes.push(self.push_hole(InstHole::Bytes {
+ start: r.start(),
+ end: r.end(),
+ }));
+ prev_hole = self.fill_split(split, Some(next), None); // second branch will point at the next alternative
+ }
+ let next = self.insts.len();
+ let r = &ranges[ranges.len() - 1]; // the last range needs no split
+ self.byte_classes.set_range(r.start(), r.end());
+ holes.push(
+ self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
+ );
+ self.fill(prev_hole, next);
+ Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
+ }
+
+ fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
+ let hole = self.push_hole(InstHole::EmptyLook { look });
+ Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) // entry is the instruction just pushed
+ }
+
+ fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
+ where
+ I: IntoIterator<Item = &'a Hir>,
+ {
+ let mut exprs = exprs.into_iter();
+ let Patch { mut hole, entry } = loop { // skip leading expressions that emit no instructions
+ match exprs.next() {
+ None => return self.c_empty(), // nothing emitted at all
+ Some(e) => {
+ if let Some(p) = self.c(e)? {
+ break p;
+ }
+ }
+ }
+ };
+ for e in exprs {
+ if let Some(p) = self.c(e)? {
+ self.fill(hole, p.entry); // chain the previous fragment into this one
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+
+ fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
+ debug_assert!(
+ exprs.len() >= 2,
+ "alternates must have at least 2 exprs"
+ );
+
+ // Initial entry point is always the first split.
+ let first_split_entry = self.insts.len();
+
+ // Save up all of the holes from each alternate. They will all get
+ // patched to point to the same location.
+ let mut holes = vec![];
+
+ // true indicates that the hole is a split where we want to fill
+ // the second branch.
+ let mut prev_hole = (Hole::None, false);
+ for e in &exprs[0..exprs.len() - 1] {
+ if prev_hole.1 {
+ let next = self.insts.len();
+ self.fill_split(prev_hole.0, None, Some(next));
+ } else {
+ self.fill_to_next(prev_hole.0);
+ }
+ let split = self.push_split_hole();
+ if let Some(Patch { hole, entry }) = self.c(e)? {
+ holes.push(hole);
+ prev_hole = (self.fill_split(split, Some(entry), None), false);
+ } else {
+ let (split1, split2) = split.dup_one(); // empty branch: the split itself stands in for it
+ holes.push(split1);
+ prev_hole = (split2, true);
+ }
+ }
+ if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { // the last alternate needs no split
+ holes.push(hole);
+ if prev_hole.1 {
+ self.fill_split(prev_hole.0, None, Some(entry));
+ } else {
+ self.fill(prev_hole.0, entry);
+ }
+ } else {
+ // We ignore prev_hole.1. When it's true, it means we have two
+ // empty branches both pushing prev_hole.0 into holes, so both
+ // branches will go to the same place anyway.
+ holes.push(prev_hole.0);
+ }
+ Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
+ }
+
+ fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { // dispatches on the repetition kind
+ use regex_syntax::hir::RepetitionKind::*;
+ match rep.kind {
+ ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
+ ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
+ OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
+ Range(hir::RepetitionRange::Exactly(min_max)) => {
+ self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) // exactly n: min == max
+ }
+ Range(hir::RepetitionRange::AtLeast(min)) => {
+ self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
+ }
+ Range(hir::RepetitionRange::Bounded(min, max)) => {
+ self.c_repeat_range(&rep.hir, rep.greedy, min, max)
+ }
+ }
+ }
+
+ fn c_repeat_zero_or_one(
+ &mut self,
+ expr: &Hir,
+ greedy: bool,
+ ) -> ResultOrEmpty {
+ let split_entry = self.insts.len();
+ let split = self.push_split_hole();
+ let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
+ Some(p) => p,
+ None => return self.pop_split_hole(), // `expr` emitted nothing, so the split is not needed
+ };
+ let split_hole = if greedy {
+ self.fill_split(split, Some(entry_rep), None) // greedy: `expr` is the first branch
+ } else {
+ self.fill_split(split, None, Some(entry_rep)) // non-greedy: `expr` is the second branch
+ };
+ let holes = vec![hole_rep, split_hole]; // both the end of `expr` and the skip branch exit the fragment
+ Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
+ }
+
+ fn c_repeat_zero_or_more(
+ &mut self,
+ expr: &Hir,
+ greedy: bool,
+ ) -> ResultOrEmpty {
+ let split_entry = self.insts.len();
+ let split = self.push_split_hole();
+ let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
+ Some(p) => p,
+ None => return self.pop_split_hole(), // `expr` emitted nothing, so the split is not needed
+ };
+
+ self.fill(hole_rep, split_entry); // loop back to the split after each iteration
+ let split_hole = if greedy {
+ self.fill_split(split, Some(entry_rep), None)
+ } else {
+ self.fill_split(split, None, Some(entry_rep))
+ };
+ Ok(Some(Patch { hole: split_hole, entry: split_entry }))
+ }
+
+ fn c_repeat_one_or_more(
+ &mut self,
+ expr: &Hir,
+ greedy: bool,
+ ) -> ResultOrEmpty {
+ let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
+ Some(p) => p,
+ None => return Ok(None),
+ };
+ self.fill_to_next(hole_rep); // `expr` falls through into the split pushed next
+ let split = self.push_split_hole();
+
+ let split_hole = if greedy {
+ self.fill_split(split, Some(entry_rep), None) // greedy: looping back is the first branch
+ } else {
+ self.fill_split(split, None, Some(entry_rep))
+ };
+ Ok(Some(Patch { hole: split_hole, entry: entry_rep })) // entry is `expr` itself, so it runs at least once
+ }
+
+ fn c_repeat_range_min_or_more(
+ &mut self,
+ expr: &Hir,
+ greedy: bool,
+ min: u32,
+ ) -> ResultOrEmpty {
+ let min = u32_to_usize(min);
+ // Using next_inst() is ok, because we can't return it (concat would
+ // have to return Some(_) while c_repeat_zero_or_more returns
+ // None).
+ let patch_concat = self
+ .c_concat(iter::repeat(expr).take(min))? // `min` mandatory copies of `expr`
+ .unwrap_or_else(|| self.next_inst());
+ if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { // followed by `expr*`
+ self.fill(patch_concat.hole, patch_rep.entry);
+ Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
+ } else {
+ Ok(None)
+ }
+ }
+
+ fn c_repeat_range(
+ &mut self,
+ expr: &Hir,
+ greedy: bool,
+ min: u32,
+ max: u32,
+ ) -> ResultOrEmpty {
+ let (min, max) = (u32_to_usize(min), u32_to_usize(max));
+ debug_assert!(min <= max);
+ let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
+ if min == max {
+ return Ok(patch_concat);
+ }
+ // Same reasoning as in c_repeat_range_min_or_more (we know that min <
+ // max at this point).
+ let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
+ let initial_entry = patch_concat.entry;
+ // It is much simpler to compile, e.g., `a{2,5}` as:
+ //
+ // aaa?a?a?
+ //
+ // But you end up with a sequence of instructions like this:
+ //
+ // 0: 'a'
+ // 1: 'a',
+ // 2: split(3, 4)
+ // 3: 'a'
+ // 4: split(5, 6)
+ // 5: 'a'
+ // 6: split(7, 8)
+ // 7: 'a'
+ // 8: MATCH
+ //
+ // This is *incredibly* inefficient because the splits end
+ // up forming a chain, which has to be resolved every time a
+ // transition is followed. Instead, each split's skip branch is collected in `holes` so the caller patches them all to the same exit.
+ let mut holes = vec![];
+ let mut prev_hole = patch_concat.hole;
+ for _ in min..max {
+ self.fill_to_next(prev_hole);
+ let split = self.push_split_hole();
+ let Patch { hole, entry } = match self.c(expr)? {
+ Some(p) => p,
+ None => return self.pop_split_hole(),
+ };
+ prev_hole = hole;
+ if greedy {
+ holes.push(self.fill_split(split, Some(entry), None));
+ } else {
+ holes.push(self.fill_split(split, None, Some(entry)));
+ }
+ }
+ holes.push(prev_hole); // the last optional copy also exits the fragment
+ Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
+ }
+
+ /// Can be used as a default value for the c_* functions when the call to
+ /// c_function is followed by inserting at least one instruction that is
+ /// always executed after the ones written by the c* function.
+ fn next_inst(&self) -> Patch {
+ Patch { hole: Hole::None, entry: self.insts.len() } // entry is the index the next pushed instruction will receive
+ }
+
+ fn fill(&mut self, hole: Hole, goto: InstPtr) { // points every instruction referenced by `hole` at `goto`
+ match hole {
+ Hole::None => {} // nothing to patch
+ Hole::One(pc) => {
+ self.insts[pc].fill(goto); // see `MaybeInst::fill`
+ }
+ Hole::Many(holes) => {
+ for hole in holes {
+ self.fill(hole, goto); // nested holes are patched recursively, all to the same target
+ }
+ }
+ }
+ }
+
+ fn fill_to_next(&mut self, hole: Hole) { // patches `hole` to jump to the instruction that will be pushed next
+ let next = self.insts.len(); // index of the not-yet-pushed instruction
+ self.fill(hole, next);
+ }
+
+ fn fill_split( // sets one or both branch targets of the split instruction(s) referenced by `hole`
+ &mut self,
+ hole: Hole,
+ goto1: Option<InstPtr>,
+ goto2: Option<InstPtr>,
+ ) -> Hole { // the returned hole covers whatever still lacks a target
+ match hole {
+ Hole::None => Hole::None,
+ Hole::One(pc) => match (goto1, goto2) {
+ (Some(goto1), Some(goto2)) => {
+ self.insts[pc].fill_split(goto1, goto2);
+ Hole::None // both branches set: nothing left to patch
+ }
+ (Some(goto1), None) => {
+ self.insts[pc].half_fill_split_goto1(goto1);
+ Hole::One(pc) // goto2 is still open
+ }
+ (None, Some(goto2)) => {
+ self.insts[pc].half_fill_split_goto2(goto2);
+ Hole::One(pc) // goto1 is still open
+ }
+ (None, None) => unreachable!(
+ "at least one of the split \
+ holes must be filled"
+ ),
+ },
+ Hole::Many(holes) => {
+ let mut new_holes = vec![];
+ for hole in holes {
+ new_holes.push(self.fill_split(hole, goto1, goto2)); // same targets applied to every nested hole
+ }
+ if new_holes.is_empty() { // collapse the result to the simplest equivalent `Hole`
+ Hole::None
+ } else if new_holes.len() == 1 {
+ new_holes.pop().unwrap()
+ } else {
+ Hole::Many(new_holes)
+ }
+ }
+ }
+ }
+
+ fn push_compiled(&mut self, inst: Inst) { // appends an instruction that needs no later patching
+ self.insts.push(MaybeInst::Compiled(inst));
+ }
+
+ fn push_hole(&mut self, inst: InstHole) -> Hole { // appends an instruction whose `goto` is not known yet
+ let hole = self.insts.len(); // index the new instruction is about to occupy
+ self.insts.push(MaybeInst::Uncompiled(inst));
+ Hole::One(hole) // handle used later to supply the target via `fill`
+ }
+
+ fn push_split_hole(&mut self) -> Hole { // appends a split with neither branch target set
+ let hole = self.insts.len(); // index the new split is about to occupy
+ self.insts.push(MaybeInst::Split);
+ Hole::One(hole) // handle used later with `fill_split` or `fill`
+ }
+
+ fn pop_split_hole(&mut self) -> ResultOrEmpty { // removes the most recently pushed instruction and reports "nothing compiled"
+ self.insts.pop(); // callers in view invoke this right after `push_split_hole`, so this drops that split
+ Ok(None)
+ }
+
+ fn check_size(&self) -> result::Result<(), Error> { // errors once the estimated program size exceeds `size_limit`
+ use std::mem::size_of;
+
+ let size = // estimate: extra bytes tracked separately plus one `Inst` per instruction emitted so far
+ self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
+ if size > self.size_limit {
+ Err(Error::CompiledTooBig(self.size_limit)) // the error carries the limit, not the computed size
+ } else {
+ Ok(())
+ }
+ }
+}
+
+#[derive(Debug)]
+enum Hole { // a set of instructions whose jump target has not been decided yet
+ None, // nothing to patch
+ One(InstPtr), // index into the compiler's `insts` of one instruction awaiting a target
+ Many(Vec<Hole>), // several holes that `Compiler::fill` patches to the same target
+}
+
+impl Hole {
+ fn dup_one(self) -> (Self, Self) { // duplicates a single hole; hits `unreachable!` for `None` and `Many`
+ match self {
+ Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), // both copies refer to the same instruction
+ Hole::None | Hole::Many(_) => {
+ unreachable!("must be called on single hole")
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+enum MaybeInst { // an instruction slot at some stage of being filled in
+ Compiled(Inst), // finished instruction; `unwrap` returns it
+ Uncompiled(InstHole), // everything is known except the `goto` target
+ Split, // split with neither branch target set
+ Split1(InstPtr), // split whose goto1 is set; goto2 is still missing
+ Split2(InstPtr), // split whose goto2 is set; goto1 is still missing
+}
+
+impl MaybeInst {
+ fn fill(&mut self, goto: InstPtr) { // supplies the next missing jump target of this slot
+ let maybeinst = match *self {
+ MaybeInst::Split => MaybeInst::Split1(goto), // a bare split takes `goto` as goto1 and stays half-filled
+ MaybeInst::Uncompiled(ref inst) => {
+ MaybeInst::Compiled(inst.fill(goto))
+ }
+ MaybeInst::Split1(goto1) => { // goto1 already known: `goto` completes it as goto2
+ MaybeInst::Compiled(Inst::Split(InstSplit {
+ goto1,
+ goto2: goto,
+ }))
+ }
+ MaybeInst::Split2(goto2) => { // goto2 already known: `goto` completes it as goto1
+ MaybeInst::Compiled(Inst::Split(InstSplit {
+ goto1: goto,
+ goto2,
+ }))
+ }
+ _ => unreachable!( // the only remaining variant is `Compiled`, which has nothing left to fill
+ "not all instructions were compiled! \
+ found uncompiled instruction: {:?}",
+ self
+ ),
+ };
+ *self = maybeinst;
+ }
+
+ fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { // completes a bare `Split` with both targets at once
+ let filled = match *self {
+ MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
+ _ => unreachable!(
+ "must be called on Split instruction, \
+ instead it was called on: {:?}",
+ self
+ ),
+ };
+ *self = MaybeInst::Compiled(filled);
+ }
+
+ fn half_fill_split_goto1(&mut self, goto1: InstPtr) { // turns a bare `Split` into `Split1(goto1)`
+ let half_filled = match *self {
+ MaybeInst::Split => goto1,
+ _ => unreachable!(
+ "must be called on Split instruction, \
+ instead it was called on: {:?}",
+ self
+ ),
+ };
+ *self = MaybeInst::Split1(half_filled);
+ }
+
+ fn half_fill_split_goto2(&mut self, goto2: InstPtr) { // turns a bare `Split` into `Split2(goto2)`
+ let half_filled = match *self {
+ MaybeInst::Split => goto2,
+ _ => unreachable!(
+ "must be called on Split instruction, \
+ instead it was called on: {:?}",
+ self
+ ),
+ };
+ *self = MaybeInst::Split2(half_filled);
+ }
+
+ fn unwrap(self) -> Inst { // extracts the finished instruction; any other variant hits `unreachable!`
+ match self {
+ MaybeInst::Compiled(inst) => inst,
+ _ => unreachable!(
+ "must be called on a compiled instruction, \
+ instead it was called on: {:?}",
+ self
+ ),
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+enum InstHole { // an instruction with every field known except `goto`; `fill` produces the matching `Inst`
+ Save { slot: usize }, // becomes `Inst::Save`
+ EmptyLook { look: EmptyLook }, // becomes `Inst::EmptyLook`
+ Char { c: char }, // becomes `Inst::Char`
+ Ranges { ranges: Vec<(char, char)> }, // becomes `Inst::Ranges`
+ Bytes { start: u8, end: u8 }, // becomes `Inst::Bytes`
+}
+
+impl InstHole {
+ fn fill(&self, goto: InstPtr) -> Inst { // builds the finished instruction by adding `goto` to the stored fields
+ match *self {
+ InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
+ InstHole::EmptyLook { look } => {
+ Inst::EmptyLook(InstEmptyLook { goto, look })
+ }
+ InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
+ InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
+ goto,
+ ranges: ranges.clone().into_boxed_slice(), // `self` is only borrowed, so the ranges are copied
+ }),
+ InstHole::Bytes { start, end } => {
+ Inst::Bytes(InstBytes { goto, start, end })
+ }
+ }
+ }
+}
+
+struct CompileClass<'a, 'b> { // helper that compiles Unicode class ranges into UTF-8 byte-range instructions
+ c: &'a mut Compiler, // compiler that receives the emitted instructions
+ ranges: &'b [hir::ClassUnicodeRange], // the scalar value ranges to compile
+}
+
+impl<'a, 'b> CompileClass<'a, 'b> {
+ fn compile(mut self) -> Result { // emits one alternative per UTF-8 sequence, linked by a chain of splits
+ let mut holes = vec![];
+ let mut initial_entry = None;
+ let mut last_split = Hole::None;
+ let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); // taken out of the compiler; put back before returning
+ self.c.suffix_cache.clear();
+
+ for (i, range) in self.ranges.iter().enumerate() {
+ let is_last_range = i + 1 == self.ranges.len();
+ utf8_seqs.reset(range.start(), range.end());
+ let mut it = (&mut utf8_seqs).peekable(); // peekable so the very last sequence can be detected
+ loop {
+ let utf8_seq = match it.next() {
+ None => break,
+ Some(utf8_seq) => utf8_seq,
+ };
+ if is_last_range && it.peek().is_none() { // final alternative: no split is pushed in front of it
+ let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
+ holes.push(hole);
+ self.c.fill(last_split, entry); // the previous split's open branch jumps straight here
+ last_split = Hole::None;
+ if initial_entry.is_none() {
+ initial_entry = Some(entry);
+ }
+ } else {
+ if initial_entry.is_none() {
+ initial_entry = Some(self.c.insts.len()); // the class is entered at the first split
+ }
+ self.c.fill_to_next(last_split); // the previous split falls through to the split pushed next
+ last_split = self.c.push_split_hole();
+ let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
+ holes.push(hole);
+ last_split =
+ self.c.fill_split(last_split, Some(entry), None); // goto1 -> this sequence; goto2 stays open
+ }
+ }
+ }
+ self.c.utf8_seqs = Some(utf8_seqs);
+ Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) // `unwrap` panics if no sequence was produced
+ }
+
+ fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { // picks the byte-range visiting order for the program direction
+ if self.c.compiled.is_reverse {
+ self.c_utf8_seq_(seq)
+ } else {
+ self.c_utf8_seq_(seq.into_iter().rev()) // forward program: ranges are visited last-to-first
+ }
+ }
+
+ fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result // emits a chain of `Bytes` instructions, each jumping to the one emitted before it
+ where
+ I: IntoIterator<Item = &'r Utf8Range>,
+ {
+ // The initial instruction for each UTF-8 sequence should be the same.
+ let mut from_inst = ::std::usize::MAX; // usize::MAX means "no instruction emitted or reused yet"
+ let mut last_hole = Hole::None;
+ for byte_range in seq {
+ let key = SuffixCacheKey {
+ from_inst,
+ start: byte_range.start,
+ end: byte_range.end,
+ };
+ {
+ let pc = self.c.insts.len(); // index the instruction would get if it must be emitted
+ if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
+ from_inst = cached_pc; // identical instruction already exists: reuse it
+ continue;
+ }
+ }
+ self.c.byte_classes.set_range(byte_range.start, byte_range.end);
+ if from_inst == ::std::usize::MAX {
+ last_hole = self.c.push_hole(InstHole::Bytes { // first range visited: its target is left as a hole
+ start: byte_range.start,
+ end: byte_range.end,
+ });
+ } else {
+ self.c.push_compiled(Inst::Bytes(InstBytes {
+ goto: from_inst, // jumps to the instruction chosen in the previous iteration
+ start: byte_range.start,
+ end: byte_range.end,
+ }));
+ }
+ from_inst = self.c.insts.len().checked_sub(1).unwrap(); // index of the instruction just pushed
+ debug_assert!(from_inst < ::std::usize::MAX);
+ }
+ debug_assert!(from_inst < ::std::usize::MAX);
+ Ok(Patch { hole: last_hole, entry: from_inst }) // entry is the instruction for the last range visited
+ }
+}
+
+/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
+/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
+/// The set of byte ranges looks like this:
+///
+/// [0-7F]
+/// [C2-DF][80-BF]
+/// [E0][A0-BF][80-BF]
+/// [E1-EC][80-BF][80-BF]
+/// [ED][80-9F][80-BF]
+/// [EE-EF][80-BF][80-BF]
+///
+/// Each line above translates to one alternate in the compiled regex program.
+/// However, all but one of the alternates end in the same suffix, which is
+/// a waste of an instruction. The suffix cache facilitates reusing them across
+/// alternates.
+///
+/// Note that a HashMap could be trivially used for this, but we don't need its
+/// overhead. Some small bounded space (LRU style) is more than enough.
+///
+/// This uses a similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
+/// except it uses hashes as original indices and then compares full keys for
+/// validation against the `dense` array.
+#[derive(Debug)]
+struct SuffixCache {
+ sparse: Box<[usize]>, // indexed by key hash; each slot holds a position in `dense`
+ dense: Vec<SuffixCacheEntry>, // entries in insertion order; emptied by `clear`
+}
+
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
+struct SuffixCacheEntry { // one cached mapping stored in `SuffixCache::dense`
+ key: SuffixCacheKey, // compared in full on lookup to reject hash collisions
+ pc: InstPtr, // instruction index that a `SuffixCache::get` hit returns
+}
+
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
+struct SuffixCacheKey { // identifies a `Bytes` instruction by its target and its byte range
+ from_inst: InstPtr, // instruction the `Bytes` jumps to (`usize::MAX` when none exists yet)
+ start: u8, // first byte of the range
+ end: u8, // last byte of the range
+}
+
+impl SuffixCache {
+ fn new(size: usize) -> Self { // `size` is the number of hash slots; `hash` computes a modulus by it, so it must be non-zero
+ SuffixCache {
+ sparse: vec![0usize; size].into(),
+ dense: Vec::with_capacity(size),
+ }
+ }
+
+ fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { // returns the cached pc on a hit; on a miss records `pc` and returns None
+ let hash = self.hash(&key);
+ let pos = &mut self.sparse[hash];
+ if let Some(entry) = self.dense.get(*pos) { // a stale or never-written slot fails this bounds check or the key check
+ if entry.key == key {
+ return Some(entry.pc);
+ }
+ }
+ *pos = self.dense.len(); // miss: this hash slot now refers to the entry pushed below
+ self.dense.push(SuffixCacheEntry { key, pc });
+ None
+ }
+
+ fn clear(&mut self) { // `sparse` is left untouched; `get` re-validates every slot it reads
+ self.dense.clear();
+ }
+
+ fn hash(&self, suffix: &SuffixCacheKey) -> usize { // maps a key to an index into `sparse`
+ // Basic FNV-1a hash as described:
+ // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+ const FNV_PRIME: u64 = 1_099_511_628_211;
+ let mut h = 14_695_981_039_346_656_037; // the initial hash value (FNV offset basis)
+ h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); // each field is mixed in as one whole value
+ h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
+ h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
+ (h as usize) % self.sparse.len()
+ }
+}
+
+struct ByteClassSet([bool; 256]); // one flag per byte value; `true` at index b means b is the last byte of its class
+
+impl ByteClassSet {
+ fn new() -> Self { // starts with no boundaries, i.e. all 256 bytes in a single class
+ ByteClassSet([false; 256])
+ }
+
+ fn set_range(&mut self, start: u8, end: u8) { // records that the inclusive range start..=end must be told apart from its neighbours
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0[start as usize - 1] = true; // a class ends just before `start`
+ }
+ self.0[end as usize] = true; // a class ends at `end`
+ }
+
+ fn set_word_boundary(&mut self) {
+ // We need to mark all ranges of bytes whose pairs result in
+ // evaluating \b differently.
+ let iswb = is_word_byte;
+ let mut b1: u16 = 0; // u16 so that the loop counters can reach 256 without overflowing
+ let mut b2: u16;
+ while b1 <= 255 {
+ b2 = b1 + 1;
+ while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { // extend the run while word-ness is unchanged
+ b2 += 1;
+ }
+ self.set_range(b1 as u8, (b2 - 1) as u8); // b1..=b2-1 is one maximal run
+ b1 = b2;
+ }
+ }
+
+ fn byte_classes(&self) -> Vec<u8> { // returns a 256-entry table mapping each byte to its class number
+ // N.B. If you're debugging the DFA, it's useful to simply return
+ // `(0..256).collect()`, which effectively removes the byte classes
+ // and makes the transitions easier to read.
+ // (0usize..256).map(|x| x as u8).collect()
+ let mut byte_classes = vec![0; 256];
+ let mut class = 0u8;
+ let mut i = 0;
+ loop {
+ byte_classes[i] = class as u8;
+ if i >= 255 { // stops before consulting the flag at index 255
+ break;
+ }
+ if self.0[i] {
+ class = class.checked_add(1).unwrap(); // byte i ended a class, so byte i + 1 starts the next one
+ }
+ i += 1;
+ }
+ byte_classes
+ }
+}
+
+impl fmt::Debug for ByteClassSet { // written by hand instead of derived
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() // the flag array is formatted as a slice
+ }
+}
+
+fn u32_to_usize(n: u32) -> usize { // converts `n` to `usize`, panicking instead of truncating
+ // In case usize is less than 32 bits, we need to guard against overflow.
+ // On most platforms this compiles to nothing.
+ // TODO Use `std::convert::TryFrom` once it's stable.
+ if (n as u64) > (::std::usize::MAX as u64) { // both sides are widened to u64 so the comparison itself cannot truncate
+ panic!("BUG: {} is too big to be pointer sized", n)
+ }
+ n as usize // cannot truncate: the check above established n <= usize::MAX
+}
+
+#[cfg(test)]
+mod tests {
+ use super::ByteClassSet;
+
+ #[test]
+ fn byte_classes() { // checks the class numbers produced for one range and for two disjoint ranges
+ let mut set = ByteClassSet::new();
+ set.set_range(b'a', b'z');
+ let classes = set.byte_classes();
+ assert_eq!(classes[0], 0); // everything below 'a' is class 0
+ assert_eq!(classes[1], 0);
+ assert_eq!(classes[2], 0);
+ assert_eq!(classes[b'a' as usize - 1], 0);
+ assert_eq!(classes[b'a' as usize], 1); // 'a'..='z' is class 1
+ assert_eq!(classes[b'm' as usize], 1);
+ assert_eq!(classes[b'z' as usize], 1);
+ assert_eq!(classes[b'z' as usize + 1], 2); // everything above 'z' is class 2
+ assert_eq!(classes[254], 2);
+ assert_eq!(classes[255], 2);
+
+ let mut set = ByteClassSet::new();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.byte_classes();
+ assert_eq!(classes[0], 0); // 0..=2 is class 0
+ assert_eq!(classes[1], 0);
+ assert_eq!(classes[2], 0);
+ assert_eq!(classes[3], 1); // the gap between the two ranges is its own class
+ assert_eq!(classes[4], 2); // 4..=6 is class 2
+ assert_eq!(classes[5], 2);
+ assert_eq!(classes[6], 2);
+ assert_eq!(classes[7], 3); // everything above 6 is class 3
+ assert_eq!(classes[255], 3);
+ }
+
+ #[test]
+ fn full_byte_classes() { // every byte in its own class must not overflow the u8 class counter
+ let mut set = ByteClassSet::new();
+ for i in 0..256u16 {
+ set.set_range(i as u8, i as u8);
+ }
+ assert_eq!(set.byte_classes().len(), 256);
+ }
+}
diff --git a/third_party/rust/regex/src/dfa.rs b/third_party/rust/regex/src/dfa.rs
new file mode 100644
index 0000000000..dc9952120e
--- /dev/null
+++ b/third_party/rust/regex/src/dfa.rs
@@ -0,0 +1,1945 @@
+/*!
+The DFA matching engine.
+
+A DFA provides faster matching because the engine is in exactly one state at
+any point in time. In the NFA, there may be multiple active states, and
+considerable CPU cycles are spent shuffling them around. In finite automata
+speak, the DFA follows epsilon transitions in the regex far less than the NFA.
+
+A DFA is a classic trade off between time and space. The NFA is slower, but
+its memory requirements are typically small and predictable. The DFA is faster,
+but given the right regex and the right input, the number of states in the
+DFA can grow exponentially. To mitigate this space problem, we do two things:
+
+1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
+ during a search. When a new state is computed, it is stored in a cache so
+ that it may be reused. An important consequence of this implementation
+ is that states that are never reached for a particular input are never
+ computed. (This is impossible in an "offline" DFA which needs to compute
+ all possible states up front.)
+2. If the cache gets too big, we wipe it and continue matching.
+
+In pathological cases, a new state can be created for every byte of input.
+(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
+In this case, performance regresses to slightly slower than the full NFA
+simulation, in large part because the cache becomes useless. If the cache
+is wiped too frequently, the DFA quits and control falls back to one of the
+NFA simulations.
+
+Because of the "lazy" nature of this DFA, the inner matching loop is
+considerably more complex than one might expect out of a DFA. A number of
+tricks are employed to make it fast. Tread carefully.
+
+N.B. While this implementation is heavily commented, Russ Cox's series of
+articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
+(As is the DFA implementation in RE2, which heavily influenced this
+implementation.)
+*/
+
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::repeat;
+use std::mem;
+use std::sync::Arc;
+
+use crate::exec::ProgramCache;
+use crate::prog::{Inst, Program};
+use crate::sparse::SparseSet;
+
+/// Return true if and only if the given program can be executed by a DFA.
+///
+/// Generally, a DFA is always possible. It is not possible when the program's
+/// `dfa_size_limit` is zero or when the number of NFA states exceeds
+/// `i32::MAX`; in those cases, this function will return false.
+///
+/// This function will also return false if the given program has any Unicode
+/// instructions (Char or Ranges) since the DFA operates on bytes only.
+pub fn can_exec(insts: &Program) -> bool {
+ use crate::prog::Inst::*;
+ // If for some reason we manage to allocate a regex program with more
+ // than i32::MAX instructions, then we can't execute the DFA because we
+ // use 32 bit instruction pointer deltas for memory savings.
+ // If i32::MAX is the largest positive delta,
+ // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
+ // and we are OK to use 32 bits.
+ if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
+ return false;
+ }
+ for inst in insts {
+ match *inst {
+ Char(_) | Ranges(_) => return false, // Unicode instructions cannot be run by the byte-based DFA
+ EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
+ }
+ }
+ true
+}
+
+/// A reusable cache of DFA states.
+///
+/// This cache is reused between multiple invocations of the same regex
+/// program. (It is not shared simultaneously between threads. If there is
+/// contention, then new caches are created.)
+#[derive(Debug)]
+pub struct Cache {
+ /// Group persistent DFA related cache state together. The sparse sets
+ /// listed below are used as scratch space while computing uncached states.
+ inner: CacheInner,
+ /// qcur and qnext are ordered sets with constant time
+ /// addition/membership/clearing-whole-set and linear time iteration. They
+ /// are used to manage the sets of NFA states in DFA states when computing
+ /// cached DFA states. In particular, the order of the NFA states matters
+ /// for leftmost-first style matching. Namely, when computing a cached
+ /// state, the set of NFA states stops growing as soon as the first Match
+ /// instruction is observed.
+ qcur: SparseSet,
+ qnext: SparseSet, // companion scratch set to `qcur`; the comment on `qcur` covers both
+}
+
+/// `CacheInner` is logically just a part of Cache, but groups together fields
+/// that aren't passed as function parameters throughout search. (This split
+/// is mostly an artifact of the borrow checker. It is happily paid.)
+#[derive(Debug)]
+struct CacheInner {
+ /// A cache of pre-compiled DFA states, keyed by the set of NFA states
+ /// and the set of empty-width flags set at the byte in the input when the
+ /// state was observed.
+ ///
+ /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
+ /// things, we just pass indexes around manually. The performance impact of
+ /// this is probably an instruction or two in the inner loop. However, on
+ /// 64 bit, each StatePtr is half the size of a *State.
+ compiled: StateMap,
+ /// The transition table.
+ ///
+ /// The transition table is laid out in row-major order, where states are
+ /// rows and the transitions for each state are columns. At a high level,
+ /// given state `s` and byte `b`, the next state can be found at index
+ /// `s * 256 + b`.
+ ///
+ /// This is, of course, a lie. A StatePtr is actually a pointer to the
+ /// *start* of a row in this table. When indexing in the DFA's inner loop,
+ /// this removes the need to multiply the StatePtr by the stride. Yes, it
+ /// matters. This reduces the number of states we can store, but: the
+ /// stride is rarely 256 since we define transitions in terms of
+ /// *equivalence classes* of bytes. Each class corresponds to a set of
+ /// bytes that never discriminate a distinct path through the DFA from each
+ /// other.
+ trans: Transitions,
+ /// A set of cached start states, which are limited to the number of
+ /// permutations of flags set just before the initial byte of input. (The
+ /// index into this vec is an `EmptyFlags`.)
+ ///
+ /// N.B. A start state can be "dead" (i.e., no possible match), so we
+ /// represent it with a StatePtr.
+ start_states: Vec<StatePtr>,
+ /// Stack scratch space used to follow epsilon transitions in the NFA.
+ /// (This permits us to avoid recursion.)
+ ///
+ /// The maximum stack size is the number of NFA states.
+ stack: Vec<InstPtr>,
+ /// The total number of times this cache has been flushed by the DFA
+ /// because of space constraints.
+ flush_count: u64,
+ /// The total heap size of the DFA's cache. We use this to determine when
+ /// we should flush the cache.
+ size: usize,
+ /// Scratch space used when building instruction pointer lists for new
+ /// states. This helps amortize allocation.
+ insts_scratch_space: Vec<u8>,
+}
+
+/// The transition table.
+///
+/// It is laid out in row-major order, with states as rows and byte class
+/// transitions as columns.
+///
+/// The transition table is responsible for producing valid `StatePtrs`. A
+/// `StatePtr` points to the start of a particular row in this table. When
+/// indexing to find the next state this allows us to avoid a multiplication
+/// when computing an index into the table.
+#[derive(Clone)]
+struct Transitions {
+ /// The table, stored as one flat vector of rows.
+ table: Vec<StatePtr>,
+ /// The stride, i.e., the number of byte class columns in each row.
+ num_byte_classes: usize,
+}
+
+/// Fsm encapsulates the actual execution of the DFA.
+#[derive(Debug)]
+pub struct Fsm<'a> {
+ /// prog contains the NFA instruction opcodes. DFA execution uses either
+ /// the `dfa` instructions or the `dfa_reverse` instructions from
+ /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
+ /// Unicode opcodes that cannot be executed by the DFA.)
+ prog: &'a Program,
+ /// The start state. We record it here because the pointer may change
+ /// when the cache is wiped.
+ start: StatePtr,
+ /// The current position in the input.
+ at: usize,
+ /// Should we quit after seeing the first match? e.g., When the caller
+ /// uses `is_match` or `shortest_match`.
+ quit_after_match: bool,
+ /// The last state that matched.
+ ///
+ /// When no match has occurred, this is set to STATE_UNKNOWN.
+ ///
+ /// This is only useful when matching regex sets. The last match state
+ /// is useful because it contains all of the match instructions seen,
+ /// thereby allowing us to enumerate which regexes in the set matched.
+ last_match_si: StatePtr,
+ /// The input position of the last cache flush. We use this to determine
+ /// if we're thrashing in the cache too often. If so, the DFA quits so
+ /// that we can fall back to the NFA algorithm.
+ last_cache_flush: usize,
+ /// All cached DFA information that is persisted between searches.
+ cache: &'a mut CacheInner, // the constructors in this impl borrow it from the `inner` field of a `Cache`
+}
+
+/// The result of running the DFA.
+///
+/// Generally, the result is either a match or not a match, but sometimes the
+/// DFA runs too slowly because the cache size is too small. In that case, it
+/// gives up with the intent of falling back to the NFA algorithm.
+///
+/// The DFA can also give up if it runs out of room to create new states, or if
+/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
+#[derive(Clone, Debug)]
+pub enum Result<T> {
+ Match(T), // a match was found; `T` is `usize` for the `Fsm` entry points in this file
+ NoMatch(usize), // no match; carries an input position (the entry points pass their `at` for a dead start state)
+ Quit, // the DFA gave up; another matching engine should be used instead
+}
+
+impl<T> Result<T> {
+ /// Returns true if this result corresponds to a match.
+ pub fn is_match(&self) -> bool {
+ match *self {
+ Result::Match(_) => true,
+ Result::NoMatch(_) | Result::Quit => false, // `Quit` is reported as "not a match" here
+ }
+ }
+
+ /// Maps the given function onto T and returns the result.
+ ///
+ /// If this isn't a match, then this is a no-op.
+ #[cfg(feature = "perf-literal")]
+ pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
+ match self {
+ Result::Match(t) => Result::Match(f(t)),
+ Result::NoMatch(x) => Result::NoMatch(x), // rebuilt only to change the type parameter
+ Result::Quit => Result::Quit,
+ }
+ }
+
+ /// Sets the non-match position.
+ ///
+ /// If this isn't a non-match, then this is a no-op.
+ fn set_non_match(self, at: usize) -> Result<T> {
+ match self {
+ Result::NoMatch(_) => Result::NoMatch(at), // the previous position is discarded
+ r => r,
+ }
+ }
+}
+
+/// `State` is a DFA state. It contains an ordered set of NFA states (not
+/// necessarily complete) and a smattering of flags.
+///
+/// The flags are packed into the first byte of data.
+///
+/// States don't carry their transitions. Instead, transitions are stored in
+/// a single row-major table.
+///
+/// Delta encoding is used to store the instruction pointers.
+/// The first instruction pointer is stored directly starting
+/// at data[1], and each following pointer is stored as an offset
+/// to the previous one. If a delta is in the range -127..127,
+/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
+/// is coded as a flag, followed by 4 bytes encoding the delta.
+#[derive(Clone, Eq, Hash, PartialEq)]
+struct State {
+ data: Arc<[u8]>, // data[0] is the flags byte; data[1..] holds the encoded instruction pointers
+}
+
+/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
+/// an NFA state).
+///
+/// Throughout this library, this is usually set to `usize`, but we force a
+/// `u32` here for the DFA to save on space.
+type InstPtr = u32; // `can_exec` rejects programs with more than i32::MAX instructions
+
+/// Adds ip to data using delta encoding with respect to prev.
+///
+/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
+fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
+ let delta = (ip as i32) - (*prev as i32); // signed difference from the previously written pointer
+ write_vari32(data, delta);
+ *prev = ip; // the next call encodes relative to this pointer
+}
+
+struct InstPtrs<'a> { // iterator that decodes the delta-encoded instruction pointers of a `State`
+ base: usize, // most recently decoded pointer (`State::inst_ptrs` starts it at 0)
+ data: &'a [u8], // encoded bytes not yet consumed
+}
+
+impl<'a> Iterator for InstPtrs<'a> {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> { // decodes one delta and yields the resulting absolute pointer
+ if self.data.is_empty() {
+ return None; // all encoded bytes have been consumed
+ }
+ let (delta, nread) = read_vari32(self.data); // `nread` is the number of bytes the delta occupied
+ let base = self.base as i32 + delta;
+ debug_assert!(base >= 0);
+ debug_assert!(nread > 0);
+ self.data = &self.data[nread..];
+ self.base = base as usize;
+ Some(self.base)
+ }
+}
+
+impl State {
+ fn flags(&self) -> StateFlags { // reads the flags packed into the first byte of `data`
+ StateFlags(self.data[0])
+ }
+
+ fn inst_ptrs(&self) -> InstPtrs<'_> { // iterates over the instruction pointers stored after the flags byte
+ InstPtrs { base: 0, data: &self.data[1..] }
+ }
+}
+
+/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
+/// table.
+///
+/// It has many special values. There are two types of special values:
+/// sentinels and flags.
+///
+/// Sentinels correspond to special states that carry some kind of
+/// significance. There are three such states: unknown, dead and quit states.
+///
+/// Unknown states are states that haven't been computed yet. They indicate
+/// that a transition should be filled in that points to either an existing
+/// cached state or a new state altogether. In general, an unknown state means
+/// "follow the NFA's epsilon transitions."
+///
+/// Dead states are states that can never lead to a match, no matter what
+/// subsequent input is observed. This means that the DFA should quit
+/// immediately and return the longest match it has found thus far.
+///
+/// Quit states are states that imply the DFA is not capable of matching the
+/// regex correctly. Currently, this is only used when a Unicode word boundary
+/// exists in the regex *and* a non-ASCII byte is observed.
+///
+/// The other type of state pointer is a state pointer with special flag bits.
+/// There are two flags: a start flag and a match flag. The lower bits of both
+/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
+/// mask).
+///
+/// The start flag means that the state is a start state, and therefore may be
+/// subject to special prefix scanning optimizations.
+///
+/// The match flag means that the state is a match state, and therefore the
+/// current position in the input (while searching) should be recorded.
+///
+/// The above exists mostly in the service of making the inner loop fast.
+/// In particular, the inner *inner* loop looks something like this:
+///
+/// ```ignore
+/// while state <= STATE_MAX and i < len(text):
+/// state = state.next[i]
+/// ```
+///
+/// This is nice because it lets us execute a lazy DFA as if it were an
+/// entirely offline DFA (i.e., with very few instructions). The loop will
+/// quit only when we need to examine a case that needs special attention.
+type StatePtr = u32;
+
+/// An unknown state means that the state has not been computed yet, and that
+/// the only way to progress is to compute it.
+const STATE_UNKNOWN: StatePtr = 1 << 31; // only the top bit is set
+
+/// A dead state means that the state has been computed and it is known that
+/// once it is entered, no future match can ever occur.
+const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1; // (1 << 31) + 1
+
+/// A quit state means that the DFA came across some input that it doesn't
+/// know how to process correctly. The DFA should quit and another matching
+/// engine should be run in its place.
+const STATE_QUIT: StatePtr = STATE_DEAD + 1; // (1 << 31) + 2
+
+/// A start state is a state that the DFA can start in.
+///
+/// Note that start states have their lower bits set to a state pointer.
+const STATE_START: StatePtr = 1 << 30; // flag bit 30
+
+/// A match state means that the regex has successfully matched.
+///
+/// Note that match states have their lower bits set to a state pointer.
+const STATE_MATCH: StatePtr = 1 << 29; // flag bit 29
+
+/// The maximum state pointer. This is useful to mask out the "valid" state
+/// pointer from a state with the "start" or "match" bits set.
+///
+/// It doesn't make sense to use this with unknown, dead or quit state
+/// pointers, since those pointers are sentinels and never have their lower
+/// bits set to anything meaningful.
+const STATE_MAX: StatePtr = STATE_MATCH - 1; // (1 << 29) - 1: all bits below the match flag
+
+/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
+/// special EOF sentinel value.
+#[derive(Copy, Clone, Debug)]
+struct Byte(u16); // u16 leaves room for a value outside 0..=255 to stand for EOF
+
+/// A set of flags for zero-width assertions; `Default` clears all of them.
+#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
+struct EmptyFlags {
+ start: bool,
+ end: bool,
+ start_line: bool,
+ end_line: bool,
+ word_boundary: bool,
+ not_word_boundary: bool,
+}
+
+/// A set of flags describing various configurations of a DFA state. This is
+/// represented by a `u8` so that it is compact.
+#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
+struct StateFlags(u8); // `Default` yields `StateFlags(0)`; `State::flags` builds one from the first data byte
+
+impl Cache {
+ /// Create new empty cache for the DFA engine.
+ pub fn new(prog: &Program) -> Self {
+ // We add 1 to account for the special EOF byte.
+ let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; // presumably byte 255 has the highest class index, making the inner +1 the class count
+ let starts = vec![STATE_UNKNOWN; 256]; // every start state begins as not yet computed
+ let mut cache = Cache {
+ inner: CacheInner {
+ compiled: StateMap::new(num_byte_classes),
+ trans: Transitions::new(num_byte_classes),
+ start_states: starts,
+ stack: vec![],
+ flush_count: 0,
+ size: 0, // set properly by `reset_size` below
+ insts_scratch_space: vec![],
+ },
+ qcur: SparseSet::new(prog.insts.len()), // sized by the number of NFA instructions
+ qnext: SparseSet::new(prog.insts.len()),
+ };
+ cache.inner.reset_size();
+ cache
+ }
+}
+
+impl CacheInner {
+ /// Resets the cache size to account for fixed costs, namely the start
+ /// state table and the current contents of the stack.
+ fn reset_size(&mut self) {
+ self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
+ + (self.stack.len() * mem::size_of::<InstPtr>()); // counts the stack's length, not its capacity
+ }
+}
+
+impl<'a> Fsm<'a> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn forward( // runs the DFA over `text` starting at `at`, using the forward cache
+ prog: &'a Program,
+ cache: &ProgramCache,
+ quit_after_match: bool,
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa; // the forward DFA's own cache
+ let mut dfa = Fsm {
+ prog,
+ start: 0, // filled in below
+ at,
+ quit_after_match,
+ last_match_si: STATE_UNKNOWN, // no match seen yet
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit, // no start state could be produced
+ Some(STATE_DEAD) => return Result::NoMatch(at), // a dead start state can never match
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn reverse( // same shape as `forward`, but with the reverse cache, start flags and executor
+ prog: &'a Program,
+ cache: &ProgramCache,
+ quit_after_match: bool,
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa_reverse; // the reverse DFA's own cache
+ let mut dfa = Fsm {
+ prog,
+ start: 0, // filled in below
+ at,
+ quit_after_match,
+ last_match_si: STATE_UNKNOWN, // no match seen yet
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit, // no start state could be produced
+ Some(STATE_DEAD) => return Result::NoMatch(at), // a dead start state can never match
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
+ }
+
+ /// Runs the forward DFA for a regex set and records which regexes matched.
+ ///
+ /// `matches` must have one slot per regex in `prog.matches`; on a match,
+ /// the slot of every `Match` instruction found in the last match state is
+ /// set to `true`. With a single regex, slot 0 is set directly. The search
+ /// never quits after the first match.
+ ///
+ /// Returns `Result::Quit` if the start state could not be computed,
+ /// `Result::NoMatch(at)` if the start state is already dead, and otherwise
+ /// whatever `exec_at` reports.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn forward_many(
+     prog: &'a Program,
+     cache: &ProgramCache,
+     matches: &mut [bool],
+     text: &[u8],
+     at: usize,
+ ) -> Result<usize> {
+     debug_assert!(matches.len() == prog.matches.len());
+     let mut guard = cache.borrow_mut();
+     let fwd = &mut guard.dfa;
+     let mut dfa = Fsm {
+         prog,
+         // Placeholder; the real start state is resolved just below.
+         start: 0,
+         at,
+         quit_after_match: false,
+         last_match_si: STATE_UNKNOWN,
+         last_cache_flush: at,
+         cache: &mut fwd.inner,
+     };
+     let (empty_flags, state_flags) = dfa.start_flags(text, at);
+     let resolved = dfa.start_state(&mut fwd.qcur, empty_flags, state_flags);
+     match resolved {
+         None => return Result::Quit,
+         Some(STATE_DEAD) => return Result::NoMatch(at),
+         Some(si) => dfa.start = si,
+     }
+     debug_assert!(dfa.start != STATE_UNKNOWN);
+     let result = dfa.exec_at(&mut fwd.qcur, &mut fwd.qnext, text);
+     if !result.is_match() {
+         return result;
+     }
+     if matches.len() == 1 {
+         // Only one regex in the set, so it must be the one that matched.
+         matches[0] = true;
+         return result;
+     }
+     debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
+     debug_assert!(dfa.last_match_si != STATE_DEAD);
+     // Every `Match` instruction in the last match state names a regex
+     // that matched.
+     for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
+         if let Inst::Match(slot) = dfa.prog[ip] {
+             matches[slot] = true;
+         }
+     }
+     result
+ }
+
+ /// Executes the DFA on a forward NFA, consuming `text` from `self.at` up to `text.len()` and then one EOF sentinel.
+ ///
+ /// {qcur,qnext} are scratch ordered sets which may be non-empty. `Result::Quit` is returned whenever a state lookup yields `None` or `STATE_QUIT`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_at(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ text: &[u8],
+ ) -> Result<usize> {
+ // For the most part, the DFA is basically:
+ //
+ // last_match = null
+ // while current_byte != EOF:
+ // si = current_state.next[current_byte]
+ // if si is match
+ // last_match = si
+ // return last_match
+ //
+ // However, we need to deal with a few things:
+ //
+ // 1. This is an *online* DFA, so the current state's next list
+ // may not point to anywhere yet, so we must go out and compute
+ // them. (They are then cached into the current state's next list
+ // to avoid re-computation.)
+ // 2. If we come across a state that is known to be dead (i.e., never
+ // leads to a match), then we can quit early.
+ // 3. If the caller just wants to know if a match occurs, then we
+ // can quit as soon as we know we have a match. (Full leftmost
+ // first semantics require continuing on.)
+ // 4. If we're in the start state, then we can use a pre-computed set
+ // of prefix literals to skip quickly along the input.
+ // 5. After the input is exhausted, we run the DFA on one symbol
+ // that stands for EOF. This is useful for handling empty width
+ // assertions.
+ // 6. We can't actually do state.next[byte]. Instead, we have to do
+ // state.next[byte_classes[byte]], which permits us to keep the
+ // 'next' list very small.
+ //
+ // Since there's a bunch of extra stuff we need to consider, we do some
+ // pretty hairy tricks to get the inner loop to run as fast as
+ // possible.
+ debug_assert!(!self.prog.is_reverse);
+
+ // The last match is the currently known ending match position. It is
+ // reported as an index to the most recent byte that resulted in a
+ // transition to a match state and is always stored in capture slot `1`
+ // when searching forwards. Its maximum value is `text.len()`.
+ let mut result = Result::NoMatch(self.at);
+ let (mut prev_si, mut next_si) = (self.start, self.start);
+ let mut at = self.at;
+ while at < text.len() {
+ // This is the real inner loop. We take advantage of special bits
+ // set in the state pointer to determine whether a state is in the
+ // "common" case or not. Specifically, the common case is a
+ // non-match non-start non-dead state that has already been
+ // computed. So long as we remain in the common case, this inner
+ // loop will chew through the input.
+ //
+ // We also unroll the loop 4 times to amortize the cost of checking
+ // whether we've consumed the entire input. We are also careful
+ // to make sure that `prev_si` always represents the previous state
+ // and `next_si` always represents the next state after the loop
+ // exits, even if it isn't always true inside the loop.
+ while next_si <= STATE_MAX && at < text.len() {
+ // Argument for safety is in the definition of next_si.
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ at += 1;
+ if prev_si > STATE_MAX || at + 2 >= text.len() { // special state, or fewer than three bytes left for the three remaining unrolled lookups
+ mem::swap(&mut prev_si, &mut next_si); // restore the prev/next naming before leaving the loop
+ break;
+ }
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ at += 1;
+ if next_si > STATE_MAX {
+ break;
+ }
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ at += 1;
+ if prev_si > STATE_MAX {
+ mem::swap(&mut prev_si, &mut next_si); // restore the prev/next naming before leaving the loop
+ break;
+ }
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ at += 1;
+ }
+ if next_si & STATE_MATCH > 0 {
+ // A match state is outside of the common case because it needs
+ // special case analysis. In particular, we need to record the
+ // last position as having matched and possibly quit the DFA if
+ // we don't need to keep matching.
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at - 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ prev_si = next_si;
+
+ // This permits short-circuiting when matching a regex set.
+ // In particular, if this DFA state contains only match states,
+ // then it's impossible to extend the set of matches since
+ // match states are final. Therefore, we can quit.
+ if self.prog.matches.len() > 1 {
+ let state = self.state(next_si);
+ let just_matches =
+ state.inst_ptrs().all(|ip| self.prog[ip].is_match());
+ if just_matches {
+ return result;
+ }
+ }
+
+ // Another inner loop! If the DFA stays in this particular
+ // match state, then we can rip through all of the input
+ // very quickly, and only recording the match location once
+ // we've left this particular state.
+ let cur = at;
+ while (next_si & !STATE_MATCH) == prev_si
+ && at + 2 < text.len()
+ {
+ // Argument for safety is in the definition of next_si.
+ next_si = unsafe {
+ self.next_si(next_si & !STATE_MATCH, text, at)
+ };
+ at += 1;
+ }
+ if at > cur { // the fast loop above consumed at least one byte
+ result = Result::Match(at - 2);
+ }
+ } else if next_si & STATE_START > 0 {
+ // A start state isn't in the common case because we may
+ // want to do quick prefix scanning. If the program doesn't
+ // have a detected prefix, then start states are actually
+ // considered common and this case is never reached.
+ debug_assert!(self.has_prefix());
+ next_si &= !STATE_START;
+ prev_si = next_si;
+ at = match self.prefix_at(text, at) {
+ None => return Result::NoMatch(text.len()),
+ Some(i) => i,
+ };
+ } else if next_si >= STATE_UNKNOWN {
+ if next_si == STATE_QUIT {
+ return Result::Quit;
+ }
+ // Finally, this corresponds to the case where the transition
+ // entered a state that can never lead to a match or a state
+ // that hasn't been computed yet. The latter being the "slow"
+ // path.
+ let byte = Byte::byte(text[at - 1]); // the byte that produced `next_si`; `at` has already been advanced past it
+ // We no longer care about the special bits in the state
+ // pointer.
+ prev_si &= STATE_MAX;
+ // Record where we are. This is used to track progress for
+ // determining whether we should quit if we've flushed the
+ // cache too much.
+ self.at = at;
+ next_si = match self.next_state(qcur, qnext, prev_si, byte) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(at),
+ Some(si) => si,
+ };
+ debug_assert!(next_si != STATE_UNKNOWN);
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at - 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ }
+ prev_si = next_si;
+ } else {
+ prev_si = next_si;
+ }
+ }
+
+ // Run the DFA once more on the special EOF sentinel value.
+ // We don't care about the special bits in the state pointer any more,
+ // so get rid of them.
+ prev_si &= STATE_MAX;
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(text.len()),
+ Some(si) => si & !STATE_START,
+ };
+ debug_assert!(prev_si != STATE_UNKNOWN);
+ if prev_si & STATE_MATCH > 0 {
+ prev_si &= !STATE_MATCH;
+ self.last_match_si = prev_si;
+ result = Result::Match(text.len()); // a match on the EOF sentinel ends at the end of the input
+ }
+ result
+ }
+
+ /// Executes the DFA on a reverse NFA, consuming `text` backwards from `self.at` down to index 0 and then one EOF sentinel; {qcur,qnext} are scratch sets.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_at_reverse(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ text: &[u8],
+ ) -> Result<usize> {
+ // The comments in `exec_at` above mostly apply here too. The main
+ // difference is that we move backwards over the input and we look for
+ // the longest possible match instead of the leftmost-first match.
+ //
+ // N.B. The code duplication here is regrettable. Efforts to improve
+ // it without sacrificing performance are welcome. ---AG
+ debug_assert!(self.prog.is_reverse);
+ let mut result = Result::NoMatch(self.at);
+ let (mut prev_si, mut next_si) = (self.start, self.start);
+ let mut at = self.at;
+ while at > 0 {
+ while next_si <= STATE_MAX && at > 0 {
+ // Argument for safety is in the definition of next_si.
+ at -= 1;
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ if prev_si > STATE_MAX || at <= 4 { // special state, or too close to index 0 for the three further decrements of this unrolled pass
+ mem::swap(&mut prev_si, &mut next_si); // restore the prev/next naming before leaving the loop
+ break;
+ }
+ at -= 1;
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ if next_si > STATE_MAX {
+ break;
+ }
+ at -= 1;
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ if prev_si > STATE_MAX {
+ mem::swap(&mut prev_si, &mut next_si); // restore the prev/next naming before leaving the loop
+ break;
+ }
+ at -= 1;
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ }
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at + 1); // `at` indexes the byte just consumed; the reported position is one past it
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ prev_si = next_si;
+ let cur = at;
+ while (next_si & !STATE_MATCH) == prev_si && at >= 2 { // fast path: keep going while the DFA stays in this same match state
+ // Argument for safety is in the definition of next_si.
+ at -= 1;
+ next_si = unsafe {
+ self.next_si(next_si & !STATE_MATCH, text, at)
+ };
+ }
+ if at < cur { // the fast loop above consumed at least one byte
+ result = Result::Match(at + 2);
+ }
+ } else if next_si >= STATE_UNKNOWN {
+ if next_si == STATE_QUIT {
+ return Result::Quit;
+ }
+ let byte = Byte::byte(text[at]); // the byte that produced `next_si` (`at` was decremented before the lookup)
+ prev_si &= STATE_MAX; // drop the special bits before the slow-path lookup
+ self.at = at; // record progress; `clear_cache` reads `self.at` for its give-up heuristic
+ next_si = match self.next_state(qcur, qnext, prev_si, byte) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(at),
+ Some(si) => si,
+ };
+ debug_assert!(next_si != STATE_UNKNOWN);
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at + 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ }
+ prev_si = next_si;
+ } else {
+ prev_si = next_si;
+ }
+ }
+
+ // Run the DFA once more on the special EOF sentinel value (the loop above only falls through once `at == 0`).
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(0),
+ Some(si) => si,
+ };
+ debug_assert!(prev_si != STATE_UNKNOWN);
+ if prev_si & STATE_MATCH > 0 {
+ prev_si &= !STATE_MATCH;
+ self.last_match_si = prev_si;
+ result = Result::Match(0); // a match on the EOF sentinel is reported at the very start of the input
+ }
+ result
+ }
+
+ /// Looks up the transition out of state `si` on the input byte `text[i]`
+ /// and returns the resulting state pointer.
+ ///
+ /// # Safety
+ ///
+ /// No bounds checks are performed. The caller must guarantee that
+ /// `i < text.len()` and that `si` is a state pointer obtained from the
+ /// transition table.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
+     // Three unchecked reads happen here, and each needs a justification:
+     //
+     // * `text[i]`: in bounds only because every call site guards the
+     //   call with `i < text.len()`.
+     // * `byte_classes[text[i]]`: the index is a `u8`, and the byte class
+     //   table is expected to hold one entry for every possible byte
+     //   value (NOTE(review): i.e. 256 entries; confirm against
+     //   `ByteClassSet.byte_classes` in `compile.rs`).
+     // * `trans[si + class]`: each state owns exactly one transition per
+     //   byte class, so a valid class always addresses a valid slot, even
+     //   when that slot still points at a state that has not been computed.
+     //   This additionally requires `si` to be correct, which holds as
+     //   long as state pointers are only ever read out of the transition
+     //   table.
+     debug_assert!(i < text.len());
+     let input = *text.get_unchecked(i) as usize;
+     debug_assert!(input < self.prog.byte_classes.len());
+     let class = *self.prog.byte_classes.get_unchecked(input) as usize;
+     self.cache.trans.next_unchecked(si, class)
+ }
+
+ /// Computes the next state given the current state and the current input
+ /// byte (which may be EOF).
+ ///
+ /// If STATE_DEAD is returned, then there is no valid state transition.
+ /// This implies that no permutation of future input can lead to a match
+ /// state.
+ ///
+ /// STATE_UNKNOWN can never be returned. `None` is returned when `cached_state` could not provide a state (the DFA gives up).
+ fn exec_byte(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ mut si: StatePtr,
+ b: Byte,
+ ) -> Option<StatePtr> {
+ use crate::prog::Inst::*;
+
+ // Initialize a queue with the current DFA state's NFA states.
+ qcur.clear();
+ for ip in self.state(si).inst_ptrs() {
+ qcur.insert(ip);
+ }
+
+ // Before inspecting the current byte, we may need to also inspect
+ // whether the position immediately preceding the current byte
+ // satisfies the empty assertions found in the current state.
+ //
+ // We only need to do this step if there are any empty assertions in
+ // the current state.
+ let is_word_last = self.state(si).flags().is_word();
+ let is_word = b.is_ascii_word();
+ if self.state(si).flags().has_empty() {
+ // Compute the flags immediately preceding the current byte.
+ // This means we only care about the "end" or "end line" flags.
+ // (The "start" flags are computed immediately following the
+ // current byte and are handled below.)
+ let mut flags = EmptyFlags::default();
+ if b.is_eof() {
+ flags.end = true;
+ flags.end_line = true;
+ } else if b.as_byte().map_or(false, |b| b == b'\n') {
+ flags.end_line = true;
+ }
+ if is_word_last == is_word {
+ flags.not_word_boundary = true;
+ } else {
+ flags.word_boundary = true;
+ }
+ // Now follow epsilon transitions from every NFA state, but make
+ // sure we only follow transitions that satisfy our flags.
+ qnext.clear();
+ for &ip in &*qcur {
+ self.follow_epsilons(usize_to_u32(ip), qnext, flags);
+ }
+ mem::swap(qcur, qnext); // qcur now holds the assertion-expanded set
+ }
+
+ // Now we set flags for immediately after the current byte. Since start
+ // states are processed separately, and are the only states that can
+ // have the StartText flag set, we therefore only need to worry about
+ // the StartLine flag here.
+ //
+ // We do also keep track of whether this DFA state contains a NFA state
+ // that is a matching state. This is precisely how we delay the DFA
+ // matching by one byte in order to process the special EOF sentinel
+ // byte. Namely, if this DFA state containing a matching NFA state,
+ // then it is the *next* DFA state that is marked as a match.
+ let mut empty_flags = EmptyFlags::default();
+ let mut state_flags = StateFlags::default();
+ empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
+ if b.is_ascii_word() {
+ state_flags.set_word();
+ }
+ // Now follow all epsilon transitions again, but only after consuming
+ // the current byte.
+ qnext.clear();
+ for &ip in &*qcur {
+ match self.prog[ip as usize] {
+ // These states never happen in a byte-based program.
+ Char(_) | Ranges(_) => unreachable!(),
+ // These states are handled when following epsilon transitions.
+ Save(_) | Split(_) | EmptyLook(_) => {}
+ Match(_) => {
+ state_flags.set_match();
+ if !self.continue_past_first_match() {
+ break;
+ } else if self.prog.matches.len() > 1
+ && !qnext.contains(ip as usize)
+ {
+ // If we are continuing on to find other matches,
+ // then keep a record of the match states we've seen.
+ qnext.insert(ip);
+ }
+ }
+ Bytes(ref inst) => {
+ if b.as_byte().map_or(false, |b| inst.matches(b)) {
+ self.follow_epsilons(
+ inst.goto as InstPtr,
+ qnext,
+ empty_flags,
+ );
+ }
+ }
+ }
+ }
+
+ let cache = if b.is_eof() && self.prog.matches.len() > 1 {
+ // If we're processing the last byte of the input and we're
+ // matching a regex set, then make the next state contain the
+ // previous states transitions. We do this so that the main
+ // matching loop can extract all of the match instructions.
+ mem::swap(qcur, qnext);
+ // And don't cache this state because it's totally bunk.
+ false
+ } else {
+ true
+ };
+
+ // We've now built up the set of NFA states that ought to comprise the
+ // next DFA state, so try to find it in the cache, and if it doesn't
+ // exist, cache it.
+ //
+ // N.B. We pass `&mut si` here because the cache may clear itself if
+ // it has gotten too full. When that happens, the location of the
+ // current state may change.
+ let mut next =
+ match self.cached_state(qnext, state_flags, Some(&mut si)) {
+ None => return None,
+ Some(next) => next,
+ };
+ if (self.start & !STATE_START) == next {
+ // Start states can never be match states since all matches are
+ // delayed by one byte.
+ debug_assert!(!self.state(next).flags().is_match());
+ next = self.start_ptr(next); // NOTE(review): presumably re-tags the pointer as a start state; confirm in `start_ptr`
+ }
+ if next <= STATE_MAX && self.state(next).flags().is_match() {
+ next |= STATE_MATCH; // tag the pointer so the search loops can detect the match from the pointer alone
+ }
+ debug_assert!(next != STATE_UNKNOWN);
+ // And now store our state in the current state's next list.
+ if cache {
+ let cls = self.byte_class(b);
+ self.cache.trans.set_next(si, cls, next);
+ }
+ Some(next)
+ }
+
+ /// Collects into `q` every NFA state reachable from `ip` (inclusive)
+ /// through epsilon transitions alone.
+ ///
+ /// Empty width assertions are conditional epsilon transitions: they are
+ /// crossed only when `flags`, which should describe the current location
+ /// in the input, satisfy them.
+ ///
+ /// Which flags may be set depends on the location:
+ ///
+ /// * At the empty string before a byte, only end line and/or end text.
+ /// * At a real input byte, only start line and/or start text.
+ /// * When computing the initial state, any of them. If the search begins
+ ///   at the beginning of the input, start text and start line should be
+ ///   set, and for an empty input end text and end line as well. If it
+ ///   begins later, start line should be set only when the preceding byte
+ ///   is `\n`, and end line should never be set (a following `\n` is dealt
+ ///   with by a subsequent DFA state).
+ fn follow_epsilons(
+     &mut self,
+     ip: InstPtr,
+     q: &mut SparseSet,
+     flags: EmptyFlags,
+ ) {
+     use crate::prog::EmptyLook::*;
+     use crate::prog::Inst::*;
+
+     // An explicit stack stands in for recursion while walking the NFA.
+     self.cache.stack.push(ip);
+     while let Some(mut ip) = self.cache.stack.pop() {
+         // Chase a single chain of states for as long as possible so that
+         // the stack is only touched at splits.
+         loop {
+             // A state already in the set has been fully explored.
+             if q.contains(ip as usize) {
+                 break;
+             }
+             q.insert(ip as usize);
+             match self.prog[ip as usize] {
+                 Char(_) | Ranges(_) => unreachable!(),
+                 Match(_) | Bytes(_) => break,
+                 EmptyLook(ref inst) => {
+                     // The assertion is crossed only if the flag that
+                     // corresponds to it is set.
+                     let satisfied = match inst.look {
+                         StartLine => flags.start_line,
+                         EndLine => flags.end_line,
+                         StartText => flags.start,
+                         EndText => flags.end,
+                         WordBoundaryAscii | WordBoundary => {
+                             flags.word_boundary
+                         }
+                         NotWordBoundaryAscii | NotWordBoundary => {
+                             flags.not_word_boundary
+                         }
+                     };
+                     if !satisfied {
+                         break;
+                     }
+                     ip = inst.goto as InstPtr;
+                 }
+                 Save(ref inst) => ip = inst.goto as InstPtr,
+                 Split(ref inst) => {
+                     // Defer the second branch and continue with the first.
+                     self.cache.stack.push(inst.goto2 as InstPtr);
+                     ip = inst.goto1 as InstPtr;
+                 }
+             }
+         }
+     }
+ }
+
+ /// Returns a pointer to the DFA state described by the NFA state set `q`
+ /// and `state_flags`, creating and caching the state if it does not exist
+ /// yet.
+ ///
+ /// `q` should hold a single NFA state together with everything reachable
+ /// from it without consuming input. The match flag in `state_flags`
+ /// should be set if and only if the preceding DFA state contains an NFA
+ /// match state; this is what delays a match by one byte so that the EOF
+ /// sentinel can be accounted for.
+ ///
+ /// `Some(STATE_DEAD)` is returned when no key can be built for the set.
+ /// If the cache has grown past the program's size limit it is wiped
+ /// first, and `None` is returned when that wipe is refused.
+ ///
+ /// `current_state` should be given when there is one, because it has to
+ /// survive a wipe; it is a mutable pointer since the state's location may
+ /// change. Start states are always preserved and need not be passed.
+ fn cached_state(
+     &mut self,
+     q: &SparseSet,
+     mut state_flags: StateFlags,
+     current_state: Option<&mut StatePtr>,
+ ) -> Option<StatePtr> {
+     // Building the key also records in `state_flags` whether `q` holds
+     // empty width assertions, which `exec_byte` later uses to decide
+     // whether epsilon transitions preceding the byte must be followed.
+     // No key at all means the state is dead.
+     let key = match self.cached_state_key(q, &mut state_flags) {
+         Some(key) => key,
+         None => return Some(STATE_DEAD),
+     };
+     // Fast path: the state has been built before.
+     if let Some(known) = self.cache.compiled.get_ptr(&key) {
+         return Some(known);
+     }
+     // Over budget: wipe the cache, or give up if wiping is refused.
+     let over_budget = self.approximate_size() > self.prog.dfa_size_limit;
+     if over_budget && !self.clear_cache_and_save(current_state) {
+         return None;
+     }
+     // Make room for the new state and register it.
+     self.add_state(key)
+ }
+
+ /// Builds the cache key for the DFA state made up of the ordered NFA
+ /// state set `q` and `state_flags`.
+ ///
+ /// Two (set, flags) pairs that cannot be told apart by any input must
+ /// produce equal keys. The match flag in `state_flags` should be set if
+ /// and only if the previous NFA states contained a match state.
+ ///
+ /// As a side effect, the "empty" flag is set on `state_flags` when `q`
+ /// contains an empty width assertion. `None` is returned for a dead
+ /// state.
+ fn cached_state_key(
+     &mut self,
+     q: &SparseSet,
+     state_flags: &mut StateFlags,
+ ) -> Option<State> {
+     use crate::prog::Inst::*;
+
+     // The key lists every instruction that identifies the state. Pure
+     // epsilon transitions (Save and Split) carry no information and are
+     // left out. Empty width assertions are epsilon transitions as well,
+     // but conditional ones, so they do belong in the key.
+
+     // Borrow the reusable scratch buffer for the duration of this call.
+     let mut key_bytes =
+         mem::replace(&mut self.cache.insts_scratch_space, vec![]);
+     key_bytes.clear();
+     // The first byte is a placeholder for the flags.
+     key_bytes.push(0);
+
+     let mut last_ip = 0;
+     for &nfa_state in q {
+         let ip = usize_to_u32(nfa_state);
+         match self.prog[ip as usize] {
+             Char(_) | Ranges(_) => unreachable!(),
+             Save(_) | Split(_) => {}
+             Bytes(_) => push_inst_ptr(&mut key_bytes, &mut last_ip, ip),
+             EmptyLook(_) => {
+                 state_flags.set_empty();
+                 push_inst_ptr(&mut key_bytes, &mut last_ip, ip)
+             }
+             Match(_) => {
+                 push_inst_ptr(&mut key_bytes, &mut last_ip, ip);
+                 if !self.continue_past_first_match() {
+                     break;
+                 }
+             }
+         }
+     }
+     // With nothing but the flag byte in the key and no match carried
+     // over from the previous NFA states, the state is dead: no further
+     // input can ever leave it.
+     let key = if key_bytes.len() != 1 || state_flags.is_match() {
+         let StateFlags(bits) = *state_flags;
+         key_bytes[0] = bits;
+         Some(State { data: Arc::from(&*key_bytes) })
+     } else {
+         None
+     };
+     // Hand the scratch buffer back so its allocation can be reused.
+     self.cache.insts_scratch_space = key_bytes;
+     key
+ }
+
+ /// Wipes the cache while keeping `current_state` (when given) alive.
+ ///
+ /// The state is copied out before the wipe and re-inserted afterwards;
+ /// since its location may differ after that, the pointer behind
+ /// `current_state` is updated in place.
+ ///
+ /// Returns false if the cache was not cleared, in which case the DFA
+ /// should give up.
+ fn clear_cache_and_save(
+     &mut self,
+     current_state: Option<&mut StatePtr>,
+ ) -> bool {
+     // An empty cache has nothing to wipe.
+     if self.cache.compiled.is_empty() {
+         return true;
+     }
+     let si = match current_state {
+         Some(si) => si,
+         None => return self.clear_cache(),
+     };
+     let saved = self.state(*si).clone();
+     if !self.clear_cache() {
+         return false;
+     }
+     // The cache was just cleared, so the next state pointer cannot
+     // exceed STATE_MAX and the unwrap cannot fail.
+     *si = self.restore_state(saved).unwrap();
+     true
+ }
+
+ /// Empties the state cache, keeping only the current start state and,
+ /// if there is one, the last match state.
+ ///
+ /// Returns false if the cache was not cleared, in which case the DFA
+ /// should give up.
+ fn clear_cache(&mut self) -> bool {
+     // Give up when the DFA is advancing too "slowly". The heuristic is
+     // borrowed from RE2: processing 10 or fewer bytes per state since
+     // the previous flush counts as too slow. A few flushes are always
+     // tolerated before calling it quits.
+     let nstates = self.cache.compiled.len();
+     if self.cache.flush_count >= 3 && self.at >= self.last_cache_flush {
+         let progress = self.at - self.last_cache_flush;
+         if progress <= 10 * nstates {
+             return false;
+         }
+     }
+     // Bookkeeping for the next time this heuristic runs.
+     self.last_cache_flush = self.at;
+     self.cache.flush_count += 1;
+
+     // Copy out the states that must survive, then flush everything.
+     let start = self.state(self.start & !STATE_START).clone();
+     let last_match = if self.last_match_si <= STATE_MAX {
+         Some(self.state(self.last_match_si).clone())
+     } else {
+         None
+     };
+     self.cache.reset_size();
+     self.cache.trans.clear();
+     self.cache.compiled.clear();
+     for slot in &mut self.cache.start_states {
+         *slot = STATE_UNKNOWN;
+     }
+     // The cache was just cleared, so the next state pointer cannot
+     // exceed STATE_MAX and neither unwrap below can fail.
+     let start_ptr = self.restore_state(start).unwrap();
+     self.start = self.start_ptr(start_ptr);
+     if let Some(saved) = last_match {
+         self.last_match_si = self.restore_state(saved).unwrap();
+     }
+     true
+ }
+
+ /// Puts `state` back into the cache and returns a pointer to it.
+ ///
+ /// A state that is already present is not added a second time; the
+ /// existing pointer is returned instead.
+ fn restore_state(&mut self, state: State) -> Option<StatePtr> {
+     let existing = self.cache.compiled.get_ptr(&state);
+     if existing.is_some() {
+         return existing;
+     }
+     self.add_state(state)
+ }
+
+ /// Returns the state reached from `si` on the byte `b`, using
+ /// {qcur,qnext} as scratch space for ordered NFA states.
+ ///
+ /// The transition table is consulted first; only when the transition is
+ /// still unknown is the next state computed (and cached) by `exec_byte`.
+ ///
+ /// The result points either to a real state or to STATE_DEAD, never to
+ /// STATE_UNKNOWN.
+ ///
+ /// None is returned if the transition is marked STATE_QUIT or if a new
+ /// state could not be allocated (i.e., the DFA ran out of space and
+ /// thinks it's running too slowly).
+ fn next_state(
+     &mut self,
+     qcur: &mut SparseSet,
+     qnext: &mut SparseSet,
+     si: StatePtr,
+     b: Byte,
+ ) -> Option<StatePtr> {
+     // A dead state only ever leads to itself.
+     if si == STATE_DEAD {
+         return Some(STATE_DEAD);
+     }
+     let cls = self.byte_class(b);
+     let known = self.cache.trans.next(si, cls);
+     if known == STATE_UNKNOWN {
+         self.exec_byte(qcur, qnext, si, b)
+     } else if known == STATE_QUIT {
+         None
+     } else {
+         Some(known)
+     }
+ }
+
+ /// Returns the start state that applies to the given flags, which should
+ /// describe the position in the input where the search begins.
+ ///
+ /// A start state that has been computed before is served from the cache
+ /// of start states. Otherwise it is computed, cached and a pointer to it
+ /// is returned.
+ ///
+ /// The result may be STATE_DEAD but never STATE_UNKNOWN. None is returned
+ /// if `cached_state` gives up.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start_state(
+     &mut self,
+     q: &mut SparseSet,
+     empty_flags: EmptyFlags,
+     state_flags: StateFlags,
+ ) -> Option<StatePtr> {
+     // Each flag that can influence the start state contributes one bit
+     // to the index of its cache slot; bit `n` is the n-th entry below.
+     // The match flag is left out: matches are delayed by one byte, so a
+     // start state is never a match state.
+     let bits = [
+         empty_flags.start,
+         empty_flags.end,
+         empty_flags.start_line,
+         empty_flags.end_line,
+         empty_flags.word_boundary,
+         empty_flags.not_word_boundary,
+         state_flags.is_word(),
+     ];
+     let flagi = bits
+         .iter()
+         .enumerate()
+         .fold(0usize, |acc, (shift, &set)| acc | ((set as usize) << shift));
+     let cached = self.cache.start_states[flagi];
+     if cached != STATE_UNKNOWN {
+         return Some(cached);
+     }
+     q.clear();
+     let start = usize_to_u32(self.prog.start);
+     self.follow_epsilons(start, q, empty_flags);
+     // Because every match is delayed by one byte, this state cannot be a
+     // match state. Even an empty match on an empty string is only
+     // reported once the DFA has processed the EOF sentinel byte.
+     let sp = self.cached_state(q, state_flags, None)?;
+     let sp = self.start_ptr(sp);
+     self.cache.start_states[flagi] = sp;
+     Some(sp)
+ }
+
+ /// Derives the empty-width and state flags that hold at position `at` in
+ /// `text` for a forward search.
+ ///
+ /// Only meant for running the DFA forwards over the input.
+ fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
+     // The byte just before `at`, or `None` at the very start of the text.
+     let before = if at == 0 { None } else { Some(text[at - 1]) };
+     let word_before =
+         before.map_or(false, |b| Byte::byte(b).is_ascii_word());
+     let word_here =
+         text.get(at).map_or(false, |&b| Byte::byte(b).is_ascii_word());
+
+     let mut state_flags = StateFlags::default();
+     if word_before {
+         state_flags.set_word();
+     }
+
+     let mut empty_flags = EmptyFlags::default();
+     empty_flags.start = at == 0;
+     empty_flags.end = text.is_empty();
+     empty_flags.start_line = before.map_or(true, |b| b == b'\n');
+     empty_flags.end_line = text.is_empty();
+     // The position is a word boundary exactly when the bytes on its two
+     // sides disagree about being ASCII word bytes.
+     if word_here == word_before {
+         empty_flags.not_word_boundary = true;
+     } else {
+         empty_flags.word_boundary = true;
+     }
+     (empty_flags, state_flags)
+ }
+
+ /// Derives the empty-width and state flags that hold at position `at` in
+ /// `text` for a reverse search.
+ ///
+ /// Only meant for running the DFA in reverse over the input, where the
+ /// end of the text plays the role of the start.
+ fn start_flags_reverse(
+     &self,
+     text: &[u8],
+     at: usize,
+ ) -> (EmptyFlags, StateFlags) {
+     // The byte at `at`, or `None` when `at` sits at the end of the text.
+     let after = if at == text.len() { None } else { Some(text[at]) };
+     let word_after = after.map_or(false, |b| Byte::byte(b).is_ascii_word());
+     let word_before = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
+
+     let mut state_flags = StateFlags::default();
+     if word_after {
+         state_flags.set_word();
+     }
+
+     let mut empty_flags = EmptyFlags::default();
+     empty_flags.start = at == text.len();
+     empty_flags.end = text.is_empty();
+     empty_flags.start_line = after.map_or(true, |b| b == b'\n');
+     empty_flags.end_line = text.is_empty();
+     // The position is a word boundary exactly when the bytes on its two
+     // sides disagree about being ASCII word bytes.
+     if word_before == word_after {
+         empty_flags.not_word_boundary = true;
+     } else {
+         empty_flags.word_boundary = true;
+     }
+     (empty_flags, state_flags)
+ }
+
+ /// Resolves the state pointer `si` to the `State` it refers to.
+ ///
+ /// Panics if the cache holds no state for `si`.
+ fn state(&self, si: StatePtr) -> &State {
+     let found = self.cache.compiled.get_state(si);
+     found.unwrap()
+ }
+
+ /// Registers `state` as a new DFA state and returns its pointer.
+ ///
+ /// A fresh row is reserved in self.cache.trans for the state's outgoing
+ /// transitions; callers fill that row in through the returned StatePtr.
+ ///
+ /// None means the transition table could not hand out another state
+ /// pointer, in which case the DFA should quit.
+ fn add_state(&mut self, state: State) -> Option<StatePtr> {
+     // `add` fails once the next state pointer would exceed the pointer
+     // limit. The cache limit normally stops us long before that, unless
+     // a caller configured an enormous cache size.
+     let si = self.cache.trans.add()?;
+     // With a Unicode word boundary in the program, every non-ASCII byte
+     // leads to STATE_QUIT so that the DFA gives up and another matching
+     // engine takes over.
+     if self.prog.has_unicode_word_boundary {
+         for b in 128u8..=255 {
+             let cls = self.byte_class(Byte::byte(b));
+             self.cache.trans.set_next(si, cls, STATE_QUIT);
+         }
+     }
+     // Account for the memory of the new state, then store and index it so
+     // it can be found again later.
+     let heap_used = self.cache.trans.state_heap_size()
+         + state.data.len()
+         + (2 * mem::size_of::<State>())
+         + mem::size_of::<StatePtr>();
+     self.cache.size += heap_used;
+     self.cache.compiled.insert(state, si);
+     // The transition table and the state map must stay in sync.
+     debug_assert!(
+         self.cache.compiled.len() == self.cache.trans.num_states()
+     );
+     Some(si)
+ }
+
+ /// Searches `text[at..]` for the next literal prefix of the regex and
+ /// returns its absolute start offset in `text`.
+ ///
+ /// Per the prefix searcher's contract, a regex without literal prefixes
+ /// yields the current position, and None means prefixes exist but none
+ /// occurs in the remaining text.
+ ///
+ /// This should only be called when the DFA is in a start state.
+ fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
+     let (rel_start, _) = self.prog.prefixes.find(&text[at..])?;
+     Some(at + rel_start)
+ }
+
+ /// Number of byte classes needed to tell apart the transitions of a
+ /// state, including one extra class for the special EOF byte.
+ ///
+ /// invariant: num_byte_classes() == len(State.next)
+ fn num_byte_classes(&self) -> usize {
+     // Classes are numbered from zero, so the class of byte 255 plus one
+     // is the count of ordinary classes.
+     let ordinary = self.prog.byte_classes[255] as usize + 1;
+     // One more for the EOF sentinel.
+     ordinary + 1
+ }
+
+ /// Maps an input byte, or the special EOF sentinel, to its byte class.
+ ///
+ /// The EOF sentinel always gets the last class.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn byte_class(&self, b: Byte) -> usize {
+     if let Some(byte) = b.as_byte() {
+         self.u8_class(byte)
+     } else {
+         self.num_byte_classes() - 1
+     }
+ }
+
+ /// Byte-class lookup for a plain u8 (no EOF sentinel involved).
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn u8_class(&self, b: u8) -> usize {
+     let class = self.prog.byte_classes[usize::from(b)];
+     class as usize
+ }
+
+ /// Reports whether the search must keep going after the first match.
+ ///
+ /// The DFA preserves leftmost-first semantics by not following NFA
+ /// transitions once a match has been seen. That is given up in two
+ /// situations: reverse searches, which want the longest match, and
+ /// regex sets (more than one pattern), which want all possible matches.
+ fn continue_past_first_match(&self) -> bool {
+     if self.prog.is_reverse {
+         return true;
+     }
+     self.prog.matches.len() > 1
+ }
+
+ /// Reports whether a literal prefix can be used to skip ahead quickly.
+ ///
+ /// That requires a forward, unanchored program with a non-empty prefix
+ /// set.
+ fn has_prefix(&self) -> bool {
+     let prog = &self.prog;
+     !(prog.is_reverse
+         || prog.prefixes.is_empty()
+         || prog.is_anchored_start)
+ }
+
+ /// Tags the state pointer with the STATE_START bit, but only when there
+ /// is a prefix to scan for.
+ ///
+ /// Without a prefix there is nothing to gain from treating the start
+ /// state specially, so the pointer is returned untouched.
+ fn start_ptr(&self, si: StatePtr) -> StatePtr {
+     if !self.has_prefix() {
+         return si;
+     }
+     si | STATE_START
+ }
+
+ /// Estimates the heap space currently used by the DFA: the tracked cache
+ /// size plus the approximate size of the program.
+ ///
+ /// The estimate decides when the state cache has to be wiped. For some
+ /// regexes and inputs a new state could be created for every byte of
+ /// input, so memory use is bounded by a cache.
+ fn approximate_size(&self) -> usize {
+     let program_bytes = self.prog.approximate_size();
+     self.cache.size + program_bytes
+ }
+}
+
+/// An abstraction for representing a map of states. The map supports two
+/// different ways of state lookup. One is fast constant time access via a
+/// state pointer. The other is a hashmap lookup based on the DFA's
+/// constituent NFA states.
+///
+/// A DFA state internally uses an Arc such that we only need to store the
+/// set of NFA states on the heap once, even though we support looking up
+/// states by two different means. A more natural way to express this might
+/// use raw pointers, but an Arc is safe and effectively achieves the same
+/// thing.
+#[derive(Debug)]
+struct StateMap {
+ /// Lookup from a state to its pointer. Each key is a clone of the
+ /// matching entry in `states` (see `insert`), so the two collections
+ /// always describe the same states; both are emptied together by
+ /// `clear`.
+ map: HashMap<State, StatePtr>,
+ /// Our set of states, in insertion order. Note that
+ /// `StatePtr / num_byte_classes` indexes this Vec, not a bare `StatePtr`.
+ states: Vec<State>,
+ /// The number of byte classes in the DFA. Used to index `states`.
+ num_byte_classes: usize,
+}
+
+impl StateMap {
+    /// Creates an empty map for a DFA with `num_byte_classes` byte classes.
+    fn new(num_byte_classes: usize) -> StateMap {
+        StateMap {
+            map: HashMap::new(),
+            states: Vec::new(),
+            num_byte_classes,
+        }
+    }
+
+    /// Number of states stored.
+    fn len(&self) -> usize {
+        self.states.len()
+    }
+
+    /// True when no state has been stored.
+    fn is_empty(&self) -> bool {
+        self.states.is_empty()
+    }
+
+    /// Finds the pointer previously registered for `state`, if any.
+    fn get_ptr(&self, state: &State) -> Option<StatePtr> {
+        self.map.get(state).copied()
+    }
+
+    /// Finds the state registered under pointer `si`, if any.
+    fn get_state(&self, si: StatePtr) -> Option<&State> {
+        // State pointers advance by one table row per state, so dividing
+        // by the row width gives the position in `states`.
+        let index = si as usize / self.num_byte_classes;
+        self.states.get(index)
+    }
+
+    /// Stores `state` under pointer `si`, making it reachable through both
+    /// lookup paths.
+    fn insert(&mut self, state: State, si: StatePtr) {
+        let key = state.clone();
+        self.map.insert(key, si);
+        self.states.push(state);
+    }
+
+    /// Drops every stored state.
+    fn clear(&mut self) {
+        self.states.clear();
+        self.map.clear();
+    }
+}
+
+impl Transitions {
+    /// Builds an empty transition table.
+    ///
+    /// `num_byte_classes` is the stride of the table: each state owns that
+    /// many consecutive transition slots.
+    fn new(num_byte_classes: usize) -> Transitions {
+        Transitions { table: Vec::new(), num_byte_classes }
+    }
+
+    /// Number of states that currently have a row in the table.
+    fn num_states(&self) -> usize {
+        let slots = self.table.len();
+        slots / self.num_byte_classes
+    }
+
+    /// Appends one row of STATE_UNKNOWN transitions and returns a pointer to
+    /// it. The pointer is the offset of the row within the table.
+    ///
+    /// Returns None when that offset would be larger than STATE_MAX.
+    fn add(&mut self) -> Option<StatePtr> {
+        let offset = self.table.len();
+        if offset > STATE_MAX as usize {
+            return None;
+        }
+        let blank_row = repeat(STATE_UNKNOWN).take(self.num_byte_classes);
+        self.table.extend(blank_row);
+        Some(usize_to_u32(offset))
+    }
+
+    /// Removes every state from the table.
+    fn clear(&mut self) {
+        self.table.clear();
+    }
+
+    /// Records that state `si` moves to `next` on byte class `cls`.
+    fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
+        let slot = si as usize + cls;
+        self.table[slot] = next;
+    }
+
+    /// Reads the transition of state `si` on byte class `cls`.
+    fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
+        let slot = si as usize + cls;
+        self.table[slot]
+    }
+
+    /// Heap bytes taken by the transition row of a single state.
+    fn state_heap_size(&self) -> usize {
+        mem::size_of::<StatePtr>() * self.num_byte_classes
+    }
+
+    /// Like `next`, but without the bounds check on the table access.
+    ///
+    /// # Safety
+    ///
+    /// `si as usize + cls` must be a valid index into the table. The
+    /// assertions below only run in debug builds.
+    unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
+        debug_assert!((si as usize) < self.table.len());
+        debug_assert!(cls < self.num_byte_classes);
+        let slot = si as usize + cls;
+        *self.table.get_unchecked(slot)
+    }
+}
+
+impl StateFlags {
+    /// Bit 0: the match flag.
+    fn is_match(&self) -> bool {
+        (self.0 & 0b001) != 0
+    }
+
+    /// Turns on the match flag (bit 0).
+    fn set_match(&mut self) {
+        self.0 = self.0 | 0b001;
+    }
+
+    /// Bit 1: the word flag.
+    fn is_word(&self) -> bool {
+        (self.0 & 0b010) != 0
+    }
+
+    /// Turns on the word flag (bit 1).
+    fn set_word(&mut self) {
+        self.0 = self.0 | 0b010;
+    }
+
+    /// Bit 2: the empty flag.
+    fn has_empty(&self) -> bool {
+        (self.0 & 0b100) != 0
+    }
+
+    /// Turns on the empty flag (bit 2).
+    fn set_empty(&mut self) {
+        self.0 = self.0 | 0b100;
+    }
+}
+
+impl Byte {
+    /// Wraps an ordinary input byte.
+    fn byte(b: u8) -> Self {
+        Byte(u16::from(b))
+    }
+
+    /// The end-of-input sentinel, stored as 256 (one past the largest u8).
+    fn eof() -> Self {
+        Byte(256)
+    }
+
+    /// True for the end-of-input sentinel.
+    fn is_eof(&self) -> bool {
+        self.0 == 256
+    }
+
+    /// True for ASCII letters, ASCII digits and `_`; always false for the
+    /// end-of-input sentinel.
+    fn is_ascii_word(&self) -> bool {
+        match self.as_byte() {
+            Some(b) => b == b'_' || b.is_ascii_alphanumeric(),
+            None => false,
+        }
+    }
+
+    /// The wrapped byte, or None for the end-of-input sentinel.
+    fn as_byte(&self) -> Option<u8> {
+        match self.0 {
+            256 => None,
+            value => Some(value as u8),
+        }
+    }
+}
+
+impl fmt::Debug for State {
+    /// Prints the state as its flags plus the list of instruction pointers.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let insts = self.inst_ptrs().collect::<Vec<usize>>();
+        let flags = self.flags();
+        let mut out = f.debug_struct("State");
+        out.field("flags", &flags);
+        out.field("insts", &insts);
+        out.finish()
+    }
+}
+
+impl fmt::Debug for Transitions {
+    /// Prints the table as a map from state index to that state's row.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let stride = self.num_byte_classes;
+        let mut out = f.debug_map();
+        for si in 0..self.num_states() {
+            let row = &self.table[si * stride..si * stride + stride];
+            out.entry(&si.to_string(), &TransitionsRow(row));
+        }
+        out.finish()
+    }
+}
+
+/// The transitions of one state, borrowed from the table for printing.
+struct TransitionsRow<'a>(&'a [StatePtr]);
+
+impl<'a> fmt::Debug for TransitionsRow<'a> {
+    /// Prints a map from byte class (rendered via `vb`) to target state.
+    /// Unknown transitions are left out and dead ones are shown as "DEAD".
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_map();
+        for (cls, &target) in self.0.iter().enumerate() {
+            if target == STATE_UNKNOWN {
+                continue;
+            }
+            if target == STATE_DEAD {
+                out.entry(&vb(cls), &"DEAD");
+            } else {
+                out.entry(&vb(cls), &target.to_string());
+            }
+        }
+        out.finish()
+    }
+}
+
+impl fmt::Debug for StateFlags {
+    /// Prints each flag by name instead of the raw bit set.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_struct("StateFlags");
+        out.field("is_match", &self.is_match());
+        out.field("is_word", &self.is_word());
+        out.field("has_empty", &self.has_empty());
+        out.finish()
+    }
+}
+
+/// Renders a byte value as a readable escaped string; any value above the
+/// u8 range is rendered as "EOF".
+fn vb(b: usize) -> String {
+    use std::ascii::escape_default;
+
+    if b <= usize::from(::std::u8::MAX) {
+        let escaped: Vec<u8> = escape_default(b as u8).collect();
+        String::from_utf8_lossy(&escaped).into_owned()
+    } else {
+        "EOF".to_owned()
+    }
+}
+
+/// Converts `n` to u32, panicking when the value does not fit.
+fn usize_to_u32(n: usize) -> u32 {
+    let limit = ::std::u32::MAX as u64;
+    if (n as u64) <= limit {
+        return n as u32;
+    }
+    panic!("BUG: {} is too big to fit into u32", n)
+}
+
+/// Formats a state pointer as its STATE_MAX-masked value followed by a
+/// label for every special value or flag bit that applies to it.
+#[allow(dead_code)] // useful for debugging
+fn show_state_ptr(si: StatePtr) -> String {
+    let labels = [
+        (si == STATE_UNKNOWN, " (unknown)"),
+        (si == STATE_DEAD, " (dead)"),
+        (si == STATE_QUIT, " (quit)"),
+        (si & STATE_START > 0, " (start)"),
+        (si & STATE_MATCH > 0, " (match)"),
+    ];
+    let mut s = format!("{:?}", si & STATE_MAX);
+    for &(applies, label) in labels.iter() {
+        if applies {
+            s.push_str(label);
+        }
+    }
+    s
+}
+
+/// Appends `n` to `data` as a zigzag-encoded varint: non-negative values
+/// map to even numbers and negative values to odd numbers.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+    let doubled = (n as u32) << 1;
+    let zigzag = if n < 0 { !doubled } else { doubled };
+    write_varu32(data, zigzag)
+}
+
+/// Decodes a zigzag-encoded varint from the front of `data`, returning the
+/// value together with the byte count reported by `read_varu32`.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+    let (un, nread) = read_varu32(data);
+    let half = (un >> 1) as i32;
+    // The lowest bit carries the sign.
+    let n = if un & 1 == 0 { half } else { !half };
+    (n, nread)
+}
+
+/// Appends `n` to `data` as a varint: seven payload bits per byte, lowest
+/// group first, with the high bit set on every byte except the last.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+    loop {
+        let low7 = (n & 0b0111_1111) as u8;
+        n >>= 7;
+        if n == 0 {
+            data.push(low7);
+            return;
+        }
+        data.push(low7 | 0b1000_0000);
+    }
+}
+
+/// Decodes one varint from the front of `data`, returning the value and
+/// the number of bytes consumed.
+///
+/// Returns (0, 0) when `data` ends before a terminating byte (one with
+/// the high bit clear) is found.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+    let mut value: u32 = 0;
+    let mut shift: u32 = 0;
+    let mut consumed: usize = 0;
+    for &byte in data {
+        consumed += 1;
+        value |= u32::from(byte & 0b0111_1111) << shift;
+        if byte & 0b1000_0000 == 0 {
+            return (value, consumed);
+        }
+        shift += 7;
+    }
+    (0, 0)
+}
+
+#[cfg(test)]
+mod tests {
+
+ use super::{
+ push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
+ State, StateFlags,
+ };
+ use quickcheck::{quickcheck, Gen, QuickCheck};
+ use std::sync::Arc;
+
+ #[test]
+ fn prop_state_encode_decode() {
+ fn p(mut ips: Vec<u32>, flags: u8) -> bool {
+ // Known limitation: the encoding scheme can't handle instruction
+ // pointers at or above 2**31. We should fix that, but it seems
+ // unlikely to occur in real code due to the amount of memory
+ // required for such a state machine. So for now, we just clamp
+ // our test data to 2**31 - 1.
+ for ip in &mut ips {
+ if *ip >= 1 << 31 {
+ *ip = (1 << 31) - 1;
+ }
+ }
+ let mut data = vec![flags]; // the flags byte comes first, then the encoded pointers
+ let mut prev = 0;
+ for &ip in ips.iter() {
+ push_inst_ptr(&mut data, &mut prev, ip);
+ }
+ let state = State { data: Arc::from(&data[..]) };
+
+ let expected: Vec<usize> =
+ ips.into_iter().map(|ip| ip as usize).collect();
+ let got: Vec<usize> = state.inst_ptrs().collect();
+ expected == got && state.flags() == StateFlags(flags) // pointers and flags must both round-trip
+ }
+ QuickCheck::new()
+ .gen(Gen::new(10_000))
+ .quickcheck(p as fn(Vec<u32>, u8) -> bool);
+ }
+
+ #[test]
+ fn prop_read_write_u32() {
+ fn p(n: u32) -> bool {
+ let mut buf = vec![];
+ write_varu32(&mut buf, n);
+ let (got, nread) = read_varu32(&buf);
+ nread == buf.len() && got == n // the whole buffer is consumed and the value round-trips
+ }
+ quickcheck(p as fn(u32) -> bool);
+ }
+
+ #[test]
+ fn prop_read_write_i32() {
+ fn p(n: i32) -> bool {
+ let mut buf = vec![];
+ write_vari32(&mut buf, n);
+ let (got, nread) = read_vari32(&buf);
+ nread == buf.len() && got == n // the whole buffer is consumed and the value round-trips
+ }
+ quickcheck(p as fn(i32) -> bool);
+ }
+}
diff --git a/third_party/rust/regex/src/error.rs b/third_party/rust/regex/src/error.rs
new file mode 100644
index 0000000000..3e0ec75210
--- /dev/null
+++ b/third_party/rust/regex/src/error.rs
@@ -0,0 +1,71 @@
+use std::fmt;
+use std::iter::repeat;
+
+/// An error that occurred during parsing or compiling a regular expression.
+#[derive(Clone, PartialEq)]
+pub enum Error {
+ /// A syntax error; the payload is the error message text.
+ Syntax(String),
+ /// The compiled program exceeded the set size limit.
+ /// The argument is the size limit that was imposed, in bytes.
+ CompiledTooBig(usize),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl ::std::error::Error for Error {
+    // TODO: Remove this method entirely on the next breaking semver release.
+    /// Short description: the syntax error text itself, or a fixed message
+    /// for an oversized program.
+    #[allow(deprecated)]
+    fn description(&self) -> &str {
+        match self {
+            Error::Syntax(err) => err.as_str(),
+            Error::CompiledTooBig(_) => "compiled program too big",
+            Error::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    /// Writes the user-facing message: the syntax error text as is, or a
+    /// sentence naming the size limit that was exceeded.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Error::Syntax(err) => fmt::Display::fmt(err, f),
+            Error::CompiledTooBig(limit) => {
+                write!(
+                    f,
+                    "Compiled regex exceeds size limit of {} bytes.",
+                    limit
+                )
+            }
+            Error::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+// Debug is written by hand so that `Regex::new(...).unwrap()` shows syntax
+// errors in a readable form. That is slightly unusual, but the `Syntax`
+// variant already stores a `String`, so it might as well be laid out
+// nicely.
+impl fmt::Debug for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Error::Syntax(err) => {
+                // The message is framed by two horizontal rules of 79 '~'.
+                let hr: String = repeat('~').take(79).collect();
+                writeln!(f, "Syntax(")?;
+                for line in &[hr.as_str(), err.as_str(), hr.as_str()] {
+                    writeln!(f, "{}", line)?;
+                }
+                write!(f, ")")
+            }
+            Error::CompiledTooBig(limit) => {
+                let mut out = f.debug_tuple("CompiledTooBig");
+                out.field(limit);
+                out.finish()
+            }
+            Error::__Nonexhaustive => {
+                f.debug_tuple("__Nonexhaustive").finish()
+            }
+        }
+    }
+}
diff --git a/third_party/rust/regex/src/exec.rs b/third_party/rust/regex/src/exec.rs
new file mode 100644
index 0000000000..e75ca083a0
--- /dev/null
+++ b/third_party/rust/regex/src/exec.rs
@@ -0,0 +1,1655 @@
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::panic::AssertUnwindSafe;
+use std::sync::Arc;
+
+#[cfg(feature = "perf-literal")]
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use regex_syntax::hir::literal::Literals;
+use regex_syntax::hir::Hir;
+use regex_syntax::ParserBuilder;
+
+use crate::backtrack;
+use crate::compile::Compiler;
+#[cfg(feature = "perf-dfa")]
+use crate::dfa;
+use crate::error::Error;
+use crate::input::{ByteInput, CharInput};
+use crate::literal::LiteralSearcher;
+use crate::pikevm;
+use crate::pool::{Pool, PoolGuard};
+use crate::prog::Program;
+use crate::re_builder::RegexOptions;
+use crate::re_bytes;
+use crate::re_set;
+use crate::re_trait::{Locations, RegularExpression, Slot};
+use crate::re_unicode;
+use crate::utf8::next_utf8;
+
+/// `Exec` manages the execution of a regular expression.
+///
+/// In particular, this manages the various compiled forms of a single regular
+/// expression and the choice of which matching engine to use to execute a
+/// regular expression.
+#[derive(Debug)]
+pub struct Exec {
+ /// All read only state.
+ ro: Arc<ExecReadOnly>,
+ /// A pool of reusable values for the various matching engines.
+ ///
+ /// Note that boxing this value is not strictly necessary, but it is an
+ /// easy way to ensure that T does not bloat the stack size used by a pool
+ /// in the case where T is big. And this turns out to be the case at the
+ /// time of writing for regex's use of this pool. At the time of writing,
+ /// the size of a Regex on the stack is 856 bytes. Boxing this value
+ /// reduces that size to 16 bytes.
+ pool: Box<Pool<ProgramCache>>,
+}
+
+/// `ExecNoSync` is like `Exec`, except it holds a cache already taken from
+/// the pool. This means it is no longer Sync, but searches can now avoid the
+/// overhead of synchronization to fetch the cache.
+#[derive(Debug)]
+pub struct ExecNoSync<'c> {
+ /// All read only state.
+ ro: &'c Arc<ExecReadOnly>,
+ /// Caches for the various matching engines, held through a pool guard.
+ cache: PoolGuard<'c, ProgramCache>,
+}
+
+/// `ExecNoSyncStr` wraps an `ExecNoSync` and matches on &str instead of &[u8].
+#[derive(Debug)]
+pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
+
+/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
+/// state is determined at compile time and never changes during search.
+#[derive(Debug)]
+struct ExecReadOnly {
+ /// The original regular expressions given by the caller to compile.
+ res: Vec<String>,
+ /// A compiled program that is used in the NFA simulation and backtracking.
+ /// It can be byte-based or Unicode codepoint based.
+ ///
+ /// N.B. It is not possible to make this byte-based from the public API.
+ /// It is only used for testing byte based programs in the NFA simulations.
+ nfa: Program,
+ /// A compiled byte based program for DFA execution. This is only used
+ /// if a DFA can be executed. (Currently, only word boundary assertions are
+ /// not supported.) Note that this program contains an embedded `.*?`
+ /// preceding the first capture group, unless the regex is anchored at the
+ /// beginning.
+ dfa: Program,
+ /// The same as above, except the program is reversed (and there is no
+ /// preceding `.*?`). This is used by the DFA to find the starting location
+ /// of matches.
+ dfa_reverse: Program,
+ /// A set of suffix literals extracted from the regex.
+ ///
+ /// Prefix literals are stored on the `Program`, since they are used inside
+ /// the matching engines.
+ suffixes: LiteralSearcher,
+ /// An Aho-Corasick automaton with leftmost-first match semantics.
+ ///
+ /// This is only set when the entire regex is a simple unanchored
+ /// alternation of literals. We could probably use it in more circumstances,
+ /// but this is already hacky enough in this architecture.
+ ///
+ /// N.B. We use u32 as a state ID representation under the assumption that
+ /// if we were to exhaust the ID space, we probably would have long
+ /// surpassed the compilation size limit.
+ #[cfg(feature = "perf-literal")]
+ ac: Option<AhoCorasick<u32>>,
+ /// match_type encodes as much upfront knowledge about how we're going to
+ /// execute a search as possible.
+ match_type: MatchType,
+}
+
+/// Facilitates the construction of an executor by exposing various knobs
+/// to control how a regex is executed and what kinds of resources it's
+/// permitted to use.
+// `ExecBuilder` is only public via the `internal` module, so avoid deriving
+// `Debug`.
+#[allow(missing_debug_implementations)]
+pub struct ExecBuilder {
+ options: RegexOptions, // the patterns plus the parser and compiler settings
+ match_type: Option<MatchType>, // None: pick the matching engine automatically
+ bytes: bool, // compile byte based programs for the NFA engines
+ only_utf8: bool, // when true, compiled programs match only valid UTF-8
+}
+
+/// Parsed represents a set of parsed regular expressions and their detected
+/// literals.
+struct Parsed {
+ exprs: Vec<Hir>, // one parsed expression per pattern, in pattern order
+ prefixes: Literals, // prefix literals; empty when prefix optimizations are disabled
+ suffixes: Literals, // suffix literals; empty when suffix optimizations are disabled
+ bytes: bool, // true when some expression is not always valid UTF-8
+}
+
+impl ExecBuilder {
+ /// Starts a builder for a single regular expression.
+ ///
+ /// Everything except the pattern itself begins with its default
+ /// setting. Adjust further knobs through the builder methods, then call
+ /// `build` to obtain the executor.
+ pub fn new(re: &str) -> Self {
+     // A single pattern is just a set with one member.
+     Self::new_many(Some(re))
+ }
+
+ /// Like `new`, but for the union of several regular expressions.
+ ///
+ /// Note that capture groups are completely unsupported once 2 or more
+ /// regular expressions are compiled together. (This means both `find`
+ /// and `captures` won't work.)
+ pub fn new_many<I, S>(res: I) -> Self
+ where
+     S: AsRef<str>,
+     I: IntoIterator<Item = S>,
+ {
+     let mut opts = RegexOptions::default();
+     opts.pats =
+         res.into_iter().map(|pat| String::from(pat.as_ref())).collect();
+     Self::new_options(opts)
+ }
+
+ /// Starts a builder from a complete set of regex options.
+ ///
+ /// The engine choice starts out automatic, NFA programs are codepoint
+ /// based and only valid UTF-8 may be matched.
+ pub fn new_options(opts: RegexOptions) -> Self {
+     Self {
+         match_type: None,
+         bytes: false,
+         only_utf8: true,
+         options: opts,
+     }
+ }
+
+ /// Lets the matching engine be chosen automatically.
+ ///
+ /// This is the default; whatever optimizations are possible, such as
+ /// running a DFA, will be applied.
+ ///
+ /// Any earlier call to `nfa` or `bounded_backtracking` is overridden.
+ pub fn automatic(self) -> Self {
+     let mut builder = self;
+     builder.match_type = None;
+     builder
+ }
+
+ /// Forces the NFA algorithm (PikeVM), regardless of which optimizations
+ /// would otherwise be possible.
+ ///
+ /// Any earlier call to `automatic` or `bounded_backtracking` is
+ /// overridden.
+ pub fn nfa(self) -> Self {
+     let mut builder = self;
+     builder.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
+     builder
+ }
+
+ /// Forces the bounded backtracking engine, regardless of which
+ /// optimizations would otherwise be possible.
+ ///
+ /// Use with care: this engine needs memory proportional to
+ /// `len(regex) * len(text)`.
+ ///
+ /// Any earlier call to `automatic` or `nfa` is overridden.
+ pub fn bounded_backtracking(self) -> Self {
+     let mut builder = self;
+     builder.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
+     builder
+ }
+
+ /// Chooses whether the NFA matching engines get byte based programs.
+ ///
+ /// By default the NFA engines match on Unicode scalar values. Byte based
+ /// programs are generally slower because character classes are encoded
+ /// less efficiently.
+ ///
+ /// DFA matching engines are not affected; they always execute on bytes.
+ pub fn bytes(self, yes: bool) -> Self {
+     let mut builder = self;
+     builder.bytes = yes;
+     builder
+ }
+
+ /// Chooses whether compiled programs are limited to valid UTF-8.
+ ///
+ /// When enabled (the default), every compiled program exclusively
+ /// matches valid UTF-8 bytes. When disabled, a program may match
+ /// arbitrary bytes.
+ pub fn only_utf8(self, yes: bool) -> Self {
+     let mut builder = self;
+     builder.only_utf8 = yes;
+     builder
+ }
+
+ /// Turns the Unicode flag of the regex options on or off.
+ pub fn unicode(self, yes: bool) -> Self {
+     let mut builder = self;
+     builder.options.unicode = yes;
+     builder
+ }
+
+ /// Parse the current set of patterns into `Hir` expressions and extract literals.
+ fn parse(&self) -> Result<Parsed, Error> {
+ let mut exprs = Vec::with_capacity(self.options.pats.len());
+ let mut prefixes = Some(Literals::empty()); // None once prefix literals are ruled out
+ let mut suffixes = Some(Literals::empty()); // None once suffix literals are ruled out
+ let mut bytes = false; // set when any expression is not always valid UTF-8
+ let is_set = self.options.pats.len() > 1; // more than one pattern means a regex set
+ // Parse every pattern and fold its literals into the running sets. A
+ // regex set containing anchored expressions disables that literal kind.
+ for pat in &self.options.pats {
+ let mut parser = ParserBuilder::new()
+ .octal(self.options.octal)
+ .case_insensitive(self.options.case_insensitive)
+ .multi_line(self.options.multi_line)
+ .dot_matches_new_line(self.options.dot_matches_new_line)
+ .swap_greed(self.options.swap_greed)
+ .ignore_whitespace(self.options.ignore_whitespace)
+ .unicode(self.options.unicode)
+ .allow_invalid_utf8(!self.only_utf8)
+ .nest_limit(self.options.nest_limit)
+ .build();
+ let expr =
+ parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
+ bytes = bytes || !expr.is_always_utf8();
+
+ if cfg!(feature = "perf-literal") { // without this feature no literals are collected
+ if !expr.is_anchored_start() && expr.is_any_anchored_start() {
+ // Partial anchors unfortunately make it hard to use
+ // prefixes, so disable them.
+ prefixes = None;
+ } else if is_set && expr.is_anchored_start() {
+ // Regex sets with anchors do not go well with literal
+ // optimizations.
+ prefixes = None;
+ }
+ prefixes = prefixes.and_then(|mut prefixes| { // a failed union also disables prefixes
+ if !prefixes.union_prefixes(&expr) {
+ None
+ } else {
+ Some(prefixes)
+ }
+ });
+
+ if !expr.is_anchored_end() && expr.is_any_anchored_end() {
+ // Partial anchors unfortunately make it hard to use
+ // suffixes, so disable them.
+ suffixes = None;
+ } else if is_set && expr.is_anchored_end() {
+ // Regex sets with anchors do not go well with literal
+ // optimizations.
+ suffixes = None;
+ }
+ suffixes = suffixes.and_then(|mut suffixes| { // a failed union also disables suffixes
+ if !suffixes.union_suffixes(&expr) {
+ None
+ } else {
+ Some(suffixes)
+ }
+ });
+ }
+ exprs.push(expr);
+ }
+ Ok(Parsed {
+ exprs,
+ prefixes: prefixes.unwrap_or_else(Literals::empty), // disabled sets become empty
+ suffixes: suffixes.unwrap_or_else(Literals::empty), // disabled sets become empty
+ bytes,
+ })
+ }
+
+ /// Build an executor that can run a regular expression, or return the parse or compile error.
+ pub fn build(self) -> Result<Exec, Error> {
+ // Special case when we have no patterns to compile.
+ // This can happen when compiling a regex set.
+ if self.options.pats.is_empty() {
+ let ro = Arc::new(ExecReadOnly {
+ res: vec![],
+ nfa: Program::new(),
+ dfa: Program::new(),
+ dfa_reverse: Program::new(),
+ suffixes: LiteralSearcher::empty(),
+ #[cfg(feature = "perf-literal")]
+ ac: None,
+ match_type: MatchType::Nothing, // with no patterns, nothing can ever match
+ });
+ let pool = ExecReadOnly::new_pool(&ro);
+ return Ok(Exec { ro, pool });
+ }
+ let parsed = self.parse()?;
+ let mut nfa = Compiler::new() // program for the NFA engines
+ .size_limit(self.options.size_limit)
+ .bytes(self.bytes || parsed.bytes)
+ .only_utf8(self.only_utf8)
+ .compile(&parsed.exprs)?;
+ let mut dfa = Compiler::new() // forward program for the DFA
+ .size_limit(self.options.size_limit)
+ .dfa(true)
+ .only_utf8(self.only_utf8)
+ .compile(&parsed.exprs)?;
+ let mut dfa_reverse = Compiler::new() // reversed program for the DFA
+ .size_limit(self.options.size_limit)
+ .dfa(true)
+ .only_utf8(self.only_utf8)
+ .reverse(true)
+ .compile(&parsed.exprs)?;
+
+ #[cfg(feature = "perf-literal")]
+ let ac = self.build_aho_corasick(&parsed);
+ nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
+ dfa.prefixes = nfa.prefixes.clone(); // the reverse program is not given prefixes here
+ dfa.dfa_size_limit = self.options.dfa_size_limit;
+ dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
+
+ let mut ro = ExecReadOnly {
+ res: self.options.pats,
+ nfa,
+ dfa,
+ dfa_reverse,
+ suffixes: LiteralSearcher::suffixes(parsed.suffixes),
+ #[cfg(feature = "perf-literal")]
+ ac,
+ match_type: MatchType::Nothing, // placeholder, replaced on the next statement
+ };
+ ro.match_type = ro.choose_match_type(self.match_type);
+
+ let ro = Arc::new(ro);
+ let pool = ExecReadOnly::new_pool(&ro);
+ Ok(Exec { ro, pool })
+ }
+
+ /// Builds a leftmost-first Aho-Corasick automaton when the regex is a
+ /// single pattern that `alternation_literals` accepts and that yields
+ /// more than 32 literals. Returns None in every other case.
+ #[cfg(feature = "perf-literal")]
+ fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
+     if parsed.exprs.len() != 1 {
+         return None;
+     }
+     let lits = alternation_literals(&parsed.exprs[0])?;
+     // If we have a small number of literals, then let Teddy handle
+     // things (see literal/mod.rs).
+     if lits.len() <= 32 {
+         return None;
+     }
+     let automaton = AhoCorasickBuilder::new()
+         .match_kind(MatchKind::LeftmostFirst)
+         .auto_configure(&lits)
+         .build_with_size::<u32, _, _>(&lits)
+         // This should never happen because we'd long exceed the
+         // compilation limit for regexes first.
+         .expect("AC automaton too big");
+     Some(automaton)
+ }
+}
+
+impl<'c> RegularExpression for ExecNoSyncStr<'c> {
+    type Text = str;
+
+    /// Number of capture slots, as reported by the wrapped executor.
+    fn slots_len(&self) -> usize {
+        let inner = &self.0;
+        inner.slots_len()
+    }
+
+    /// Position at which to resume after an empty match at `i`, computed
+    /// by `next_utf8` on the text's bytes.
+    fn next_after_empty(&self, text: &str, i: usize) -> usize {
+        let bytes = text.as_bytes();
+        next_utf8(bytes, i)
+    }
+
+    /// Forwards to the wrapped executor, searching the bytes of `text`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
+        let bytes = text.as_bytes();
+        self.0.shortest_match_at(bytes, start)
+    }
+
+    /// Forwards to the wrapped executor, searching the bytes of `text`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn is_match_at(&self, text: &str, start: usize) -> bool {
+        let bytes = text.as_bytes();
+        self.0.is_match_at(bytes, start)
+    }
+
+    /// Forwards to the wrapped executor, searching the bytes of `text`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
+        let bytes = text.as_bytes();
+        self.0.find_at(bytes, start)
+    }
+
+    /// Forwards to the wrapped executor, searching the bytes of `text`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn captures_read_at(
+        &self,
+        locs: &mut Locations,
+        text: &str,
+        start: usize,
+    ) -> Option<(usize, usize)> {
+        let bytes = text.as_bytes();
+        self.0.captures_read_at(locs, bytes, start)
+    }
+}
+
+impl<'c> RegularExpression for ExecNoSync<'c> {
+ type Text = [u8];
+
+ /// Number of capture slots in the regular expression.
+ ///
+ /// Every capture group owns two slots, one for its (possibly empty)
+ /// start location and one for its end location.
+ fn slots_len(&self) -> usize {
+     let groups = self.ro.nfa.captures.len();
+     groups * 2
+ }
+
+ /// Position at which to resume after an empty match at `i`: for byte
+ /// text this is simply the next byte offset.
+ fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
+     let next = i + 1;
+     next
+ }
+
+ /// Returns the end of a match location, possibly occurring before the
+ /// end location of the correct leftmost-first match.
+ ///
+ /// The search starts at byte offset `start` of `text`. None is returned
+ /// when nothing matches. Whenever a DFA gives up (`Quit`), the search is
+ /// repeated with the NFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
+     if !self.is_anchor_end_match(text) {
+         return None;
+     }
+     match self.ro.match_type {
+         #[cfg(feature = "perf-literal")]
+         MatchType::Literal(ty) => {
+             self.find_literals(ty, text, start).map(|(_, e)| e)
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::Dfa | MatchType::DfaMany => {
+             match self.shortest_dfa(text, start) {
+                 dfa::Result::Match(end) => Some(end),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Quit => self.shortest_nfa(text, start),
+             }
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaAnchoredReverse => {
+             // The reverse DFA is handed only `text[start..]`, so its
+             // starting offset has to be the end of that sub-slice. Using
+             // `text.len()` would point past the sub-slice whenever
+             // `start > 0` and make the DFA index out of bounds.
+             let haystack = &text[start..];
+             match dfa::Fsm::reverse(
+                 &self.ro.dfa_reverse,
+                 self.cache.value(),
+                 true,
+                 haystack,
+                 haystack.len(),
+             ) {
+                 // The match end is reported relative to the full text.
+                 dfa::Result::Match(_) => Some(text.len()),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Quit => self.shortest_nfa(text, start),
+             }
+         }
+         #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+         MatchType::DfaSuffix => {
+             match self.shortest_dfa_reverse_suffix(text, start) {
+                 dfa::Result::Match(e) => Some(e),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Quit => self.shortest_nfa(text, start),
+             }
+         }
+         MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
+         MatchType::Nothing => None,
+     }
+ }
+
+ /// Returns true if and only if the regex matches text, searching from
+ /// byte offset `start`.
+ ///
+ /// For single regular expressions, this is equivalent to calling
+ /// shortest_match(...).is_some(). Whenever a DFA gives up (`Quit`), the
+ /// search is repeated with the NFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match_at(&self, text: &[u8], start: usize) -> bool {
+     if !self.is_anchor_end_match(text) {
+         return false;
+     }
+     // We need to do this dance because shortest_match relies on the NFA
+     // filling in captures[1], but a RegexSet has no captures. In other
+     // words, a RegexSet can't (currently) use shortest_match. ---AG
+     match self.ro.match_type {
+         #[cfg(feature = "perf-literal")]
+         MatchType::Literal(ty) => {
+             self.find_literals(ty, text, start).is_some()
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::Dfa | MatchType::DfaMany => {
+             match self.shortest_dfa(text, start) {
+                 dfa::Result::Match(_) => true,
+                 dfa::Result::NoMatch(_) => false,
+                 dfa::Result::Quit => self.match_nfa(text, start),
+             }
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaAnchoredReverse => {
+             // The reverse DFA is handed only `text[start..]`, so its
+             // starting offset has to be the end of that sub-slice. Using
+             // `text.len()` would point past the sub-slice whenever
+             // `start > 0` and make the DFA index out of bounds.
+             let haystack = &text[start..];
+             match dfa::Fsm::reverse(
+                 &self.ro.dfa_reverse,
+                 self.cache.value(),
+                 true,
+                 haystack,
+                 haystack.len(),
+             ) {
+                 dfa::Result::Match(_) => true,
+                 dfa::Result::NoMatch(_) => false,
+                 dfa::Result::Quit => self.match_nfa(text, start),
+             }
+         }
+         #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+         MatchType::DfaSuffix => {
+             match self.shortest_dfa_reverse_suffix(text, start) {
+                 dfa::Result::Match(_) => true,
+                 dfa::Result::NoMatch(_) => false,
+                 dfa::Result::Quit => self.match_nfa(text, start),
+             }
+         }
+         MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
+         MatchType::Nothing => false,
+     }
+ }
+
+ /// Finds the start and end location of the leftmost-first match, starting
+ /// at the given location. Returns `None` when no match is found.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
+ if !self.is_anchor_end_match(text) { // cheap rejection before any engine runs
+ return None;
+ }
+ match self.ro.match_type { // dispatch on the stored search strategy
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => self.find_literals(ty, text, start),
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa => match self.find_dfa_forward(text, start) {
+ dfa::Result::Match((s, e)) => Some((s, e)),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => { // the DFA gave up: retry with an NFA engine
+ self.find_nfa(MatchNfaType::Auto, text, start)
+ }
+ },
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ match self.find_dfa_anchored_reverse(text, start) {
+ dfa::Result::Match((s, e)) => Some((s, e)),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => { // the DFA gave up: retry with an NFA engine
+ self.find_nfa(MatchNfaType::Auto, text, start)
+ }
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.find_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match((s, e)) => Some((s, e)),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => { // the DFA gave up: retry with an NFA engine
+ self.find_nfa(MatchNfaType::Auto, text, start)
+ }
+ }
+ }
+ MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
+ MatchType::Nothing => None,
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaMany => { // DfaMany is only chosen for two or more regexes
+ unreachable!("BUG: RegexSet cannot be used with find")
+ }
+ }
+ }
+
+ /// Finds the start and end location of the leftmost-first match and also
+ /// fills in all matching capture groups.
+ ///
+ /// The number of capture slots given should be equal to the total number
+ /// of capture slots in the compiled program.
+ ///
+ /// Note that the first two slots always correspond to the start and end
+ /// locations of the overall match. All slots are reset to `None` first.
+ fn captures_read_at(
+ &self,
+ locs: &mut Locations,
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ let slots = locs.as_slots();
+ for slot in slots.iter_mut() { // clear results left over from a previous search
+ *slot = None;
+ }
+ // If the caller unnecessarily uses this, then we try to save them
+ // from themselves.
+ match slots.len() {
+ 0 => return self.find_at(text, start), // no slots: only the match span is wanted
+ 2 => { // only the overall match is wanted: use find_at and record its span
+ return self.find_at(text, start).map(|(s, e)| {
+ slots[0] = Some(s);
+ slots[1] = Some(e);
+ (s, e)
+ });
+ }
+ _ => {} // fallthrough
+ }
+ if !self.is_anchor_end_match(text) { // cheap rejection before any engine runs
+ return None;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => {
+ self.find_literals(ty, text, start).and_then(|(s, e)| { // run the NFA only over the span that was found
+ self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ )
+ })
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa => {
+ if self.ro.nfa.is_anchored_start { // start-anchored: go straight to the NFA, skipping the DFA
+ self.captures_nfa(slots, text, start)
+ } else {
+ match self.find_dfa_forward(text, start) {
+ dfa::Result::Match((s, e)) => self.captures_nfa_type( // NFA restricted to the DFA's match span
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ ),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => { // the DFA gave up: NFA over the whole remaining text
+ self.captures_nfa(slots, text, start)
+ }
+ }
+ }
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ match self.find_dfa_anchored_reverse(text, start) {
+ dfa::Result::Match((s, e)) => self.captures_nfa_type( // NFA restricted to the DFA's match span
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ ),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.captures_nfa(slots, text, start),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.find_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match((s, e)) => self.captures_nfa_type( // NFA restricted to the DFA's match span
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ ),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.captures_nfa(slots, text, start),
+ }
+ }
+ MatchType::Nfa(ty) => {
+ self.captures_nfa_type(ty, slots, text, start, text.len())
+ }
+ MatchType::Nothing => None,
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaMany => { // DfaMany is only chosen for two or more regexes
+ unreachable!("BUG: RegexSet cannot be used with captures")
+ }
+ }
+ }
+}
+
+impl<'c> ExecNoSync<'c> {
+ /// Finds the leftmost-first match in `text[start..]` using only literal search; returned offsets are relative to `text`.
+ #[cfg(feature = "perf-literal")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_literals(
+ &self,
+ ty: MatchLiteralType,
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ use self::MatchLiteralType::*;
+ match ty {
+ Unanchored => {
+ let lits = &self.ro.nfa.prefixes;
+ lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) // shift back to offsets in `text`
+ }
+ AnchoredStart => {
+ let lits = &self.ro.nfa.prefixes;
+ if start == 0 || !self.ro.nfa.is_anchored_start { // a start-anchored regex is only tried at offset 0
+ lits.find_start(&text[start..])
+ .map(|(s, e)| (start + s, start + e))
+ } else {
+ None
+ }
+ }
+ AnchoredEnd => {
+ let lits = &self.ro.suffixes; // end-anchored search uses the suffix literals
+ lits.find_end(&text[start..])
+ .map(|(s, e)| (start + s, start + e))
+ }
+ AhoCorasick => self
+ .ro
+ .ac
+ .as_ref()
+ .unwrap() // this variant requires `ac` to be Some (see MatchLiteralType::AhoCorasick)
+ .find(&text[start..])
+ .map(|m| (start + m.start(), start + m.end())),
+ }
+ }
+
+ /// Finds the leftmost-first match (start and end) using only the DFA: a
+ /// forward scan finds the end, then a reverse scan finds the start.
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_forward(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+ use crate::dfa::Result::*;
+ let end = match dfa::Fsm::forward(
+ &self.ro.dfa,
+ self.cache.value(),
+ false,
+ text,
+ start,
+ ) {
+ NoMatch(i) => return NoMatch(i),
+ Quit => return Quit,
+ Match(end) if start == end => return Match((start, start)), // empty match at `start`: no reverse scan needed
+ Match(end) => end,
+ };
+ // Now run the DFA in reverse to find the start of the match.
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..],
+ end - start, // match end, expressed relative to the sub-slice
+ ) {
+ Match(s) => Match((start + s, end)), // `s` is relative to `text[start..]`
+ NoMatch(i) => NoMatch(i),
+ Quit => Quit,
+ }
+ }
+
+ /// Finds the leftmost-first match (start and end) using only the DFA,
+ /// but assumes the regex is anchored at the end and therefore starts at
+ /// the end of the haystack and matches in reverse.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_anchored_reverse(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+ use crate::dfa::Result::*;
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..],
+ text.len() - start, // end of the sub-slice, i.e. the end of the haystack
+ ) {
+ Match(s) => Match((start + s, text.len())), // `s` is relative to `text[start..]`
+ NoMatch(i) => NoMatch(i),
+ Quit => Quit,
+ }
+ }
+
+ /// Finds the end of the shortest match using only the forward DFA, searching from `start`.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
+ dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) // unlike find_dfa_forward, the third argument is `true` here
+ }
+
+ /// Finds the end of the shortest match using only the DFA by scanning for
+ /// suffix literals, falling back to shortest_dfa if that scan gives up.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<usize> {
+ match self.exec_dfa_reverse_suffix(text, start) {
+ None => self.shortest_dfa(text, start), // the suffix scan gave up: plain forward DFA
+ Some(r) => r.map(|(_, end)| end), // keep only the end offset
+ }
+ }
+
+ /// Finds the end of the shortest match using only the DFA by scanning for
+ /// suffix literals. It also reports the start of the match.
+ ///
+ /// Note that if None is returned, then the optimization gave up to avoid
+ /// worst case quadratic behavior. A forward scanning DFA should be tried
+ /// next.
+ ///
+ /// If a match is returned and the full leftmost-first match is desired,
+ /// then a forward scan starting from the beginning of the match must be
+ /// done.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ original_start: usize,
+ ) -> Option<dfa::Result<(usize, usize)>> {
+ use crate::dfa::Result::*;
+
+ let lcs = self.ro.suffixes.lcs();
+ debug_assert!(lcs.len() >= 1);
+ let mut start = original_start; // left bound handed to the reverse scan
+ let mut end = start;
+ let mut last_literal = start; // offset from which the next literal search begins
+ while end <= text.len() {
+ last_literal += match lcs.find(&text[last_literal..]) {
+ None => return Some(NoMatch(text.len())), // no further occurrence of the suffix literal
+ Some(i) => i,
+ };
+ end = last_literal + lcs.len(); // candidate match end: just past the literal
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..end],
+ end - start,
+ ) {
+ Match(0) | NoMatch(0) => return None, // give up; the caller falls back to a forward DFA scan
+ Match(i) => return Some(Match((start + i, end))), // `i` is relative to `text[start..end]`
+ NoMatch(i) => {
+ start += i; // move the left bound forward by the reported offset
+ last_literal += 1; // resume the literal search one byte further on
+ continue;
+ }
+ Quit => return Some(Quit),
+ };
+ }
+ Some(NoMatch(text.len()))
+ }
+
+ /// Finds the leftmost-first match (start and end) using only the DFA
+ /// by scanning for suffix literals.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+ use crate::dfa::Result::*;
+
+ let match_start = match self.exec_dfa_reverse_suffix(text, start) {
+ None => return self.find_dfa_forward(text, start), // the suffix scan gave up: plain forward DFA
+ Some(Match((start, _))) => start, // keep the start only; the end is recomputed below
+ Some(r) => return r, // NoMatch or Quit is passed through unchanged
+ };
+ // At this point, we've found a match. The only way to quit now
+ // without a match is if the DFA gives up (seems unlikely).
+ //
+ // Now run the DFA forwards to find the proper end of the match.
+ // (The suffix literal match can only indicate the earliest
+ // possible end location, which may appear before the end of the
+ // leftmost-first match.)
+ match dfa::Fsm::forward(
+ &self.ro.dfa,
+ self.cache.value(),
+ false,
+ text,
+ match_start,
+ ) {
+ NoMatch(_) => panic!("BUG: reverse match implies forward match"),
+ Quit => Quit,
+ Match(e) => Match((match_start, e)),
+ }
+ }
+
+ /// Executes the NFA engine (with automatic engine choice) to return whether there is a match or not.
+ ///
+ /// Ideally, we could use shortest_nfa(...).is_some() and get the same
+ /// performance characteristics, but regex sets don't have captures, which
+ /// shortest_nfa depends on.
+ #[cfg(feature = "perf-dfa")]
+ fn match_nfa(&self, text: &[u8], start: usize) -> bool {
+ self.match_nfa_type(MatchNfaType::Auto, text, start)
+ }
+
+ /// Like match_nfa, but allows specification of the type of NFA engine. No capture slots are requested.
+ fn match_nfa_type(
+ &self,
+ ty: MatchNfaType,
+ text: &[u8],
+ start: usize,
+ ) -> bool {
+ self.exec_nfa(
+ ty,
+ &mut [false], // matches: a single throwaway flag
+ &mut [], // slots: none
+ true, // quit_after_match
+ false, // quit_after_match_with_pos
+ text,
+ start,
+ text.len(),
+ )
+ }
+
+ /// Finds the end of the shortest match using an NFA, with automatic engine choice.
+ #[cfg(feature = "perf-dfa")]
+ fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
+ self.shortest_nfa_type(MatchNfaType::Auto, text, start)
+ }
+
+ /// Like shortest_nfa, but allows specification of the type of NFA engine. Returns the value left in slot 1.
+ fn shortest_nfa_type(
+ &self,
+ ty: MatchNfaType,
+ text: &[u8],
+ start: usize,
+ ) -> Option<usize> {
+ let mut slots = [None, None]; // only the overall match start/end slots
+ if self.exec_nfa(
+ ty,
+ &mut [false], // matches: a single throwaway flag
+ &mut slots,
+ true, // quit_after_match
+ true, // quit_after_match_with_pos: makes exec_nfa use the PikeVM
+ text,
+ start,
+ text.len(),
+ ) {
+ slots[1] // end of the match
+ } else {
+ None
+ }
+ }
+
+ /// Like find, but executes an NFA engine. Only the overall match start and end are recorded.
+ fn find_nfa(
+ &self,
+ ty: MatchNfaType,
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ let mut slots = [None, None]; // only the overall match start/end slots
+ if self.exec_nfa(
+ ty,
+ &mut [false], // matches: a single throwaway flag
+ &mut slots,
+ false, // quit_after_match
+ false, // quit_after_match_with_pos
+ text,
+ start,
+ text.len(),
+ ) {
+ match (slots[0], slots[1]) {
+ (Some(s), Some(e)) => Some((s, e)),
+ _ => None, // a match was reported but a slot was left unset
+ }
+ } else {
+ None
+ }
+ }
+
+ /// Like find_nfa, but fills in captures, searching up to `text.len()` with automatic engine choice.
+ ///
+ /// `slots` should have length equal to `2 * nfa.captures.len()`.
+ #[cfg(feature = "perf-dfa")]
+ fn captures_nfa(
+ &self,
+ slots: &mut [Slot],
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ start,
+ text.len(),
+ )
+ }
+
+ /// Like captures_nfa, but allows specification of type of NFA engine and of the `end` bound passed to it.
+ fn captures_nfa_type(
+ &self,
+ ty: MatchNfaType,
+ slots: &mut [Slot],
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<(usize, usize)> {
+ if self.exec_nfa(
+ ty,
+ &mut [false], // matches: a single throwaway flag
+ slots,
+ false, // quit_after_match
+ false, // quit_after_match_with_pos
+ text,
+ start,
+ end,
+ ) {
+ match (slots[0], slots[1]) { // NOTE(review): indexing assumes at least two slots; confirm against callers
+ (Some(s), Some(e)) => Some((s, e)),
+ _ => None,
+ }
+ } else {
+ None
+ }
+ }
+
+ fn exec_nfa( // common entry point for both NFA engines: resolves `ty`, then calls the PikeVM or the backtracker
+ &self,
+ mut ty: MatchNfaType,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ quit_after_match: bool,
+ quit_after_match_with_pos: bool,
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ use self::MatchNfaType::*;
+ if let Auto = ty { // resolve Auto to a concrete engine
+ if backtrack::should_exec(self.ro.nfa.len(), text.len()) { // decided from program size and haystack length
+ ty = Backtrack;
+ } else {
+ ty = PikeVM;
+ }
+ }
+ // The backtracker can't return the shortest match position as it is
+ // implemented today. So if someone calls `shortest_match` and we need
+ // to run an NFA, then use the PikeVM.
+ if quit_after_match_with_pos || ty == PikeVM {
+ self.exec_pikevm(
+ matches,
+ slots,
+ quit_after_match,
+ text,
+ start,
+ end,
+ )
+ } else {
+ self.exec_backtrack(matches, slots, text, start, end) // note: quit_after_match is not passed to the backtracker
+ }
+ }
+
+ /// Always run the PikeVM NFA algorithm, over byte input or char input depending on the program.
+ fn exec_pikevm(
+ &self,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ quit_after_match: bool,
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ if self.ro.nfa.uses_bytes() { // the program's own flag picks the input type
+ pikevm::Fsm::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ quit_after_match,
+ ByteInput::new(text, self.ro.nfa.only_utf8),
+ start,
+ end,
+ )
+ } else {
+ pikevm::Fsm::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ quit_after_match,
+ CharInput::new(text),
+ start,
+ end,
+ )
+ }
+ }
+
+ /// Always runs the NFA using bounded backtracking, over byte input or char input depending on the program.
+ fn exec_backtrack(
+ &self,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ if self.ro.nfa.uses_bytes() { // the program's own flag picks the input type
+ backtrack::Bounded::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ ByteInput::new(text, self.ro.nfa.only_utf8),
+ start,
+ end,
+ )
+ } else {
+ backtrack::Bounded::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ CharInput::new(text),
+ start,
+ end,
+ )
+ }
+ }
+
+ /// Finds which regular expressions match the given text.
+ ///
+ /// `matches` should have length equal to the number of regexes being
+ /// searched.
+ ///
+ /// This is only useful when one wants to know which regexes in a set
+ /// match some text. The return value is the selected engine's overall result.
+ pub fn many_matches_at(
+ &self,
+ matches: &mut [bool],
+ text: &[u8],
+ start: usize,
+ ) -> bool {
+ use self::MatchType::*;
+ if !self.is_anchor_end_match(text) { // cheap rejection before any engine runs
+ return false;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ Literal(ty) => {
+ debug_assert_eq!(matches.len(), 1); // literal strategies are only chosen for a single regex
+ matches[0] = self.find_literals(ty, text, start).is_some();
+ matches[0]
+ }
+ #[cfg(feature = "perf-dfa")]
+ Dfa | DfaAnchoredReverse | DfaMany => {
+ match dfa::Fsm::forward_many(
+ &self.ro.dfa,
+ self.cache.value(),
+ matches,
+ text,
+ start,
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.exec_nfa( // the DFA gave up: retry with an NFA engine
+ MatchNfaType::Auto,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ DfaSuffix => { // same body as the arm above; kept separate because of its different cfg gate
+ match dfa::Fsm::forward_many(
+ &self.ro.dfa,
+ self.cache.value(),
+ matches,
+ text,
+ start,
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.exec_nfa( // the DFA gave up: retry with an NFA engine
+ MatchNfaType::Auto,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ }
+ }
+ Nfa(ty) => self.exec_nfa(
+ ty,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ Nothing => false,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_anchor_end_match(&self, text: &[u8]) -> bool { // returns false only when a match can be ruled out cheaply
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { // without literal support nothing can be ruled out
+ true
+ }
+
+ #[cfg(feature = "perf-literal")]
+ fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
+ // Only do this check if the haystack is big (>1MB).
+ if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
+ let lcs = ro.suffixes.lcs();
+ if lcs.len() >= 1 && !lcs.is_suffix(text) { // end-anchored regex whose `lcs` literal is not a suffix of the haystack
+ return false;
+ }
+ }
+ true
+ }
+
+ imp(&self.ro, text)
+ }
+
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { // shared map from capture group name to group position
+ &self.ro.nfa.capture_name_idx
+ }
+}
+
+impl<'c> ExecNoSyncStr<'c> {
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ self.0.capture_name_idx() // delegate to the wrapped ExecNoSync
+ }
+}
+
+impl Exec {
+ /// Get a searcher that isn't Sync. It borrows the read-only state and takes a cache from the pool.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn searcher(&self) -> ExecNoSync<'_> {
+ ExecNoSync {
+ ro: &self.ro, // a clone is too expensive here! (and not needed)
+ cache: self.pool.get(),
+ }
+ }
+
+ /// Get a searcher that isn't Sync and can match on &str.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
+ ExecNoSyncStr(self.searcher()) // thin wrapper around the byte-oriented searcher
+ }
+
+ /// Build a Regex from this executor.
+ pub fn into_regex(self) -> re_unicode::Regex {
+ re_unicode::Regex::from(self)
+ }
+
+ /// Build a RegexSet from this executor.
+ pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
+ re_set::unicode::RegexSet::from(self)
+ }
+
+ /// Build a Regex from this executor that can match arbitrary bytes.
+ pub fn into_byte_regex(self) -> re_bytes::Regex {
+ re_bytes::Regex::from(self)
+ }
+
+ /// Build a RegexSet from this executor that can match arbitrary bytes.
+ pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
+ re_set::bytes::RegexSet::from(self)
+ }
+
+ /// The original regular expressions given by the caller that were
+ /// compiled.
+ pub fn regex_strings(&self) -> &[String] {
+ &self.ro.res
+ }
+
+ /// Return a slice of capture names.
+ ///
+ /// Any capture that isn't named is None.
+ pub fn capture_names(&self) -> &[Option<String>] {
+ &self.ro.nfa.captures
+ }
+
+ /// Return a reference to the named-groups mapping (from group name to
+ /// group position).
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ &self.ro.nfa.capture_name_idx
+ }
+}
+
+impl Clone for Exec {
+ fn clone(&self) -> Exec {
+ let pool = ExecReadOnly::new_pool(&self.ro); // the clone gets its own new pool instead of sharing this one
+ Exec { ro: self.ro.clone(), pool } // `ro` is handed to new_pool as an `Arc`, so this clones the handle
+ }
+}
+
+impl ExecReadOnly {
+ fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { // picks the search strategy: NFA hint, then literal, then DFA, then NFA
+ if let Some(MatchType::Nfa(_)) = hint { // an explicit NFA hint is used as given
+ return hint.unwrap();
+ }
+ // If the NFA is empty, then we'll never match anything.
+ if self.nfa.insts.is_empty() {
+ return MatchType::Nothing;
+ }
+ if let Some(literalty) = self.choose_literal_match_type() {
+ return literalty;
+ }
+ if let Some(dfaty) = self.choose_dfa_match_type() {
+ return dfaty;
+ }
+ // Neither a literal nor a DFA strategy applies, so fall back to an NFA.
+ MatchType::Nfa(MatchNfaType::Auto)
+ }
+
+ /// If a plain literal scan can be used, then a corresponding literal
+ /// search type is returned. Always `None` without the `perf-literal` feature.
+ fn choose_literal_match_type(&self) -> Option<MatchType> {
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &ExecReadOnly) -> Option<MatchType> {
+ None
+ }
+
+ #[cfg(feature = "perf-literal")]
+ fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
+ // If our set of prefixes is complete, then we can use it to find
+ // a match in lieu of a regex engine. This doesn't quite work well
+ // in the presence of multiple regexes, so only do it when there's
+ // one.
+ //
+ // TODO(burntsushi): Also, don't try to match literals if the regex
+ // is partially anchored. We could technically do it, but we'd need
+ // to create two sets of literals: all of them and then the subset
+ // that aren't anchored. We would then only search for all of them
+ // when at the beginning of the input and use the subset in all
+ // other cases.
+ if ro.res.len() != 1 {
+ return None;
+ }
+ if ro.ac.is_some() { // an Aho-Corasick automaton, when present, takes priority
+ return Some(MatchType::Literal(
+ MatchLiteralType::AhoCorasick,
+ ));
+ }
+ if ro.nfa.prefixes.complete() {
+ return if ro.nfa.is_anchored_start {
+ Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
+ } else {
+ Some(MatchType::Literal(MatchLiteralType::Unanchored))
+ };
+ }
+ if ro.suffixes.complete() {
+ return if ro.nfa.is_anchored_end {
+ Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
+ } else {
+ // This case shouldn't happen. When the regex isn't
+ // anchored, then complete prefixes should imply complete
+ // suffixes.
+ Some(MatchType::Literal(MatchLiteralType::Unanchored))
+ };
+ }
+ None
+ }
+
+ imp(self)
+ }
+
+ /// If a DFA scan can be used, then choose the appropriate DFA strategy. Always `None` without the `perf-dfa` feature.
+ fn choose_dfa_match_type(&self) -> Option<MatchType> {
+ #[cfg(not(feature = "perf-dfa"))]
+ fn imp(_: &ExecReadOnly) -> Option<MatchType> {
+ None
+ }
+
+ #[cfg(feature = "perf-dfa")]
+ fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
+ if !dfa::can_exec(&ro.dfa) { // the DFA cannot run this program at all
+ return None;
+ }
+ // Regex sets require a slightly specialized path.
+ if ro.res.len() >= 2 {
+ return Some(MatchType::DfaMany);
+ }
+ // If the regex is anchored at the end but not the start, then
+ // just match in reverse from the end of the haystack.
+ if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
+ return Some(MatchType::DfaAnchoredReverse);
+ }
+ #[cfg(feature = "perf-literal")]
+ {
+ // If there's a longish suffix literal, then it might be faster
+ // to look for that first.
+ if ro.should_suffix_scan() {
+ return Some(MatchType::DfaSuffix);
+ }
+ }
+ // Fall back to your garden variety forward searching lazy DFA.
+ Some(MatchType::Dfa)
+ }
+
+ imp(self)
+ }
+
+ /// Returns true if the program is amenable to suffix scanning.
+ ///
+ /// When this is true, as a heuristic, we assume it is OK to quickly scan
+ /// for suffix literals and then do a *reverse* DFA match from any matches
+ /// produced by the literal scan. (And then followed by a forward DFA
+ /// search, since the previously found suffix literal may not actually be
+ /// the end of a match.)
+ ///
+ /// This is a bit of a specialized optimization, but can result in pretty
+ /// big performance wins if 1) there are no prefix literals and 2) the
+ /// suffix literals are pretty rare in the text. (1) is obviously easy to
+ /// account for but (2) is harder. As a proxy, we assume that longer
+ /// strings are generally rarer, so we only enable this optimization when
+ /// we have a meaty suffix.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ fn should_suffix_scan(&self) -> bool {
+ if self.suffixes.is_empty() {
+ return false;
+ }
+ let lcs_len = self.suffixes.lcs().char_len();
+ lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() // "meaty": at least 3 chars and longer than the prefix literals' `lcp`
+ }
+
+ fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { // builds a pool whose factory closure creates new program caches
+ let ro = ro.clone(); // the closure below takes ownership of its own Arc handle
+ Box::new(Pool::new(Box::new(move || {
+ AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
+ })))
+ }
+}
+
+#[derive(Clone, Copy, Debug)]
+enum MatchType { // the search strategy for an executor, picked by choose_match_type
+ /// A single or multiple literal search. This is only used when the regex
+ /// can be decomposed into a literal search.
+ #[cfg(feature = "perf-literal")]
+ Literal(MatchLiteralType),
+ /// A normal (forward) DFA search.
+ #[cfg(feature = "perf-dfa")]
+ Dfa,
+ /// A reverse DFA search starting from the end of a haystack.
+ #[cfg(feature = "perf-dfa")]
+ DfaAnchoredReverse,
+ /// A reverse DFA search with suffix literal scanning.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ DfaSuffix,
+ /// Use the DFA on two or more regular expressions.
+ #[cfg(feature = "perf-dfa")]
+ DfaMany,
+ /// An NFA variant.
+ Nfa(MatchNfaType),
+ /// No match is ever possible, so don't ever try to search.
+ Nothing,
+}
+
+#[derive(Clone, Copy, Debug)]
+#[cfg(feature = "perf-literal")]
+enum MatchLiteralType { // which literal searcher find_literals uses
+ /// Match literals anywhere in text.
+ Unanchored,
+ /// Match literals only at the start of text.
+ AnchoredStart,
+ /// Match literals only at the end of text.
+ AnchoredEnd,
+ /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
+ /// ExecReadOnly; find_literals unwraps it.
+ AhoCorasick,
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum MatchNfaType {
+ /// Choose between Backtrack and PikeVM (resolved inside exec_nfa).
+ Auto,
+ /// NFA bounded backtracking.
+ ///
+ /// (This is only set by tests, since it never makes sense to always want
+ /// backtracking.)
+ Backtrack,
+ /// The Pike VM.
+ ///
+ /// (This is only set by tests, since it never makes sense to always want
+ /// the Pike VM.)
+ PikeVM,
+}
+
+/// `ProgramCache` maintains reusable allocations for each matching engine
+/// available to a particular program.
+///
+/// We declare this as unwind safe since it's a cache that's only used for
+/// performance purposes. If a panic occurs, it is (or should be) always safe
+/// to continue using the same regex object afterwards.
+pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
+
+#[derive(Debug)]
+pub struct ProgramCacheInner { // one cache per matching engine
+ pub pikevm: pikevm::Cache, // built from the NFA program
+ pub backtrack: backtrack::Cache, // built from the NFA program
+ #[cfg(feature = "perf-dfa")]
+ pub dfa: dfa::Cache, // built from the forward DFA program
+ #[cfg(feature = "perf-dfa")]
+ pub dfa_reverse: dfa::Cache, // built from the reverse DFA program
+}
+
+impl ProgramCacheInner {
+ fn new(ro: &ExecReadOnly) -> Self { // builds a new cache for every engine from the executor's programs
+ ProgramCacheInner {
+ pikevm: pikevm::Cache::new(&ro.nfa),
+ backtrack: backtrack::Cache::new(&ro.nfa),
+ #[cfg(feature = "perf-dfa")]
+ dfa: dfa::Cache::new(&ro.dfa),
+ #[cfg(feature = "perf-dfa")]
+ dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
+ }
+ }
+}
+
+/// Alternation literals checks if the given HIR is a simple alternation of
+/// literals, and if so, returns them as byte strings. Otherwise, this returns None.
+#[cfg(feature = "perf-literal")]
+fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
+ use regex_syntax::hir::{HirKind, Literal};
+
+ // This is pretty hacky, but basically, if `is_alternation_literal` is
+ // true, then we can make several assumptions about the structure of our
+ // HIR. This is what justifies the `unreachable!` statements below.
+ //
+ // This code should be refactored once we overhaul this crate's
+ // optimization pipeline, because this is a terribly inflexible way to go
+ // about things.
+
+ if !expr.is_alternation_literal() {
+ return None;
+ }
+ let alts = match *expr.kind() {
+ HirKind::Alternation(ref alts) => alts,
+ _ => return None, // one literal isn't worth it
+ };
+
+ let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { // appends one literal's bytes to `dst`
+ Literal::Unicode(c) => {
+ let mut buf = [0; 4]; // encode_utf8 needs at most 4 bytes for a char
+ dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
+ }
+ Literal::Byte(b) => {
+ dst.push(b);
+ }
+ };
+
+ let mut lits = vec![];
+ for alt in alts {
+ let mut lit = vec![]; // bytes of this alternate
+ match *alt.kind() {
+ HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ HirKind::Concat(ref exprs) => { // a concatenation of literals becomes one byte string
+ for e in exprs {
+ match *e.kind() {
+ HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ _ => unreachable!("expected literal, got {:?}", e),
+ }
+ }
+ }
+ _ => unreachable!("expected literal or concat, got {:?}", alt),
+ }
+ lits.push(lit);
+ }
+ Some(lits)
+}
+
+#[cfg(test)]
+mod test {
+ #[test]
+ fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { // bounded backtracker and default engine must split bytes identically
+ use crate::internal::ExecBuilder;
+
+ let backtrack_bytes_re = ExecBuilder::new("^S")
+ .bounded_backtracking()
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let default_bytes_re = ExecBuilder::new("^S")
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let input = vec![83, 83]; // the bytes of "SS"
+
+ let s1 = backtrack_bytes_re.split(&input);
+ let s2 = default_bytes_re.split(&input);
+ for (chunk1, chunk2) in s1.zip(s2) { // NOTE(review): zip stops at the shorter iterator, so a differing chunk count goes unnoticed
+ assert_eq!(chunk1, chunk2);
+ }
+ }
+
+ #[test]
+ fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { // same comparison, on a &str haystack with `.bytes(true)`
+ use crate::internal::ExecBuilder;
+
+ let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
+ .bounded_backtracking()
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let input = "**";
+
+ let s1 = backtrack_bytes_re.split(input);
+ let s2 = default_bytes_re.split(input);
+ for (chunk1, chunk2) in s1.zip(s2) { // NOTE(review): zip stops at the shorter iterator, so a differing chunk count goes unnoticed
+ assert_eq!(chunk1, chunk2);
+ }
+ }
+}
diff --git a/third_party/rust/regex/src/expand.rs b/third_party/rust/regex/src/expand.rs
new file mode 100644
index 0000000000..67b514926a
--- /dev/null
+++ b/third_party/rust/regex/src/expand.rs
@@ -0,0 +1,239 @@
+use std::str;
+
+use crate::find_byte::find_byte;
+
+use crate::re_bytes;
+use crate::re_unicode;
+
+/// Expands the `replacement` template into `dst`, substituting each capture
+/// reference (`$1`, `$name`, `${name}`) with the text of the corresponding
+/// group in `caps`.
+///
+/// `$$` produces a literal `$`. A `$` that does not introduce a valid
+/// reference is copied as-is, and a reference to a group that is missing or
+/// did not participate in the match expands to the empty string.
+pub fn expand_str(
+    caps: &re_unicode::Captures<'_>,
+    mut replacement: &str,
+    dst: &mut String,
+) {
+    // Each iteration consumes the template up to and including the next `$`
+    // construct; a `$`-free (or empty) remainder ends the loop.
+    while let Some(dollar) = find_byte(b'$', replacement.as_bytes()) {
+        let (literal, rest) = replacement.split_at(dollar);
+        dst.push_str(literal);
+        replacement = rest;
+
+        // `$$` is an escaped dollar sign.
+        if replacement.as_bytes().get(1) == Some(&b'$') {
+            dst.push_str("$");
+            replacement = &replacement[2..];
+            continue;
+        }
+        debug_assert!(!replacement.is_empty());
+        match find_cap_ref(replacement.as_bytes()) {
+            None => {
+                // Not a capture reference: keep the `$` literally.
+                dst.push_str("$");
+                replacement = &replacement[1..];
+            }
+            Some(CaptureRef { cap, end }) => {
+                replacement = &replacement[end..];
+                let found = match cap {
+                    Ref::Number(idx) => caps.get(idx),
+                    Ref::Named(name) => caps.name(name),
+                };
+                dst.push_str(found.map_or("", |m| m.as_str()));
+            }
+        }
+    }
+    // Whatever follows the last `$` construct is plain text.
+    dst.push_str(replacement);
+}
+
+/// Byte-oriented counterpart of `expand_str`: expands the `replacement`
+/// template into `dst`, substituting each capture reference with the bytes
+/// of the corresponding group in `caps`.
+///
+/// `$$` produces a literal `$`. A `$` that does not introduce a valid
+/// reference is copied as-is, and a reference to a group that is missing or
+/// did not participate in the match expands to nothing.
+pub fn expand_bytes(
+    caps: &re_bytes::Captures<'_>,
+    mut replacement: &[u8],
+    dst: &mut Vec<u8>,
+) {
+    // Each iteration consumes the template up to and including the next `$`
+    // construct; a `$`-free (or empty) remainder ends the loop.
+    while let Some(dollar) = find_byte(b'$', replacement) {
+        let (literal, rest) = replacement.split_at(dollar);
+        dst.extend_from_slice(literal);
+        replacement = rest;
+
+        // `$$` is an escaped dollar sign.
+        if replacement.get(1) == Some(&b'$') {
+            dst.push(b'$');
+            replacement = &replacement[2..];
+            continue;
+        }
+        debug_assert!(!replacement.is_empty());
+        match find_cap_ref(replacement) {
+            None => {
+                // Not a capture reference: keep the `$` literally.
+                dst.push(b'$');
+                replacement = &replacement[1..];
+            }
+            Some(CaptureRef { cap, end }) => {
+                replacement = &replacement[end..];
+                let found = match cap {
+                    Ref::Number(idx) => caps.get(idx),
+                    Ref::Named(name) => caps.name(name),
+                };
+                dst.extend_from_slice(
+                    found.map_or(&b""[..], |m| m.as_bytes()),
+                );
+            }
+        }
+    }
+    // Whatever follows the last `$` construct is plain bytes.
+    dst.extend_from_slice(replacement);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text following the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+ /// The group being referred to, by name or by index.
+ cap: Ref<'a>,
+ /// Byte offset just past the reference, counted from the leading `$`
+ /// (e.g. `4` for `$foo`, `6` for `${foo}`).
+ end: usize,
+}
+
+/// A reference to a capture group in some text.
+///
+/// e.g., `$2`, `$foo`, `${foo}`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Ref<'a> {
+ /// A reference by name; the name borrows from the replacement text.
+ Named(&'a str),
+ /// A reference by group index, used when the reference text parses as
+ /// an unsigned integer.
+ Number(usize),
+}
+
+/// Wraps a group name as a named capture reference.
+impl<'a> From<&'a str> for Ref<'a> {
+    fn from(name: &'a str) -> Self {
+        Ref::Named(name)
+    }
+}
+
+/// Wraps a group index as a numbered capture reference.
+impl From<usize> for Ref<'static> {
+    fn from(index: usize) -> Self {
+        Ref::Number(index)
+    }
+}
+
+/// Attempts to parse a capture group reference located at the very start of
+/// `replacement`.
+///
+/// The text must begin with `$` followed by either a braced name (`${...}`)
+/// or a non-empty run of ASCII letters, digits and underscores. Returns
+/// `None` when no valid reference starts at offset zero.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
+    // A reference needs the `$` sigil plus at least one more byte.
+    if replacement.len() < 2 || replacement[0] != b'$' {
+        return None;
+    }
+    if replacement[1] == b'{' {
+        return find_cap_ref_braced(replacement, 2);
+    }
+    let name_len = replacement[1..]
+        .iter()
+        .take_while(|&&b| is_valid_cap_letter(b))
+        .count();
+    if name_len == 0 {
+        return None;
+    }
+    let cap_end = 1 + name_len;
+    // Every byte accepted above is ASCII, so the name is necessarily valid
+    // UTF-8 and this conversion cannot fail.
+    let name = str::from_utf8(&replacement[1..cap_end])
+        .expect("valid UTF-8 capture name");
+    // An all-digit name is a group index; anything else is a group name.
+    let cap = match name.parse::<u32>() {
+        Ok(index) => Ref::Number(index as usize),
+        Err(_) => Ref::Named(name),
+    };
+    Some(CaptureRef { cap, end: cap_end })
+}
+
+/// Parses the remainder of a braced capture reference (`${...}`), where `i`
+/// is the offset of the first byte after the opening `{`.
+///
+/// Returns `None` if there is no closing `}` or if the bytes between the
+/// braces are not valid UTF-8.
+fn find_cap_ref_braced(rep: &[u8], i: usize) -> Option<CaptureRef<'_>> {
+    let start = i;
+    // Locate the closing brace relative to `start`; an out-of-range start or
+    // a missing brace both mean there is no reference here.
+    let close = start + rep.get(start..)?.iter().position(|&b| b == b'}')?;
+    // Braced names are otherwise unrestricted, so they may contain invalid
+    // UTF-8. A real capture group name never does, so reject such names.
+    let name = str::from_utf8(&rep[start..close]).ok()?;
+    // An all-digit name is a group index; anything else is a group name.
+    let cap = match name.parse::<u32>() {
+        Ok(index) => Ref::Number(index as usize),
+        Err(_) => Ref::Named(name),
+    };
+    Some(CaptureRef { cap, end: close + 1 })
+}
+
+/// Reports whether `b` may appear in an unbraced capture name, i.e. whether
+/// it is an ASCII letter, an ASCII digit or an underscore.
+fn is_valid_cap_letter(b: u8) -> bool {
+    b == b'_' || b.is_ascii_alphanumeric()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{find_cap_ref, CaptureRef};
+
+ // Generates one `#[test]` per invocation. The two-argument form asserts
+ // that no capture reference is found at the start of `$text`; the
+ // three-argument form asserts that `$capref` is found.
+ macro_rules! find {
+ ($name:ident, $text:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(None, find_cap_ref($text.as_bytes()));
+ }
+ };
+ ($name:ident, $text:expr, $capref:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
+ }
+ };
+ }
+
+ // Shorthand for an expected `CaptureRef`: a name (`&str`) or a group
+ // number (`usize`) converted via `From`, plus the expected end offset.
+ macro_rules! c {
+ ($name_or_number:expr, $pos:expr) => {
+ CaptureRef { cap: $name_or_number.into(), end: $pos }
+ };
+ }
+
+ find!(find_cap_ref1, "$foo", c!("foo", 4));
+ find!(find_cap_ref2, "${foo}", c!("foo", 6));
+ find!(find_cap_ref3, "$0", c!(0, 2));
+ find!(find_cap_ref4, "$5", c!(5, 2));
+ find!(find_cap_ref5, "$10", c!(10, 3));
+ // See https://github.com/rust-lang/regex/pull/585
+ // for more on characters following numbers
+ find!(find_cap_ref6, "$42a", c!("42a", 4));
+ find!(find_cap_ref7, "${42}a", c!(42, 5));
+ // An unterminated brace is not a reference.
+ find!(find_cap_ref8, "${42");
+ find!(find_cap_ref9, "${42 ");
+ // The reference must start at offset zero of the text.
+ find!(find_cap_ref10, " $0 ");
+ find!(find_cap_ref11, "$");
+ find!(find_cap_ref12, " ");
+ find!(find_cap_ref13, "");
+ // Unbraced names extend over letters, digits and `_`, but stop at `-`.
+ find!(find_cap_ref14, "$1-$2", c!(1, 2));
+ find!(find_cap_ref15, "$1_$2", c!("1_", 3));
+ find!(find_cap_ref16, "$x-$y", c!("x", 2));
+ find!(find_cap_ref17, "$x_$y", c!("x_", 3));
+ // Braced names may contain arbitrary characters.
+ find!(find_cap_ref18, "${#}", c!("#", 4));
+ find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+}
diff --git a/third_party/rust/regex/src/find_byte.rs b/third_party/rust/regex/src/find_byte.rs
new file mode 100644
index 0000000000..e95f72afb9
--- /dev/null
+++ b/third_party/rust/regex/src/find_byte.rs
@@ -0,0 +1,18 @@
+/// Returns the offset of the first occurrence of `needle` in `haystack`, or
+/// `None` if the byte does not occur.
+///
+/// With the `perf-literal` feature enabled the search is delegated to the
+/// `memchr` crate; otherwise a plain byte-at-a-time scan is used.
+pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
+    // Accelerated search, only compiled in when `memchr` is available.
+    #[cfg(feature = "perf-literal")]
+    fn search(needle: u8, haystack: &[u8]) -> Option<usize> {
+        memchr::memchr(needle, haystack)
+    }
+
+    // Portable fallback.
+    #[cfg(not(feature = "perf-literal"))]
+    fn search(needle: u8, haystack: &[u8]) -> Option<usize> {
+        haystack.iter().position(|byte| *byte == needle)
+    }
+
+    search(needle, haystack)
+}
diff --git a/third_party/rust/regex/src/freqs.rs b/third_party/rust/regex/src/freqs.rs
new file mode 100644
index 0000000000..fcffa95fb5
--- /dev/null
+++ b/third_party/rust/regex/src/freqs.rs
@@ -0,0 +1,261 @@
+// NOTE: The following code was generated by "scripts/frequencies.py", do not
+// edit directly
+
+pub const BYTE_FREQUENCIES: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
diff --git a/third_party/rust/regex/src/input.rs b/third_party/rust/regex/src/input.rs
new file mode 100644
index 0000000000..df6c3e0c91
--- /dev/null
+++ b/third_party/rust/regex/src/input.rs
@@ -0,0 +1,432 @@
+use std::char;
+use std::cmp::Ordering;
+use std::fmt;
+use std::ops;
+use std::u32;
+
+use crate::literal::LiteralSearcher;
+use crate::prog::InstEmptyLook;
+use crate::utf8::{decode_last_utf8, decode_utf8};
+
+/// Represents a location in the input.
+#[derive(Clone, Copy, Debug)]
+pub struct InputAt {
+ /// Byte offset of this location.
+ pos: usize,
+ /// Character at this location; absent at the end of the input and always
+ /// absent for locations produced by `ByteInput`.
+ c: Char,
+ /// Byte at this location; only populated by `ByteInput`.
+ byte: Option<u8>,
+ /// Number of bytes to advance to reach the next location: the UTF-8
+ /// width of `c` for `CharInput`, `1` for `ByteInput`, and `0` at the end
+ /// of the input.
+ len: usize,
+}
+
+impl InputAt {
+    /// Reports whether this location is the very beginning of the input
+    /// (byte offset zero).
+    pub fn is_start(&self) -> bool {
+        let InputAt { pos, .. } = *self;
+        pos == 0
+    }
+
+    /// Reports whether this location is past the end of the input, i.e.
+    /// carries neither a character nor a byte.
+    pub fn is_end(&self) -> bool {
+        self.byte.is_none() && self.c.is_none()
+    }
+
+    /// The character at this location.
+    ///
+    /// The returned `Char` is absent when the location lies just before or
+    /// just after the input.
+    pub fn char(&self) -> Char {
+        let InputAt { c, .. } = *self;
+        c
+    }
+
+    /// The byte at this location, if any.
+    pub fn byte(&self) -> Option<u8> {
+        let InputAt { byte, .. } = *self;
+        byte
+    }
+
+    /// The UTF-8 width of the character at this location.
+    pub fn len(&self) -> usize {
+        let InputAt { len, .. } = *self;
+        len
+    }
+
+    /// Reports whether the UTF-8 width of the character at this location is
+    /// zero.
+    pub fn is_empty(&self) -> bool {
+        let InputAt { len, .. } = *self;
+        len == 0
+    }
+
+    /// The byte offset of this location.
+    pub fn pos(&self) -> usize {
+        let InputAt { pos, .. } = *self;
+        pos
+    }
+
+    /// The byte offset of the location that follows this one.
+    pub fn next_pos(&self) -> usize {
+        let InputAt { pos, len, .. } = *self;
+        pos + len
+    }
+}
+
+/// An abstraction over input used in the matching engines.
+///
+/// Implementations in this file are `CharInput` (locations carry decoded
+/// characters) and `ByteInput` (locations carry single bytes), plus a
+/// forwarding implementation for shared references.
+pub trait Input: fmt::Debug {
+ /// Return an encoding of the position at byte offset `i`.
+ fn at(&self, i: usize) -> InputAt;
+
+ /// Return the Unicode character occurring next to `at`.
+ ///
+ /// If no such character could be decoded, then `Char` is absent.
+ fn next_char(&self, at: InputAt) -> Char;
+
+ /// Return the Unicode character occurring previous to `at`.
+ ///
+ /// If no such character could be decoded, then `Char` is absent.
+ fn previous_char(&self, at: InputAt) -> Char;
+
+ /// Return true if the given empty width instruction matches at the
+ /// input position given.
+ fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
+
+ /// Scan the input for a matching prefix.
+ ///
+ /// The search starts at `at`; on success the location where the prefix
+ /// match begins is returned.
+ fn prefix_at(
+ &self,
+ prefixes: &LiteralSearcher,
+ at: InputAt,
+ ) -> Option<InputAt>;
+
+ /// The number of bytes in the input.
+ fn len(&self) -> usize;
+
+ /// Whether the input is empty.
+ ///
+ /// Provided in terms of `len`.
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Return the given input as a sequence of bytes.
+ fn as_bytes(&self) -> &[u8];
+}
+
+/// A shared reference to an input is itself an input; every method simply
+/// forwards to the referenced value.
+impl<'a, T: Input> Input for &'a T {
+    fn at(&self, i: usize) -> InputAt {
+        T::at(*self, i)
+    }
+
+    fn next_char(&self, at: InputAt) -> Char {
+        T::next_char(*self, at)
+    }
+
+    fn previous_char(&self, at: InputAt) -> Char {
+        T::previous_char(*self, at)
+    }
+
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        T::is_empty_match(*self, at, empty)
+    }
+
+    fn prefix_at(
+        &self,
+        prefixes: &LiteralSearcher,
+        at: InputAt,
+    ) -> Option<InputAt> {
+        T::prefix_at(*self, prefixes, at)
+    }
+
+    fn len(&self) -> usize {
+        T::len(*self)
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        T::as_bytes(*self)
+    }
+}
+
+/// An input reader whose locations carry decoded characters.
+#[derive(Clone, Copy, Debug)]
+pub struct CharInput<'t>(&'t [u8]);
+
+impl<'t> CharInput<'t> {
+    /// Creates a character-oriented reader over the bytes of `s`.
+    pub fn new(s: &'t [u8]) -> CharInput<'t> {
+        Self(s)
+    }
+}
+
+/// Gives direct slice access (indexing, `get`, ...) to the wrapped bytes.
+impl<'t> ops::Deref for CharInput<'t> {
+    type Target = [u8];
+
+    fn deref(&self) -> &[u8] {
+        let CharInput(bytes) = *self;
+        bytes
+    }
+}
+
+impl<'t> Input for CharInput<'t> {
+    /// Builds the location at byte offset `i`; offsets at or beyond the end
+    /// are clamped to the end-of-input location.
+    fn at(&self, i: usize) -> InputAt {
+        let text_len = self.len();
+        if i < text_len {
+            let c: Char = decode_utf8(&self[i..]).map(|(ch, _)| ch).into();
+            return InputAt { pos: i, c, byte: None, len: c.len_utf8() };
+        }
+        InputAt { pos: text_len, c: None.into(), byte: None, len: 0 }
+    }
+
+    /// The character at `at`, which the location already carries.
+    fn next_char(&self, at: InputAt) -> Char {
+        at.char()
+    }
+
+    /// The character ending immediately before `at`, if one can be decoded.
+    fn previous_char(&self, at: InputAt) -> Char {
+        let before = &self[..at.pos()];
+        decode_last_utf8(before).map(|(ch, _)| ch).into()
+    }
+
+    /// Evaluates the zero-width assertion `empty` at location `at`.
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use crate::prog::EmptyLook::*;
+        let prev = || self.previous_char(at);
+        let next = || self.next_char(at);
+        match empty.look {
+            StartLine => at.pos() == 0 || prev() == '\n',
+            EndLine => at.pos() == self.len() || next() == '\n',
+            StartText => at.pos() == 0,
+            EndText => at.pos() == self.len(),
+            WordBoundary => prev().is_word_char() != next().is_word_char(),
+            NotWordBoundary => {
+                prev().is_word_char() == next().is_word_char()
+            }
+            WordBoundaryAscii => {
+                prev().is_word_byte() != next().is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                prev().is_word_byte() == next().is_word_byte()
+            }
+        }
+    }
+
+    /// Searches for a literal prefix at or after `at` and returns the
+    /// location where it starts.
+    fn prefix_at(
+        &self,
+        prefixes: &LiteralSearcher,
+        at: InputAt,
+    ) -> Option<InputAt> {
+        let base = at.pos();
+        let (offset, _) = prefixes.find(&self[base..])?;
+        Some(self.at(base + offset))
+    }
+
+    fn len(&self) -> usize {
+        let CharInput(bytes) = *self;
+        bytes.len()
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        let CharInput(bytes) = *self;
+        bytes
+    }
+}
+
+/// An input reader whose locations carry single bytes.
+#[derive(Clone, Copy, Debug)]
+pub struct ByteInput<'t> {
+    text: &'t [u8],
+    only_utf8: bool,
+}
+
+impl<'t> ByteInput<'t> {
+    /// Creates a byte-oriented reader over `text`.
+    ///
+    /// `only_utf8` is consulted by the ASCII word-boundary assertions, which
+    /// refuse to match next to invalid UTF-8 when it is set.
+    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
+        Self { text, only_utf8 }
+    }
+}
+
+/// Gives direct slice access (indexing, `get`, ...) to the wrapped bytes.
+impl<'t> ops::Deref for ByteInput<'t> {
+    type Target = [u8];
+
+    fn deref(&self) -> &[u8] {
+        let ByteInput { text, .. } = *self;
+        text
+    }
+}
+
+impl<'t> Input for ByteInput<'t> {
+    /// Builds the location at byte offset `i`; offsets at or beyond the end
+    /// are clamped to the end-of-input location.
+    fn at(&self, i: usize) -> InputAt {
+        match self.text.get(i) {
+            Some(&b) => {
+                InputAt { pos: i, c: None.into(), byte: Some(b), len: 1 }
+            }
+            None => InputAt {
+                pos: self.text.len(),
+                c: None.into(),
+                byte: None,
+                len: 0,
+            },
+        }
+    }
+
+    /// The character starting at `at`, if one can be decoded.
+    fn next_char(&self, at: InputAt) -> Char {
+        let after = &self[at.pos()..];
+        decode_utf8(after).map(|(ch, _)| ch).into()
+    }
+
+    /// The character ending immediately before `at`, if one can be decoded.
+    fn previous_char(&self, at: InputAt) -> Char {
+        let before = &self[..at.pos()];
+        decode_last_utf8(before).map(|(ch, _)| ch).into()
+    }
+
+    /// Evaluates the zero-width assertion `empty` at location `at`.
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use crate::prog::EmptyLook::*;
+        let prev = || self.previous_char(at);
+        let next = || self.next_char(at);
+        // If we must match UTF-8, then ASCII word boundaries can't match at
+        // invalid UTF-8: an undecodable neighbour that is not simply the
+        // edge of the input rules the assertion out.
+        let at_invalid_utf8 = |c1: Char, c2: Char| {
+            self.only_utf8
+                && ((c1.is_none() && !at.is_start())
+                    || (c2.is_none() && !at.is_end()))
+        };
+        match empty.look {
+            StartLine => at.pos() == 0 || prev() == '\n',
+            EndLine => at.pos() == self.len() || next() == '\n',
+            StartText => at.pos() == 0,
+            EndText => at.pos() == self.len(),
+            WordBoundary => prev().is_word_char() != next().is_word_char(),
+            NotWordBoundary => {
+                prev().is_word_char() == next().is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (prev(), next());
+                !at_invalid_utf8(c1, c2)
+                    && c1.is_word_byte() != c2.is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (prev(), next());
+                !at_invalid_utf8(c1, c2)
+                    && c1.is_word_byte() == c2.is_word_byte()
+            }
+        }
+    }
+
+    /// Searches for a literal prefix at or after `at` and returns the
+    /// location where it starts.
+    fn prefix_at(
+        &self,
+        prefixes: &LiteralSearcher,
+        at: InputAt,
+    ) -> Option<InputAt> {
+        let base = at.pos();
+        let (offset, _) = prefixes.find(&self[base..])?;
+        Some(self.at(base + offset))
+    }
+
+    fn len(&self) -> usize {
+        let ByteInput { text, .. } = *self;
+        text.len()
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        let ByteInput { text, .. } = *self;
+        text
+    }
+}
+
+/// An inline representation of `Option<char>`.
+///
+/// This eliminates the need to do case analysis on `Option<char>` to determine
+/// ordinality with other characters.
+///
+/// (The `Option<char>` is not related to encoding. Instead, it is used in the
+/// matching engines to represent the beginning and ending boundaries of the
+/// search text.)
+///
+/// A present character is stored as its scalar value; the absent character
+/// is stored as `u32::MAX`, so it compares greater than every real character
+/// under the derived ordering.
+#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Char(u32);
+
+/// Formats a present character like `char`'s `Debug` output and an absent
+/// one as `Empty`.
+impl fmt::Debug for Char {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(c) = char::from_u32(self.0) {
+            write!(f, "{:?}", c)
+        } else {
+            f.write_str("Empty")
+        }
+    }
+}
+
+impl Char {
+    /// Reports whether this value stands for "no character".
+    #[inline]
+    pub fn is_none(self) -> bool {
+        let Char(code) = self;
+        code == u32::MAX
+    }
+
+    /// The number of bytes the character occupies when encoded as UTF-8.
+    ///
+    /// An absent character reports a length of `1`.
+    #[inline]
+    pub fn len_utf8(self) -> usize {
+        match char::from_u32(self.0) {
+            Some(c) => c.len_utf8(),
+            None => 1,
+        }
+    }
+
+    /// Reports whether the character is a Unicode word character.
+    ///
+    /// An absent character is never a word character.
+    pub fn is_word_char(self) -> bool {
+        // is_word_character can panic if the Unicode data for \w isn't
+        // available. However, our compiler ensures that if a Unicode word
+        // boundary is used, then the data must also be available. If it isn't,
+        // then the compiler returns an error.
+        match char::from_u32(self.0) {
+            Some(c) => regex_syntax::is_word_character(c),
+            None => false,
+        }
+    }
+
+    /// Reports whether the character is an ASCII word byte.
+    ///
+    /// Absent and non-ASCII characters are never word bytes.
+    pub fn is_word_byte(self) -> bool {
+        char::from_u32(self.0).map_or(false, |c| {
+            c <= '\u{7F}' && regex_syntax::is_word_byte(c as u8)
+        })
+    }
+}
+
+/// A present character is stored as its scalar value.
+impl From<char> for Char {
+    fn from(c: char) -> Self {
+        let code = c as u32;
+        Char(code)
+    }
+}
+
+/// `None` becomes the absent character, encoded as `u32::MAX`.
+impl From<Option<char>> for Char {
+    fn from(c: Option<char>) -> Self {
+        match c {
+            Some(ch) => ch.into(),
+            None => Char(u32::MAX),
+        }
+    }
+}
+
+/// Compares by scalar value; the absent character equals no `char`.
+impl PartialEq<char> for Char {
+    #[inline]
+    fn eq(&self, other: &char) -> bool {
+        let Char(code) = *self;
+        code == *other as u32
+    }
+}
+
+/// Compares by scalar value; no `char` equals the absent character.
+impl PartialEq<Char> for char {
+    #[inline]
+    fn eq(&self, other: &Char) -> bool {
+        let Char(code) = *other;
+        *self as u32 == code
+    }
+}
+
+/// Orders by scalar value; the absent character sorts after every `char`.
+impl PartialOrd<char> for Char {
+    #[inline]
+    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
+        let rhs = *other as u32;
+        Some(self.0.cmp(&rhs))
+    }
+}
+
+/// Orders by scalar value; every `char` sorts before the absent character.
+impl PartialOrd<Char> for char {
+    #[inline]
+    fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
+        let lhs = *self as u32;
+        Some(lhs.cmp(&other.0))
+    }
+}
diff --git a/third_party/rust/regex/src/lib.rs b/third_party/rust/regex/src/lib.rs
new file mode 100644
index 0000000000..6b95739c5c
--- /dev/null
+++ b/third_party/rust/regex/src/lib.rs
@@ -0,0 +1,769 @@
+/*!
+This crate provides a library for parsing, compiling, and executing regular
+expressions. Its syntax is similar to Perl-style regular expressions, but lacks
+a few features like look around and backreferences. In exchange, all searches
+execute in linear time with respect to the size of the regular expression and
+search text.
+
+This crate's documentation provides some simple examples, describes
+[Unicode support](#unicode) and exhaustively lists the
+[supported syntax](#syntax).
+
+For more specific details on the API for regular expressions, please see the
+documentation for the [`Regex`](struct.Regex.html) type.
+
+# Usage
+
+This crate is [on crates.io](https://crates.io/crates/regex) and can be
+used by adding `regex` to your dependencies in your project's `Cargo.toml`.
+
+```toml
+[dependencies]
+regex = "1"
+```
+
+# Example: find a date
+
+General use of regular expressions in this package involves compiling an
+expression and then using it to search, split or replace text. For example,
+to confirm that some text resembles a date:
+
+```rust
+use regex::Regex;
+let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
+assert!(re.is_match("2014-01-01"));
+```
+
+Notice the use of the `^` and `$` anchors. In this crate, every expression
+is executed with an implicit `.*?` at the beginning and end, which allows
+it to match anywhere in the text. Anchors can be used to ensure that the
+full text matches an expression.
+
+This example also demonstrates the utility of
+[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals)
+in Rust, which
+are just like regular strings except they are prefixed with an `r` and do
+not process any escape sequences. For example, `"\\d"` is the same
+expression as `r"\d"`.
+
+# Example: Avoid compiling the same regex in a loop
+
+It is an anti-pattern to compile the same regular expression in a loop
+since compilation is typically expensive. (It takes anywhere from a few
+microseconds to a few **milliseconds** depending on the size of the
+regex.) Not only is compilation itself expensive, but this also prevents
+optimizations that reuse allocations internally to the matching engines.
+
+In Rust, it can sometimes be a pain to pass regular expressions around if
+they're used from inside a helper function. Instead, we recommend using the
+[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that
+regular expressions are compiled exactly once.
+
+For example:
+
+```rust
+use lazy_static::lazy_static;
+use regex::Regex;
+
+fn some_helper_function(text: &str) -> bool {
+ lazy_static! {
+ static ref RE: Regex = Regex::new("...").unwrap();
+ }
+ RE.is_match(text)
+}
+
+fn main() {}
+```
+
+Specifically, in this example, the regex will be compiled when it is used for
+the first time. On subsequent uses, it will reuse the previous compilation.
+
+# Example: iterating over capture groups
+
+This crate provides convenient iterators for matching an expression
+repeatedly against a search string to find successive non-overlapping
+matches. For example, to find all dates in a string and be able to access
+them by their component pieces:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
+let text = "2012-03-14, 2013-01-01 and 2014-07-05";
+for cap in re.captures_iter(text) {
+ println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]);
+}
+// Output:
+// Month: 03 Day: 14 Year: 2012
+// Month: 01 Day: 01 Year: 2013
+// Month: 07 Day: 05 Year: 2014
+# }
+```
+
+Notice that the year is in the capture group indexed at `1`. This is
+because the *entire match* is stored in the capture group at index `0`.
+
+# Example: replacement with named capture groups
+
+Building on the previous example, perhaps we'd like to rearrange the date
+formats. This can be done with text replacement. But to make the code
+clearer, we can *name* our capture groups and use those names as variables
+in our replacement text:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
+let before = "2012-03-14, 2013-01-01 and 2014-07-05";
+let after = re.replace_all(before, "$m/$d/$y");
+assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
+# }
+```
+
+The `replace` methods are actually polymorphic in the replacement, which
+provides more flexibility than is seen here. (See the documentation for
+`Regex::replace` for more details.)
+
+Note that if your regex gets complicated, you can use the `x` flag to
+enable insignificant whitespace mode, which also lets you write comments:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?x)
+ (?P<y>\d{4}) # the year
+ -
+ (?P<m>\d{2}) # the month
+ -
+ (?P<d>\d{2}) # the day
+").unwrap();
+let before = "2012-03-14, 2013-01-01 and 2014-07-05";
+let after = re.replace_all(before, "$m/$d/$y");
+assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
+# }
+```
+
+If you wish to match against whitespace in this mode, you can still use `\s`,
+`\n`, `\t`, etc. For escaping a single space character, you can escape it
+directly with `\ `, use its hex character code `\x20` or temporarily disable
+the `x` flag, e.g., `(?-x: )`.
+
+# Example: match multiple regular expressions simultaneously
+
+This demonstrates how to use a `RegexSet` to match multiple (possibly
+overlapping) regular expressions in a single scan of the search text:
+
+```rust
+use regex::RegexSet;
+
+let set = RegexSet::new(&[
+ r"\w+",
+ r"\d+",
+ r"\pL+",
+ r"foo",
+ r"bar",
+ r"barfoo",
+ r"foobar",
+]).unwrap();
+
+// Iterate over and collect all of the matches.
+let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+
+// You can also test whether a particular regex matched:
+let matches = set.matches("foobar");
+assert!(!matches.matched(5));
+assert!(matches.matched(6));
+```
+
+# Pay for what you use
+
+With respect to searching text with a regular expression, there are three
+questions that can be asked:
+
+1. Does the text match this expression?
+2. If so, where does it match?
+3. Where did the capturing groups match?
+
+Generally speaking, this crate could provide a function to answer only #3,
+which would subsume #1 and #2 automatically. However, it can be significantly
+more expensive to compute the location of capturing group matches, so it's best
+not to do it if you don't need to.
+
+Therefore, only use what you need. For example, don't use `find` if you
+only need to test if an expression matches a string. (Use `is_match`
+instead.)
+
+# Unicode
+
+This implementation executes regular expressions **only** on valid UTF-8
+while exposing match locations as byte indices into the search string. (To
+relax this restriction, use the [`bytes`](bytes/index.html) sub-module.)
+
+Only simple case folding is supported. Namely, when matching
+case-insensitively, the characters are first mapped using the "simple" case
+folding rules defined by Unicode.
+
+Regular expressions themselves are **only** interpreted as a sequence of
+Unicode scalar values. This means you can use Unicode characters directly
+in your expression:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?i)Δ+").unwrap();
+let mat = re.find("ΔδΔ").unwrap();
+assert_eq!((mat.start(), mat.end()), (0, 6));
+# }
+```
+
+Most features of the regular expressions in this crate are Unicode aware. Here
+are some examples:
+
+* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`.
+ (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.)
+* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms
+ of whitespace categorized by Unicode.
+* `\b` matches a Unicode word boundary.
+* Negated character classes like `[^a]` match all Unicode scalar values except
+ for `a`.
+* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only
+ recognize `\n` and not any of the other forms of line terminators defined
+ by Unicode.
+
+Unicode general categories, scripts, script extensions, ages and a smattering
+of boolean properties are available as character classes. For example, you can
+match a sequence of numerals, Greek or Cherokee letters:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap();
+let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap();
+assert_eq!((mat.start(), mat.end()), (3, 23));
+# }
+```
+
+For a more detailed breakdown of Unicode support with respect to
+[UTS#18](https://unicode.org/reports/tr18/),
+please see the
+[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md)
+document in the root of the regex repository.
+
+# Opt out of Unicode support
+
+The `bytes` sub-module provides a `Regex` type that can be used to match
+on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
+the main `Regex` type. However, this behavior can be disabled by turning
+off the `u` flag, even if doing so could result in matching invalid UTF-8.
+For example, when the `u` flag is disabled, `.` will match any byte instead
+of any Unicode scalar value.
+
+Disabling the `u` flag is also possible with the standard `&str`-based `Regex`
+type, but it is only allowed where the UTF-8 invariant is maintained. For
+example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an
+`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte
+`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based
+regexes.
+
+Finally, since Unicode support requires bundling large Unicode data
+tables, this crate exposes knobs to disable the compilation of those
+data tables, which can be useful for shrinking binary size and reducing
+compilation times. For details on how to do that, see the section on [crate
+features](#crate-features).
+
+# Syntax
+
+The syntax supported in this crate is documented below.
+
+Note that the regular expression parser and abstract syntax are exposed in
+a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
+
+## Matching one character
+
+<pre class="rust">
+. any character except new line (includes new line with s flag)
+\d digit (\p{Nd})
+\D not digit
+\pN One-letter name Unicode character class
+\p{Greek} Unicode character class (general category or script)
+\PN Negated one-letter name Unicode character class
+\P{Greek} negated Unicode character class (general category or script)
+</pre>
+
+### Character classes
+
+<pre class="rust">
+[xyz] A character class matching either x, y or z (union).
+[^xyz] A character class matching any character except x, y and z.
+[a-z] A character class matching any character in range a-z.
+[[:alpha:]] ASCII character class ([A-Za-z])
+[[:^alpha:]] Negated ASCII character class ([^A-Za-z])
+[x[^xyz]] Nested/grouping character class (matching any character except y and z)
+[a-y&&xyz] Intersection (matching x or y)
+[0-9&&[^4]] Subtraction using intersection and negation (matching 0-9 except 4)
+[0-9--4] Direct subtraction (matching 0-9 except 4)
+[a-g~~b-h] Symmetric difference (matching `a` and `h` only)
+[\[\]] Escaping in character classes (matching [ or ])
+</pre>
+
+Any named character class may appear inside a bracketed `[...]` character
+class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII
+digit. `[\p{Greek}&&\pL]` matches Greek letters.
+
+Precedence in character classes, from most binding to least:
+
+1. Ranges: `a-cd` == `[a-c]d`
+2. Union: `ab&&bc` == `[ab]&&[bc]`
+3. Intersection: `^a-z&&b` == `^[a-z&&b]`
+4. Negation
+
+## Composites
+
+<pre class="rust">
+xy concatenation (x followed by y)
+x|y alternation (x or y, prefer x)
+</pre>
+
+## Repetitions
+
+<pre class="rust">
+x* zero or more of x (greedy)
+x+ one or more of x (greedy)
+x? zero or one of x (greedy)
+x*? zero or more of x (ungreedy/lazy)
+x+? one or more of x (ungreedy/lazy)
+x?? zero or one of x (ungreedy/lazy)
+x{n,m} at least n x and at most m x (greedy)
+x{n,} at least n x (greedy)
+x{n} exactly n x
+x{n,m}? at least n x and at most m x (ungreedy/lazy)
+x{n,}? at least n x (ungreedy/lazy)
+x{n}? exactly n x
+</pre>
+
+## Empty matches
+
+<pre class="rust">
+^ the beginning of text (or start-of-line with multi-line mode)
+$ the end of text (or end-of-line with multi-line mode)
+\A only the beginning of text (even with multi-line mode enabled)
+\z only the end of text (even with multi-line mode enabled)
+\b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B not a Unicode word boundary
+</pre>
+
+The empty regex is valid and matches the empty string. For example, the empty
+regex matches `abc` at positions `0`, `1`, `2` and `3`.
+
+## Grouping and flags
+
+<pre class="rust">
+(exp) numbered capture group (indexed by opening parenthesis)
+(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?:exp) non-capturing group
+(?flags) set flags within current group
+(?flags:exp) set flags for exp (non-capturing)
+</pre>
+
+Flags are each a single character. For example, `(?x)` sets the flag `x`
+and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
+the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
+the `x` flag and clears the `y` flag.
+
+All flags are by default disabled unless stated otherwise. They are:
+
+<pre class="rust">
+i case-insensitive: letters match both upper and lower case
+m multi-line mode: ^ and $ match begin/end of line
+s allow . to match \n
+U swap the meaning of x* and x*?
+u Unicode support (enabled by default)
+x ignore whitespace and allow line comments (starting with `#`)
+</pre>
+
+Flags can be toggled within a pattern. Here's an example that matches
+case-insensitively for the first part but case-sensitively for the second part:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
+let cap = re.captures("AaAaAbbBBBb").unwrap();
+assert_eq!(&cap[0], "AaAaAbb");
+# }
+```
+
+Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
+`b`.
+
+Multi-line mode means `^` and `$` no longer match just at the beginning/end of
+the input, but at the beginning/end of lines:
+
+```
+# use regex::Regex;
+let re = Regex::new(r"(?m)^line \d+").unwrap();
+let m = re.find("line one\nline 2\n").unwrap();
+assert_eq!(m.as_str(), "line 2");
+```
+
+Note that `^` matches after new lines, even at the end of input:
+
+```
+# use regex::Regex;
+let re = Regex::new(r"(?m)^").unwrap();
+let m = re.find_iter("test\n").last().unwrap();
+assert_eq!((m.start(), m.end()), (5, 5));
+```
+
+Here is an example that uses an ASCII word boundary instead of a Unicode
+word boundary:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
+let cap = re.captures("$$abc$$").unwrap();
+assert_eq!(&cap[0], "abc");
+# }
+```
+
+## Escape sequences
+
+<pre class="rust">
+\* literal *, works for any punctuation character: \.+*?()|[]{}^$
+\a bell (\x07)
+\f form feed (\x0C)
+\t horizontal tab
+\n new line
+\r carriage return
+\v vertical tab (\x0B)
+\123 octal character code (up to three digits) (when enabled)
+\x7F hex character code (exactly two digits)
+\x{10FFFF} any hex character code corresponding to a Unicode code point
+\u007F hex character code (exactly four digits)
+\u{7F} any hex character code corresponding to a Unicode code point
+\U0000007F hex character code (exactly eight digits)
+\U{7F} any hex character code corresponding to a Unicode code point
+</pre>
+
+## Perl character classes (Unicode friendly)
+
+These classes are based on the definitions provided in
+[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
+
+<pre class="rust">
+\d digit (\p{Nd})
+\D not digit
+\s whitespace (\p{White_Space})
+\S not whitespace
+\w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
+\W not word character
+</pre>
+
+## ASCII character classes
+
+<pre class="rust">
+[[:alnum:]] alphanumeric ([0-9A-Za-z])
+[[:alpha:]] alphabetic ([A-Za-z])
+[[:ascii:]] ASCII ([\x00-\x7F])
+[[:blank:]] blank ([\t ])
+[[:cntrl:]] control ([\x00-\x1F\x7F])
+[[:digit:]] digits ([0-9])
+[[:graph:]] graphical ([!-~])
+[[:lower:]] lower case ([a-z])
+[[:print:]] printable ([ -~])
+[[:punct:]] punctuation ([!-/:-@\[-`{-~])
+[[:space:]] whitespace ([\t\n\v\f\r ])
+[[:upper:]] upper case ([A-Z])
+[[:word:]] word characters ([0-9A-Za-z_])
+[[:xdigit:]] hex digit ([0-9A-Fa-f])
+</pre>
+
+# Crate features
+
+By default, this crate tries pretty hard to make regex matching both as fast
+as possible and as correct as it can be, within reason. This means that there
+is a lot of code dedicated to performance, the handling of Unicode data and the
+Unicode data itself. Overall, this leads to more dependencies, larger binaries
+and longer compile times. This trade off may not be appropriate in all cases,
+and indeed, even when all Unicode and performance features are disabled, one
+is still left with a perfectly serviceable regex engine that will work well
+in many cases.
+
+This crate exposes a number of features for controlling that trade off. Some
+of these features are strictly performance oriented, such that disabling them
+won't result in a loss of functionality, but may result in worse performance.
+Other features, such as the ones controlling the presence or absence of Unicode
+data, can result in a loss of functionality. For example, if one disables the
+`unicode-case` feature (described below), then compiling the regex `(?i)a`
+will fail since Unicode case insensitivity is enabled by default. Instead,
+callers must use `(?i-u)a` to disable Unicode case folding. Stated
+differently, enabling or disabling any of the features below can only add or
+subtract from the total set of valid regular expressions. Enabling or disabling
+a feature will never modify the match semantics of a regular expression.
+
+All features below are enabled by default.
+
+### Ecosystem features
+
+* **std** -
+ When enabled, this will cause `regex` to use the standard library. Currently,
+ disabling this feature will always result in a compilation error. It is
+ intended to add `alloc`-only support to regex in the future.
+
+### Performance features
+
+* **perf** -
+ Enables all performance related features. This feature is enabled by default
+ and will always cover all features that improve performance, even if more
+ are added in the future.
+* **perf-dfa** -
+ Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
+ portions of a regex to a very fast DFA on an as-needed basis. This can
+ result in substantial speedups, usually by an order of magnitude on large
+ haystacks. The lazy DFA does not bring in any new dependencies, but it can
+ make compile times longer.
+* **perf-inline** -
+ Enables the use of aggressive inlining inside match routines. This reduces
+ the overhead of each match. The aggressive inlining, however, increases
+ compile times and binary size.
+* **perf-literal** -
+ Enables the use of literal optimizations for speeding up matches. In some
+ cases, literal optimizations can result in speedups of _several_ orders of
+ magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies.
+* **perf-cache** -
+ This feature used to enable a faster internal cache at the cost of using
+ additional dependencies, but this is no longer an option. A fast internal
+ cache is now used unconditionally with no additional dependencies. This may
+ change in the future.
+
+### Unicode features
+
+* **unicode** -
+ Enables all Unicode features. This feature is enabled by default, and will
+ always cover all Unicode features, even if more are added in the future.
+* **unicode-age** -
+ Provide the data for the
+ [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
+ This makes it possible to use classes like `\p{Age:6.0}` to refer to all
+ codepoints first introduced in Unicode 6.0.
+* **unicode-bool** -
+ Provide the data for numerous Unicode boolean properties. The full list
+ is not included here, but contains properties like `Alphabetic`, `Emoji`,
+ `Lowercase`, `Math`, `Uppercase` and `White_Space`.
+* **unicode-case** -
+ Provide the data for case insensitive matching using
+ [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
+* **unicode-gencat** -
+ Provide the data for
+ [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+ This includes, but is not limited to, `Decimal_Number`, `Letter`,
+ `Math_Symbol`, `Number` and `Punctuation`.
+* **unicode-perl** -
+ Provide the data for supporting the Unicode-aware Perl character classes,
+ corresponding to `\w`, `\s` and `\d`. This is also necessary for using
+ Unicode-aware word boundary assertions. Note that if this feature is
+ disabled, the `\s` and `\d` character classes are still available if the
+ `unicode-bool` and `unicode-gencat` features are enabled, respectively.
+* **unicode-script** -
+ Provide the data for
+ [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
+ This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
+ `Latin` and `Thai`.
+* **unicode-segment** -
+ Provide the data for the properties used to implement the
+ [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
+ This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
+ `\p{sb=ATerm}`.
+
+
+# Untrusted input
+
+This crate can handle both untrusted regular expressions and untrusted
+search text.
+
+Untrusted regular expressions are handled by capping the size of a compiled
+regular expression.
+(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).)
+Without this, it would be trivial for an attacker to exhaust your system's
+memory with expressions like `a{100}{100}{100}`.
+
+Untrusted search text is allowed because the matching engine(s) in this
+crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search
+text`), which means there's no way to cause exponential blow-up like with
+some other regular expression engines. (We pay for this by disallowing
+features like arbitrary look-ahead and backreferences.)
+
+When a DFA is used, pathological cases with exponential state blow-up are
+avoided by constructing the DFA lazily or in an "online" manner. Therefore,
+at most one new state can be created for each byte of input. This satisfies
+our time complexity guarantees, but can lead to memory growth
+proportional to the size of the input. As a stopgap, the DFA is only
+allowed to store a fixed number of states. When the limit is reached, its
+states are wiped and it continues on, possibly duplicating previous work. If
+the limit is reached too frequently, it gives up and hands control off to
+another matching engine with fixed memory requirements.
+(The DFA size limit can also be tweaked. See
+[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).)
+*/
+
+#![deny(missing_docs)]
+#![cfg_attr(feature = "pattern", feature(pattern))]
+#![warn(missing_debug_implementations)]
+
+#[cfg(not(feature = "std"))]
+compile_error!("`std` feature is currently required to build this crate");
+
+// To check README's example
+// TODO: Re-enable this once the MSRV is 1.43 or greater.
+// See: https://github.com/rust-lang/regex/issues/684
+// See: https://github.com/rust-lang/regex/issues/685
+// #[cfg(doctest)]
+// doc_comment::doctest!("../README.md");
+
+#[cfg(feature = "std")]
+pub use crate::error::Error;
+#[cfg(feature = "std")]
+pub use crate::re_builder::set_unicode::*;
+#[cfg(feature = "std")]
+pub use crate::re_builder::unicode::*;
+#[cfg(feature = "std")]
+pub use crate::re_set::unicode::*;
+#[cfg(feature = "std")]
+pub use crate::re_unicode::{
+ escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
+ Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
+ SplitN, SubCaptureMatches,
+};
+
+/**
+Match regular expressions on arbitrary bytes.
+
+This module provides a nearly identical API to the one found in the
+top-level of this crate. There are two important differences:
+
+1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
+is used where `String` would have been used.
+2. Unicode support can be disabled even when disabling it would result in
+matching invalid UTF-8 bytes.
+
+# Example: match null terminated string
+
+This shows how to find all null-terminated strings in a slice of bytes:
+
+```rust
+# use regex::bytes::Regex;
+let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
+let text = b"foo\x00bar\x00baz\x00";
+
+// Extract all of the strings without the null terminator from each match.
+// The unwrap is OK here since a match requires the `cstr` capture to match.
+let cstrs: Vec<&[u8]> =
+ re.captures_iter(text)
+ .map(|c| c.name("cstr").unwrap().as_bytes())
+ .collect();
+assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs);
+```
+
+# Example: selectively enable Unicode support
+
+This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded
+string (e.g., to extract a title from a Matroska file):
+
+```rust
+# use std::str;
+# use regex::bytes::Regex;
+let re = Regex::new(
+ r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
+).unwrap();
+let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
+let caps = re.captures(text).unwrap();
+
+// Notice that despite the `.*` at the end, it will only match valid UTF-8
+// because Unicode mode was enabled with the `u` flag. Without the `u` flag,
+// the `.*` would match the rest of the bytes.
+let mat = caps.get(1).unwrap();
+assert_eq!((7, 10), (mat.start(), mat.end()));
+
+// If there was a match, Unicode mode guarantees that `title` is valid UTF-8.
+let title = str::from_utf8(&caps[1]).unwrap();
+assert_eq!("☃", title);
+```
+
+In general, if the Unicode flag is enabled in a capture group and that capture
+is part of the overall match, then the capture is *guaranteed* to be valid
+UTF-8.
+
+# Syntax
+
+The supported syntax is pretty much the same as the syntax for Unicode
+regular expressions with a few changes that make sense for matching arbitrary
+bytes:
+
+1. The `u` flag can be disabled even when disabling it might cause the regex to
+match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
+"ASCII compatible" mode.
+2. In ASCII compatible mode, neither Unicode scalar values nor Unicode
+character classes are allowed.
+3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
+revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
+to `[[:digit:]]` and `\s` maps to `[[:space:]]`.
+4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to
+determine whether a byte is a word byte or not.
+5. Hexadecimal notation can be used to specify arbitrary bytes instead of
+Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
+literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
+matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
+enabled.
+6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
+`s` flag is additionally enabled, `.` matches any byte.
+
+# Performance
+
+In general, one should expect performance on `&[u8]` to be roughly similar to
+performance on `&str`.
+*/
+#[cfg(feature = "std")]
+pub mod bytes {
+ pub use crate::re_builder::bytes::*;
+ pub use crate::re_builder::set_bytes::*;
+ pub use crate::re_bytes::*;
+ pub use crate::re_set::bytes::*;
+}
+
+mod backtrack;
+mod compile;
+#[cfg(feature = "perf-dfa")]
+mod dfa;
+mod error;
+mod exec;
+mod expand;
+mod find_byte;
+mod input;
+mod literal;
+#[cfg(feature = "pattern")]
+mod pattern;
+mod pikevm;
+mod pool;
+mod prog;
+mod re_builder;
+mod re_bytes;
+mod re_set;
+mod re_trait;
+mod re_unicode;
+mod sparse;
+mod utf8;
+
+/// The `internal` module exists to support suspicious activity, such as
+/// testing different matching engines and supporting the `regex-debug` CLI
+/// utility.
+#[doc(hidden)]
+#[cfg(feature = "std")]
+pub mod internal {
+ pub use crate::compile::Compiler;
+ pub use crate::exec::{Exec, ExecBuilder};
+ pub use crate::input::{Char, CharInput, Input, InputAt};
+ pub use crate::literal::LiteralSearcher;
+ pub use crate::prog::{EmptyLook, Inst, InstRanges, Program};
+}
diff --git a/third_party/rust/regex/src/literal/imp.rs b/third_party/rust/regex/src/literal/imp.rs
new file mode 100644
index 0000000000..90b2f11606
--- /dev/null
+++ b/third_party/rust/regex/src/literal/imp.rs
@@ -0,0 +1,402 @@
+use std::mem;
+
+use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use memchr::{memchr, memchr2, memchr3, memmem};
+use regex_syntax::hir::literal::{Literal, Literals};
+
+/// A prefix extracted from a compiled regular expression.
+///
+/// A regex prefix is a set of literal strings that *must* be matched at the
+/// beginning of a regex in order for the entire regex to match. Similarly
+/// for a regex suffix.
+#[derive(Clone, Debug)]
+pub struct LiteralSearcher {
+ complete: bool,
+ lcp: Memmem,
+ lcs: Memmem,
+ matcher: Matcher,
+}
+
+#[derive(Clone, Debug)]
+enum Matcher {
+ /// No literals. (Never advances through the input.)
+ Empty,
+ /// A set of one or more distinct single byte literals.
+ Bytes(SingleByteSet),
+ /// A single substring, using vector accelerated routines when available.
+ Memmem(Memmem),
+ /// An Aho-Corasick automaton.
+ AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+ /// A packed multiple substring searcher, using SIMD.
+ ///
+ /// Note that Aho-Corasick will actually use this packed searcher
+ /// internally automatically, however, there is some overhead associated
+ /// with going through the Aho-Corasick machinery. So using the packed
+ /// searcher directly results in some gains.
+ Packed { s: packed::Searcher, lits: Vec<Literal> },
+}
+
+impl LiteralSearcher {
+ /// Returns a matcher that never matches and never advances the input.
+ pub fn empty() -> Self {
+ Self::new(Literals::empty(), Matcher::Empty)
+ }
+
+ /// Returns a matcher for literal prefixes from the given set.
+ pub fn prefixes(lits: Literals) -> Self {
+ let matcher = Matcher::prefixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ /// Returns a matcher for literal suffixes from the given set.
+ pub fn suffixes(lits: Literals) -> Self {
+ let matcher = Matcher::suffixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ fn new(lits: Literals, matcher: Matcher) -> Self {
+ let complete = lits.all_complete();
+ LiteralSearcher {
+ complete,
+ lcp: Memmem::new(lits.longest_common_prefix()),
+ lcs: Memmem::new(lits.longest_common_suffix()),
+ matcher,
+ }
+ }
+
+ /// Returns true if all matches comprise the entire regular expression.
+ ///
+ /// This does not necessarily mean that a literal match implies a match
+ /// of the regular expression. For example, the regular expression `^a`
+ /// is comprised of a single complete literal `a`, but the regular
+ /// expression demands that it only match at the beginning of a string.
+ pub fn complete(&self) -> bool {
+ self.complete && !self.is_empty()
+ }
+
+ /// Find the position of a literal in `haystack` if it exists.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => Some((0, 0)),
+ Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
+ Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
+ AC { ref ac, .. } => {
+ ac.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ Packed { ref s, .. } => {
+ s.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ }
+ }
+
+ /// Like find, except matches must start at index `0`.
+ pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[0..lit.len()] {
+ return Some((0, lit.len()));
+ }
+ }
+ None
+ }
+
+ /// Like find, except matches must end at index `haystack.len()`.
+ pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[haystack.len() - lit.len()..] {
+ return Some((haystack.len() - lit.len(), haystack.len()));
+ }
+ }
+ None
+ }
+
+ /// Returns an iterator over all literals to be matched.
+ pub fn iter(&self) -> LiteralIter<'_> {
+ match self.matcher {
+ Matcher::Empty => LiteralIter::Empty,
+ Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
+ Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
+ Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
+ Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
+ }
+ }
+
+ /// Returns a matcher for the longest common prefix of this matcher.
+ pub fn lcp(&self) -> &Memmem {
+ &self.lcp
+ }
+
+ /// Returns a matcher for the longest common suffix of this matcher.
+ pub fn lcs(&self) -> &Memmem {
+ &self.lcs
+ }
+
+ /// Returns true iff this prefix is empty.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the number of literals in this matcher.
+ pub fn len(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.dense.len(),
+ Memmem(_) => 1,
+ AC { ref ac, .. } => ac.pattern_count(),
+ Packed { ref lits, .. } => lits.len(),
+ }
+ }
+
+ /// Return the approximate heap usage of literals in bytes.
+ pub fn approximate_size(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.approximate_size(),
+ Memmem(ref single) => single.approximate_size(),
+ AC { ref ac, .. } => ac.heap_bytes(),
+ Packed { ref s, .. } => s.heap_bytes(),
+ }
+ }
+}
+
+impl Matcher {
+ fn prefixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::prefixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn suffixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::suffixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn new(lits: &Literals, sset: SingleByteSet) -> Self {
+ if lits.literals().is_empty() {
+ return Matcher::Empty;
+ }
+ if sset.dense.len() >= 26 {
+ // Avoid trying to match a large number of single bytes.
+ // This is *very* sensitive to a frequency analysis comparison
+ // between the bytes in sset and the composition of the haystack.
+ // No matter the size of sset, if its members all are rare in the
+ // haystack, then it'd be worth using it. How to tune this... IDK.
+ // ---AG
+ return Matcher::Empty;
+ }
+ if sset.complete {
+ return Matcher::Bytes(sset);
+ }
+ if lits.literals().len() == 1 {
+ return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
+ }
+
+ let pats = lits.literals().to_owned();
+ let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
+ if lits.literals().len() <= 100 && !is_aho_corasick_fast {
+ let mut builder = packed::Config::new()
+ .match_kind(packed::MatchKind::LeftmostFirst)
+ .builder();
+ if let Some(s) = builder.extend(&pats).build() {
+ return Matcher::Packed { s, lits: pats };
+ }
+ }
+ let ac = AhoCorasickBuilder::new()
+ .match_kind(aho_corasick::MatchKind::LeftmostFirst)
+ .dfa(true)
+ .build_with_size::<u32, _, _>(&pats)
+ .unwrap();
+ Matcher::AC { ac, lits: pats }
+ }
+}
+
+#[derive(Debug)]
+pub enum LiteralIter<'a> {
+ Empty,
+ Bytes(&'a [u8]),
+ Single(&'a [u8]),
+ AC(&'a [Literal]),
+ Packed(&'a [Literal]),
+}
+
+impl<'a> Iterator for LiteralIter<'a> {
+ type Item = &'a [u8];
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match *self {
+ LiteralIter::Empty => None,
+ LiteralIter::Bytes(ref mut many) => {
+ if many.is_empty() {
+ None
+ } else {
+ let next = &many[0..1];
+ *many = &many[1..];
+ Some(next)
+ }
+ }
+ LiteralIter::Single(ref mut one) => {
+ if one.is_empty() {
+ None
+ } else {
+ let next = &one[..];
+ *one = &[];
+ Some(next)
+ }
+ }
+ LiteralIter::AC(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ LiteralIter::Packed(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+struct SingleByteSet {
+ sparse: Vec<bool>,
+ dense: Vec<u8>,
+ complete: bool,
+ all_ascii: bool,
+}
+
+impl SingleByteSet {
+ fn new() -> SingleByteSet {
+ SingleByteSet {
+ sparse: vec![false; 256],
+ dense: vec![],
+ complete: true,
+ all_ascii: true,
+ }
+ }
+
+ fn prefixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(0) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ fn suffixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ /// Faster find that special cases certain sizes to use memchr.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find(&self, text: &[u8]) -> Option<usize> {
+ match self.dense.len() {
+ 0 => None,
+ 1 => memchr(self.dense[0], text),
+ 2 => memchr2(self.dense[0], self.dense[1], text),
+ 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
+ _ => self._find(text),
+ }
+ }
+
+ /// Generic find that works on any sized set.
+ fn _find(&self, haystack: &[u8]) -> Option<usize> {
+ for (i, &b) in haystack.iter().enumerate() {
+ if self.sparse[b as usize] {
+ return Some(i);
+ }
+ }
+ None
+ }
+
+ fn approximate_size(&self) -> usize {
+ (self.dense.len() * mem::size_of::<u8>())
+ + (self.sparse.len() * mem::size_of::<bool>())
+ }
+}
+
+/// A simple wrapper around the memchr crate's memmem implementation.
+///
+/// The API this exposes mirrors the API of previous substring searchers that
+/// this supplanted.
+#[derive(Clone, Debug)]
+pub struct Memmem {
+ finder: memmem::Finder<'static>,
+ char_len: usize,
+}
+
+impl Memmem {
+ fn new(pat: &[u8]) -> Memmem {
+ Memmem {
+ finder: memmem::Finder::new(pat).into_owned(),
+ char_len: char_len_lossy(pat),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ self.finder.find(haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn is_suffix(&self, text: &[u8]) -> bool {
+ if text.len() < self.len() {
+ return false;
+ }
+ &text[text.len() - self.len()..] == self.finder.needle()
+ }
+
+ pub fn len(&self) -> usize {
+ self.finder.needle().len()
+ }
+
+ pub fn char_len(&self) -> usize {
+ self.char_len
+ }
+
+ fn approximate_size(&self) -> usize {
+ self.finder.needle().len() * mem::size_of::<u8>()
+ }
+}
+
+fn char_len_lossy(bytes: &[u8]) -> usize {
+ String::from_utf8_lossy(bytes).chars().count()
+}
diff --git a/third_party/rust/regex/src/literal/mod.rs b/third_party/rust/regex/src/literal/mod.rs
new file mode 100644
index 0000000000..980f523309
--- /dev/null
+++ b/third_party/rust/regex/src/literal/mod.rs
@@ -0,0 +1,55 @@
+pub use self::imp::*;
+
+#[cfg(feature = "perf-literal")]
+mod imp;
+
+#[allow(missing_docs)]
+#[cfg(not(feature = "perf-literal"))]
+mod imp {
+ use regex_syntax::hir::literal::Literals;
+
+ #[derive(Clone, Debug)]
+ pub struct LiteralSearcher(());
+
+ impl LiteralSearcher {
+ pub fn empty() -> Self {
+ LiteralSearcher(())
+ }
+
+ pub fn prefixes(_: Literals) -> Self {
+ LiteralSearcher(())
+ }
+
+ pub fn suffixes(_: Literals) -> Self {
+ LiteralSearcher(())
+ }
+
+ pub fn complete(&self) -> bool {
+ false
+ }
+
+ pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> {
+ unreachable!()
+ }
+
+ pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> {
+ unreachable!()
+ }
+
+ pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> {
+ unreachable!()
+ }
+
+ pub fn is_empty(&self) -> bool {
+ true
+ }
+
+ pub fn len(&self) -> usize {
+ 0
+ }
+
+ pub fn approximate_size(&self) -> usize {
+ 0
+ }
+ }
+}
diff --git a/third_party/rust/regex/src/pattern.rs b/third_party/rust/regex/src/pattern.rs
new file mode 100644
index 0000000000..00549e5106
--- /dev/null
+++ b/third_party/rust/regex/src/pattern.rs
@@ -0,0 +1,63 @@
+use std::str::pattern::{Pattern, SearchStep, Searcher};
+
+use crate::re_unicode::{Matches, Regex};
+
+#[derive(Debug)]
+pub struct RegexSearcher<'r, 't> {
+ haystack: &'t str,
+ it: Matches<'r, 't>,
+ last_step_end: usize,
+ next_match: Option<(usize, usize)>,
+}
+
+impl<'r, 't> Pattern<'t> for &'r Regex {
+ type Searcher = RegexSearcher<'r, 't>;
+
+ fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> {
+ RegexSearcher {
+ haystack,
+ it: self.find_iter(haystack),
+ last_step_end: 0,
+ next_match: None,
+ }
+ }
+}
+
+unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> {
+ #[inline]
+ fn haystack(&self) -> &'t str {
+ self.haystack
+ }
+
+ #[inline]
+ fn next(&mut self) -> SearchStep {
+ if let Some((s, e)) = self.next_match {
+ self.next_match = None;
+ self.last_step_end = e;
+ return SearchStep::Match(s, e);
+ }
+ match self.it.next() {
+ None => {
+ if self.last_step_end < self.haystack().len() {
+ let last = self.last_step_end;
+ self.last_step_end = self.haystack().len();
+ SearchStep::Reject(last, self.haystack().len())
+ } else {
+ SearchStep::Done
+ }
+ }
+ Some(m) => {
+ let (s, e) = (m.start(), m.end());
+ if s == self.last_step_end {
+ self.last_step_end = e;
+ SearchStep::Match(s, e)
+ } else {
+ self.next_match = Some((s, e));
+ let last = self.last_step_end;
+ self.last_step_end = s;
+ SearchStep::Reject(last, s)
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/rust/regex/src/pikevm.rs b/third_party/rust/regex/src/pikevm.rs
new file mode 100644
index 0000000000..8c9eac2d39
--- /dev/null
+++ b/third_party/rust/regex/src/pikevm.rs
@@ -0,0 +1,360 @@
+// This module implements the Pike VM. That is, it guarantees linear time
+// search of a regex on any text with memory use proportional to the size of
+// the regex.
+//
+// It is equal in power to the backtracking engine in this crate, except the
+// backtracking engine is typically faster on small regexes/texts at the
+// expense of a bigger memory footprint.
+//
+// It can do more than the DFA can (specifically, record capture locations
+// and execute Unicode word boundary assertions), but at a slower speed.
+// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
+// epsilon transitions. That is, the Pike VM engine can be in multiple states
+ // at once whereas the DFA is only ever in one state at a time.
+//
+// Therefore, the Pike VM is generally treated as the fallback when the other
+// matching engines either aren't feasible to run or are insufficient.
+
+use std::mem;
+
+use crate::exec::ProgramCache;
+use crate::input::{Input, InputAt};
+use crate::prog::{InstPtr, Program};
+use crate::re_trait::Slot;
+use crate::sparse::SparseSet;
+
+ /// An NFA simulation matching engine.
+ ///
+ /// `'r` is the lifetime of the borrowed program and scratch stack; `I` is the
+ /// input being searched.
+ #[derive(Debug)]
+ pub struct Fsm<'r, I> {
+ /// The sequence of opcodes (among other things) that is actually executed.
+ ///
+ /// The program may be byte oriented or Unicode codepoint oriented.
+ prog: &'r Program,
+ /// An explicit stack used for following epsilon transitions. (This is
+ /// borrowed from the cache.)
+ stack: &'r mut Vec<FollowEpsilon>,
+ /// The input to search.
+ input: I,
+ }
+
+ /// A cached allocation that can be reused on each execution.
+ ///
+ /// `clist` and `nlist` form a pair of ordered sets for tracking NFA states.
+ #[derive(Clone, Debug)]
+ pub struct Cache {
+ /// The threads being stepped at the current input position.
+ clist: Threads,
+ /// The threads collected for the next input position; swapped with `clist`
+ /// after every step.
+ nlist: Threads,
+ /// An explicit stack used for following epsilon transitions.
+ stack: Vec<FollowEpsilon>,
+ }
+
+ /// An ordered set of NFA states and their captures.
+ #[derive(Clone, Debug)]
+ struct Threads {
+ /// An ordered set of opcodes (each opcode is an NFA state).
+ set: SparseSet,
+ /// Captures for every NFA state.
+ ///
+ /// It is stored in row-major order, where the columns are the capture
+ /// slots and the rows are the states. The row for state `pc` starts at
+ /// index `pc * slots_per_thread`.
+ caps: Vec<Slot>,
+ /// The number of capture slots stored per thread. (Every capture has
+ /// two slots.)
+ slots_per_thread: usize,
+ }
+
+ /// A representation of an explicit stack frame when following epsilon
+ /// transitions. This is used to avoid recursion.
+ #[derive(Clone, Debug)]
+ enum FollowEpsilon {
+ /// Follow transitions at the given instruction pointer.
+ IP(InstPtr),
+ /// Restore capture slot `slot` to `pos`, the value it held before a `Save`
+ /// instruction overwrote it.
+ Capture { slot: usize, pos: Slot },
+ }
+
+ impl Cache {
+     /// Builds the empty scratch space used by the NFA machine to record
+     /// execution and captures.
+     ///
+     /// The program argument is currently unused; the thread lists are sized
+     /// later, when a search actually runs.
+     pub fn new(_prog: &Program) -> Self {
+         let clist = Threads::new();
+         let nlist = Threads::new();
+         Cache { clist, nlist, stack: Vec::new() }
+     }
+ }
+
+ impl<'r, I: Input> Fsm<'r, I> {
+     /// Execute the NFA matching engine.
+     ///
+     /// Scratch space is borrowed from `cache` and resized for `prog` before
+     /// the search begins at `start`. If there's a match, `exec` returns
+     /// `true` and populates `matches` and `slots` accordingly.
+     pub fn exec(
+         prog: &'r Program,
+         cache: &ProgramCache,
+         matches: &mut [bool],
+         slots: &mut [Slot],
+         quit_after_match: bool,
+         input: I,
+         start: usize,
+         end: usize,
+     ) -> bool {
+         let mut guard = cache.borrow_mut();
+         let scratch = &mut guard.pikevm;
+         let (num_insts, num_caps) = (prog.len(), prog.captures.len());
+         scratch.clist.resize(num_insts, num_caps);
+         scratch.nlist.resize(num_insts, num_caps);
+         let at = input.at(start);
+         let mut fsm = Fsm { prog, stack: &mut scratch.stack, input };
+         fsm.exec_(
+             &mut scratch.clist,
+             &mut scratch.nlist,
+             matches,
+             slots,
+             quit_after_match,
+             at,
+             end,
+         )
+     }
+
+     /// The main simulation loop.
+     ///
+     /// `clist` holds the threads for the current position and `nlist`
+     /// collects the threads for the next one; the two are swapped after each
+     /// step. Returns whether any match instruction was reached.
+     fn exec_(
+         &mut self,
+         clist: &mut Threads,
+         nlist: &mut Threads,
+         matches: &mut [bool],
+         slots: &mut [Slot],
+         quit_after_match: bool,
+         mut at: InputAt,
+         end: usize,
+     ) -> bool {
+         let mut matched = false;
+         let mut all_matched = false;
+         clist.set.clear();
+         nlist.set.clear();
+         'search: loop {
+             if clist.set.is_empty() {
+                 // Three ways to bail out when our current set of threads is
+                 // empty.
+                 //
+                 // 1. We have a match---so we're done exploring any possible
+                 //    alternatives. Time to quit. (We can't do this if we're
+                 //    looking for matches for multiple regexes, unless we know
+                 //    they all matched.)
+                 //
+                 // 2. If the expression starts with a '^' we can terminate as
+                 //    soon as the last thread dies.
+                 if (matched && matches.len() <= 1)
+                     || all_matched
+                     || (!at.is_start() && self.prog.is_anchored_start)
+                 {
+                     break;
+                 }
+
+                 // 3. If there's a literal prefix for the program, try to
+                 //    jump ahead quickly. If it can't be found, then we can
+                 //    bail out early.
+                 if !self.prog.prefixes.is_empty() {
+                     match self.input.prefix_at(&self.prog.prefixes, at) {
+                         Some(found) => at = found,
+                         None => break,
+                     }
+                 }
+             }
+
+             // This simulates a preceding '.*?' for every regex by adding
+             // a state starting at the current position in the input for the
+             // beginning of the program only if we don't already have a match.
+             if clist.set.is_empty()
+                 || (!self.prog.is_anchored_start && !all_matched)
+             {
+                 self.add(clist, slots, 0, at);
+             }
+             // The previous call to "add" actually inspects the position just
+             // before the current character. For stepping through the machine,
+             // we want to look at the current character, so we advance the
+             // input.
+             let at_next = self.input.at(at.next_pos());
+             for i in 0..clist.set.len() {
+                 let ip = clist.set[i];
+                 let hit = self.step(
+                     nlist,
+                     matches,
+                     slots,
+                     clist.caps(ip),
+                     ip,
+                     at,
+                     at_next,
+                 );
+                 if !hit {
+                     continue;
+                 }
+                 matched = true;
+                 if !all_matched {
+                     all_matched = matches.iter().all(|&b| b);
+                 }
+                 if quit_after_match {
+                     // If we only care if a match occurs (not its
+                     // position), then we can quit right now.
+                     break 'search;
+                 }
+                 if self.prog.matches.len() == 1 {
+                     // We don't need to check the rest of the threads
+                     // in this set because we've matched something
+                     // ("leftmost-first"). However, we still need to check
+                     // threads in the next set to support things like
+                     // greedy matching.
+                     //
+                     // This is only true on normal regexes. For regex sets,
+                     // we need to mush on to observe other matches.
+                     break;
+                 }
+             }
+             if at.pos() >= end {
+                 break;
+             }
+             at = at_next;
+             mem::swap(clist, nlist);
+             nlist.set.clear();
+         }
+         matched
+     }
+
+     /// Step through the input, one token (byte or codepoint) at a time.
+     ///
+     /// `nlist` is the set of states that will be processed on the next token
+     /// in the input.
+     ///
+     /// `matches` and `slots` belong to the caller of the NFA. They are
+     /// written to only when a match state is visited.
+     ///
+     /// `thread_caps` is the set of captures for the current NFA state, `ip`.
+     ///
+     /// `at` and `at_next` are the current and next positions in the input.
+     /// `at` or `at_next` may be EOF.
+     ///
+     /// Returns `true` only when `ip` is a match instruction.
+     fn step(
+         &mut self,
+         nlist: &mut Threads,
+         matches: &mut [bool],
+         slots: &mut [Slot],
+         thread_caps: &mut [Option<usize>],
+         ip: usize,
+         at: InputAt,
+         at_next: InputAt,
+     ) -> bool {
+         use crate::prog::Inst::*;
+         // Work out where a consuming instruction continues, if it accepts
+         // the current token. Everything else returns straight away.
+         let goto = match self.prog[ip] {
+             Match(match_slot) => {
+                 if match_slot < matches.len() {
+                     matches[match_slot] = true;
+                 }
+                 for (dst, src) in slots.iter_mut().zip(thread_caps.iter()) {
+                     *dst = *src;
+                 }
+                 return true;
+             }
+             Char(ref inst) if inst.c == at.char() => inst.goto,
+             Ranges(ref inst) if inst.matches(at.char()) => inst.goto,
+             Bytes(ref inst)
+                 if at.byte().map_or(false, |b| inst.matches(b)) =>
+             {
+                 inst.goto
+             }
+             Char(_) | Ranges(_) | Bytes(_) | EmptyLook(_) | Save(_)
+             | Split(_) => return false,
+         };
+         self.add(nlist, thread_caps, goto, at_next);
+         false
+     }
+
+     /// Follows epsilon transitions and adds them for processing to nlist,
+     /// starting at and including ip.
+     fn add(
+         &mut self,
+         nlist: &mut Threads,
+         thread_caps: &mut [Option<usize>],
+         ip: usize,
+         at: InputAt,
+     ) {
+         self.stack.push(FollowEpsilon::IP(ip));
+         while let Some(frame) = self.stack.pop() {
+             match frame {
+                 FollowEpsilon::Capture { slot, pos } => {
+                     // Undo the capture write made while exploring a branch.
+                     thread_caps[slot] = pos;
+                 }
+                 FollowEpsilon::IP(next_ip) => {
+                     self.add_step(nlist, thread_caps, next_ip, at);
+                 }
+             }
+         }
+     }
+
+     /// A helper function for add that avoids excessive pushing to the stack.
+     fn add_step(
+         &mut self,
+         nlist: &mut Threads,
+         thread_caps: &mut [Option<usize>],
+         mut ip: usize,
+         at: InputAt,
+     ) {
+         // Instead of pushing and popping to the stack, we mutate ip as we
+         // traverse the set of states. We only push to the stack when we
+         // absolutely need recursion (restoring captures or following a
+         // branch).
+         use crate::prog::Inst::*;
+         // Stop as soon as we reach a state we've already added.
+         while !nlist.set.contains(ip) {
+             nlist.set.insert(ip);
+             match self.prog[ip] {
+                 EmptyLook(ref inst) => {
+                     // When the assertion fails, ip is left unchanged and
+                     // the loop condition is evaluated again for the state
+                     // that was just inserted.
+                     if self.input.is_empty_match(at, inst) {
+                         ip = inst.goto;
+                     }
+                 }
+                 Save(ref inst) => {
+                     if inst.slot < thread_caps.len() {
+                         let previous = thread_caps[inst.slot];
+                         self.stack.push(FollowEpsilon::Capture {
+                             slot: inst.slot,
+                             pos: previous,
+                         });
+                         thread_caps[inst.slot] = Some(at.pos());
+                     }
+                     ip = inst.goto;
+                 }
+                 Split(ref inst) => {
+                     self.stack.push(FollowEpsilon::IP(inst.goto2));
+                     ip = inst.goto1;
+                 }
+                 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
+                     // Record the captures for this state and stop.
+                     let row = nlist.caps(ip);
+                     for (dst, src) in row.iter_mut().zip(thread_caps.iter()) {
+                         *dst = *src;
+                     }
+                     return;
+                 }
+             }
+         }
+     }
+ }
+
+ impl Threads {
+     /// Creates an empty thread list with no capacity and no capture slots.
+     fn new() -> Self {
+         Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
+     }
+
+     /// Re-allocates the list for a program with `num_insts` instructions and
+     /// `ncaps` capture groups.
+     ///
+     /// Nothing happens when the set's capacity already equals `num_insts`.
+     fn resize(&mut self, num_insts: usize, ncaps: usize) {
+         if num_insts != self.set.capacity() {
+             self.slots_per_thread = ncaps * 2;
+             self.set = SparseSet::new(num_insts);
+             self.caps = vec![None; self.slots_per_thread * num_insts];
+         }
+     }
+
+     /// Returns the capture slots that belong to the state `pc`.
+     fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
+         let start = pc * self.slots_per_thread;
+         let end = start + self.slots_per_thread;
+         &mut self.caps[start..end]
+     }
+ }
diff --git a/third_party/rust/regex/src/pool.rs b/third_party/rust/regex/src/pool.rs
new file mode 100644
index 0000000000..6a6f15b194
--- /dev/null
+++ b/third_party/rust/regex/src/pool.rs
@@ -0,0 +1,333 @@
+// This module provides a relatively simple thread-safe pool of reusable
+// objects. For the most part, it's implemented by a stack represented by a
+// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat
+// costly, in the case where a pool is accessed by the first thread that tried
+// to get a value, we bypass the mutex. Here are some benchmarks showing the
+// difference.
+//
+// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
+// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
+// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
+// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
+//
+// (1) represents our baseline: the master branch at the time of writing when
+// using the 'thread_local' crate to implement the pool below.
+//
+// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
+// is no special trick for bypassing the mutex.
+//
+// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
+// fast because a Box<T> is much smaller than the T we use with a Pool in this
+// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
+// than for T.
+//
+// (4) is the same as (3), but with the trick for bypassing the mutex in the
+// case of the first-to-get thread.
+//
+// Why move off of thread_local? Even though (4) is a hair faster than (1)
+// above, this was not the main goal. The main goal was to move off of
+// thread_local and find a way to *simply* re-capture some of its speed for
+// regex's specific case. So again, why move off of it? The *primary* reason is
+// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
+// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
+// "use as much safe code as possible to minimize risk and be as sure as I can
+// be that it is correct.")
+//
+// My guess is that the thread_local design is probably not appropriate for
+// regex since its memory usage scales to the number of active threads that
+ // have used a regex, whereas the pool below scales to the number of threads
+// that simultaneously use a regex. While neither case permits contraction,
+// since we own the pool data structure below, we can add contraction if a
+// clear use case pops up in the wild. More pressingly though, it seems that
+// there are at least some use case patterns where one might have many threads
+// sitting around that might have used a regex at one point. While thread_local
+// does try to reuse space previously used by a thread that has since stopped,
+// its maximal memory usage still scales with the total number of active
+// threads. In contrast, the pool below scales with the total number of threads
+// *simultaneously* using the pool. The hope is that this uses less memory
+// overall. And if it doesn't, we can hopefully tune it somehow.
+//
+// It seems that these sort of conditions happen frequently
+// in FFI inside of other more "managed" languages. This was
+// mentioned in the issue linked above, and also mentioned here:
+// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
+// confirm that disabling the use of thread_local resolves the leak.
+//
+// There were other weaker reasons for moving off of thread_local as well.
+// Namely, at the time, I was looking to reduce dependencies. And for something
+// like regex, maintenance can be simpler when we own the full dependency tree.
+
+use std::panic::{RefUnwindSafe, UnwindSafe};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Mutex;
+
+ /// An atomic counter used to allocate thread IDs.
+ ///
+ /// It starts at 1 so that 0 is never handed out as an ID.
+ static COUNTER: AtomicUsize = AtomicUsize::new(1);
+
+ thread_local!(
+     /// A thread local used to assign an ID to a thread.
+     static THREAD_ID: usize = {
+         let id = COUNTER.fetch_add(1, Ordering::Relaxed);
+         // SAFETY: We cannot permit the reuse of thread IDs since reusing a
+         // thread ID might result in more than one thread "owning" a pool,
+         // and thus, permit accessing a mutable value from multiple threads
+         // simultaneously without synchronization. The intent of this panic is
+         // to be a sanity check. It is not expected that the thread ID space
+         // will actually be exhausted in practice.
+         //
+         // This checks that the counter never wraps around, since atomic
+         // addition wraps around on overflow.
+         assert!(id != 0, "regex: thread ID allocation space exhausted");
+         id
+     };
+ );
+
+ /// The boxed factory closure a pool calls to build a fresh `T` when a caller
+ /// requests a value and the pool has none available to hand out.
+ type CreateFn<T> =
+ Box<dyn Fn() -> T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
+
+ /// A simple thread safe pool for reusing values.
+ ///
+ /// Getting a value out comes with a guard. When that guard is dropped, the
+ /// value is automatically put back in the pool.
+ ///
+ /// A Pool<T> impls Sync when T is Send (even if it's not Sync). This means
+ /// that T can use interior mutability. This is possible because a pool is
+ /// guaranteed to provide a value to exactly one thread at any time.
+ ///
+ /// Currently, a pool never contracts in size. Its size is proportional to the
+ /// number of simultaneous uses.
+ pub struct Pool<T> {
+ /// A stack of T values to hand out. These are used when a Pool is
+ /// accessed by a thread other than its owner.
+ stack: Mutex<Vec<Box<T>>>,
+ /// A function to create more T values when stack is empty and a caller
+ /// has requested a T.
+ create: CreateFn<T>,
+ /// The ID of the thread that owns this pool. The owner is the thread
+ /// that makes the first call to 'get'. When the owner calls 'get', it
+ /// gets 'owner_val' directly instead of returning a T from 'stack'.
+ /// See comments elsewhere for details, but this is intended to be an
+ /// optimization for the common case that makes getting a T faster.
+ ///
+ /// It is initialized to a value of zero (an impossible thread ID) as a
+ /// sentinel to indicate that it is unowned.
+ owner: AtomicUsize,
+ /// A value to return when the caller is the thread recorded in 'owner',
+ /// i.e. the first thread that called 'get' on this Pool.
+ owner_val: T,
+ }
+
+ // SAFETY: Since we want to use a Pool from multiple threads simultaneously
+ // behind an Arc, we need for it to be Sync. In cases where T is sync, Pool<T>
+ // would be Sync. However, since we use a Pool to store mutable scratch space,
+ // we wind up using a T that has interior mutability and is thus itself not
+ // Sync. So what we *really* want is for our Pool<T> to be Sync even when T is
+ // not Sync (but is at least Send).
+ //
+ // The only non-sync aspect of a Pool is its 'owner_val' field, which is used
+ // to implement faster access to a pool value in the common case of a pool
+ // being accessed from the thread that owns it. The 'stack' field
+ // is also shared, but a Mutex<T> where T: Send is already Sync. So we only
+ // need to worry about 'owner_val'.
+ //
+ // The key is to guarantee that 'owner_val' can only ever be accessed from one
+ // thread. In our implementation below, we guarantee this by only returning the
+ // 'owner_val' when the ID of the current thread matches the ID stored in
+ // 'owner'. Since this can only ever be one thread, it follows
+ // that only one thread can access 'owner_val' at any point in time. Thus, it
+ // is safe to declare that Pool<T> is Sync when T is Send.
+ //
+ // NOTE: The owning thread is the *first* thread that tries to get a value
+ // out of a Pool, which is not necessarily the thread that created it. The
+ // 'owner' field starts out as the sentinel 0 and is claimed with an atomic
+ // compare-and-exchange by the first caller of 'get'.
+ //
+ // If there is a way to achieve our performance goals using safe code, then
+ // I would very much welcome a patch. As it stands, the implementation below
+ // tries to balance safety with performance. The case where a Regex is used
+ // from multiple threads simultaneously will suffer a bit since getting a cache
+ // will require unlocking a mutex.
+ unsafe impl<T: Send> Sync for Pool<T> {}
+
+ /// Formats the pool as a struct showing its stack, owner ID and owner value.
+ /// The `create` closure is left out of the output.
+ impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> {
+     fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
+         let mut out = f.debug_struct("Pool");
+         out.field("stack", &self.stack);
+         out.field("owner", &self.owner);
+         out.field("owner_val", &self.owner_val);
+         out.finish()
+     }
+ }
+
+ /// A guard that is returned when a caller requests a value from the pool.
+ ///
+ /// The purpose of the guard is to use RAII to automatically put the value back
+ /// in the pool once it's dropped.
+ #[derive(Debug)]
+ pub struct PoolGuard<'a, T: Send> {
+ /// The pool that this guard is attached to.
+ pool: &'a Pool<T>,
+ /// This is None when the guard represents the special "owned" value. In
+ /// which case, the value is retrieved from 'pool.owner_val'. Otherwise it
+ /// holds a value taken from (or created for) the pool's stack, which is
+ /// pushed back onto the stack when the guard is dropped.
+ value: Option<Box<T>>,
+ }
+
+ impl<T: Send> Pool<T> {
+     /// Create a new pool. The given closure is used to create values in the
+     /// pool when necessary.
+     ///
+     /// The closure is called once right away to build the value reserved for
+     /// the owning thread. The pool starts out unowned.
+     pub fn new(create: CreateFn<T>) -> Pool<T> {
+         let owner_val = create();
+         Pool {
+             stack: Mutex::new(Vec::new()),
+             create,
+             owner: AtomicUsize::new(0),
+             owner_val,
+         }
+     }
+
+     /// Get a value from the pool. The caller is guaranteed to have exclusive
+     /// access to the given value.
+     ///
+     /// Note that there is no guarantee provided about which value in the
+     /// pool is returned. That is, calling get, dropping the guard (causing
+     /// the value to go back into the pool) and then calling get again is NOT
+     /// guaranteed to return the same value received in the first get call.
+     #[cfg_attr(feature = "perf-inline", inline(always))]
+     pub fn get(&self) -> PoolGuard<'_, T> {
+         // Our fast path checks if the caller is the thread that "owns" this
+         // pool. Or stated differently, whether it is the first thread that
+         // tried to extract a value from the pool. If it is, then we can return
+         // a T to the caller without going through a mutex.
+         //
+         // SAFETY: We must guarantee that only one thread gets access to this
+         // value. Since a thread is uniquely identified by the THREAD_ID thread
+         // local, it follows that if the caller's thread ID is equal to the
+         // owner, then only one thread may receive this value.
+         let caller = THREAD_ID.with(|id| *id);
+         let owner = self.owner.load(Ordering::Relaxed);
+         if caller != owner {
+             self.get_slow(caller, owner)
+         } else {
+             self.guard_owned()
+         }
+     }
+
+     /// This is the "slow" version that goes through a mutex to pop an
+     /// allocated value off a stack to return to the caller. (Or, if the stack
+     /// is empty, a new value is created.)
+     ///
+     /// If the pool has no owner, then this will set the owner.
+     #[cold]
+     fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> {
+         // The sentinel 0 value means this pool is not yet owned. We try to
+         // atomically set the owner. If we do, then this thread becomes the
+         // owner and we can return a guard that represents the special T for
+         // the owner.
+         if owner == 0
+             && self
+                 .owner
+                 .compare_exchange(
+                     0,
+                     caller,
+                     Ordering::Relaxed,
+                     Ordering::Relaxed,
+                 )
+                 .is_ok()
+         {
+             return self.guard_owned();
+         }
+         let mut stack = self.stack.lock().unwrap();
+         let value =
+             stack.pop().unwrap_or_else(|| Box::new((self.create)()));
+         self.guard_stack(value)
+     }
+
+     /// Puts a value back into the pool. Callers don't need to call this. Once
+     /// the guard that's returned by 'get' is dropped, it is put back into the
+     /// pool automatically.
+     fn put(&self, value: Box<T>) {
+         self.stack.lock().unwrap().push(value);
+     }
+
+     /// Create a guard that represents the special owned T.
+     fn guard_owned(&self) -> PoolGuard<'_, T> {
+         PoolGuard { pool: self, value: None }
+     }
+
+     /// Create a guard that contains a value from the pool's stack.
+     fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T> {
+         PoolGuard { pool: self, value: Some(value) }
+     }
+ }
+
+ impl<'a, T: Send> PoolGuard<'a, T> {
+     /// Return the underlying value.
+     ///
+     /// This is the boxed value carried by the guard when there is one, and
+     /// the pool's special owner value otherwise.
+     pub fn value(&self) -> &T {
+         self.value.as_deref().unwrap_or(&self.pool.owner_val)
+     }
+ }
+
+ /// Hands a stack value back to the pool when the guard goes away. Guards for
+ /// the owner value carry nothing and therefore return nothing.
+ impl<'a, T: Send> Drop for PoolGuard<'a, T> {
+     #[cfg_attr(feature = "perf-inline", inline(always))]
+     fn drop(&mut self) {
+         let boxed = match self.value.take() {
+             Some(boxed) => boxed,
+             None => return,
+         };
+         self.pool.put(boxed);
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use std::panic::{RefUnwindSafe, UnwindSafe};
+
+     use super::*;
+
+     // Compile-time check that a pool of program caches has the auto traits
+     // we rely on.
+     #[test]
+     fn oibits() {
+         use crate::exec::ProgramCache;
+
+         fn has_oibits<T>()
+         where
+             T: Send + Sync + UnwindSafe + RefUnwindSafe,
+         {
+         }
+         has_oibits::<Pool<ProgramCache>>();
+     }
+
+     // Tests that Pool implements the "single owner" optimization. That is, the
+     // thread that first accesses the pool gets its own copy, while all other
+     // threads get distinct copies.
+     #[test]
+     fn thread_owner_optimization() {
+         use std::cell::RefCell;
+         use std::sync::Arc;
+
+         let pool: Arc<Pool<RefCell<Vec<char>>>> =
+             Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a']))));
+         pool.get().value().borrow_mut().push('x');
+
+         // Two other threads each push one character into whatever value the
+         // pool hands them.
+         let workers: Vec<_> = ['y', 'z']
+             .iter()
+             .map(|&ch| {
+                 let shared = pool.clone();
+                 std::thread::spawn(move || {
+                     let guard = shared.get();
+                     guard.value().borrow_mut().push(ch);
+                 })
+             })
+             .collect();
+         for worker in workers {
+             worker.join().unwrap();
+         }
+
+         // If we didn't implement the single owner optimization, then one of
+         // the threads above is likely to have mutated the [a, x] vec that
+         // we stuffed in the pool before spawning the threads. But since
+         // neither thread was first to access the pool, and because of the
+         // optimization, we should be guaranteed that neither thread mutates
+         // the special owned pool value.
+         //
+         // (Technically this is an implementation detail and not a contract of
+         // Pool's API.)
+         assert_eq!(vec!['a', 'x'], *pool.get().value().borrow());
+     }
+ }
diff --git a/third_party/rust/regex/src/prog.rs b/third_party/rust/regex/src/prog.rs
new file mode 100644
index 0000000000..c211f71d8a
--- /dev/null
+++ b/third_party/rust/regex/src/prog.rs
@@ -0,0 +1,447 @@
+use std::cmp::Ordering;
+use std::collections::HashMap;
+use std::fmt;
+use std::mem;
+use std::ops::Deref;
+use std::slice;
+use std::sync::Arc;
+
+use crate::input::Char;
+use crate::literal::LiteralSearcher;
+
+ /// `InstPtr` represents the index of an instruction in a regex program, i.e.
+ /// a position within `Program::insts`.
+ pub type InstPtr = usize;
+
+ /// Program is a sequence of instructions and various facts about those
+ /// instructions.
+ #[derive(Clone)]
+ pub struct Program {
+ /// A sequence of instructions that represents an NFA.
+ pub insts: Vec<Inst>,
+ /// Pointers to each Match instruction in the sequence.
+ ///
+ /// This is always length 1 unless this program represents a regex set.
+ pub matches: Vec<InstPtr>,
+ /// The ordered sequence of all capture groups extracted from the AST.
+ /// Unnamed groups are `None`.
+ pub captures: Vec<Option<String>>,
+ /// Pointers to all named capture groups into `captures`.
+ pub capture_name_idx: Arc<HashMap<String, usize>>,
+ /// A pointer to the start instruction. This can vary depending on how
+ /// the program was compiled. For example, programs for use with the DFA
+ /// engine have a `.*?` inserted at the beginning of unanchored regular
+ /// expressions. The actual starting point of the program is after the
+ /// `.*?`.
+ pub start: InstPtr,
+ /// A set of equivalence classes for discriminating bytes in the compiled
+ /// program.
+ pub byte_classes: Vec<u8>,
+ /// When true, this program can only match valid UTF-8.
+ pub only_utf8: bool,
+ /// When true, this program uses byte range instructions instead of Unicode
+ /// range instructions.
+ pub is_bytes: bool,
+ /// When true, the program is compiled for DFA matching. For example, this
+ /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
+ /// regexes.
+ pub is_dfa: bool,
+ /// When true, the program matches text in reverse (for use only in the
+ /// DFA).
+ pub is_reverse: bool,
+ /// Whether the regex must match from the start of the input.
+ pub is_anchored_start: bool,
+ /// Whether the regex must match at the end of the input.
+ pub is_anchored_end: bool,
+ /// Whether this program contains a Unicode word boundary instruction.
+ pub has_unicode_word_boundary: bool,
+ /// A possibly empty machine for very quickly matching prefix literals.
+ pub prefixes: LiteralSearcher,
+ /// A limit on the size of the cache that the DFA is allowed to use while
+ /// matching.
+ ///
+ /// The cache limit specifies approximately how much space we're willing to
+ /// give to the state cache. Once the state cache exceeds the size, it is
+ /// wiped and all states must be re-computed.
+ ///
+ /// Note that this value does not impact correctness. It can be set to 0
+ /// and the DFA will run just fine. (It will only ever store exactly one
+ /// state in the cache, and will likely run very slowly, but it will work.)
+ ///
+ /// Also note that this limit is *per thread of execution*. That is,
+ /// if the same regex is used to search text across multiple threads
+ /// simultaneously, then the DFA cache is not shared. Instead, copies are
+ /// made.
+ pub dfa_size_limit: usize,
+ }
+
+ impl Program {
+     /// Creates an empty instruction sequence. Fields are given default
+     /// values.
+     pub fn new() -> Self {
+         Program {
+             insts: Vec::new(),
+             matches: Vec::new(),
+             captures: Vec::new(),
+             capture_name_idx: Arc::new(HashMap::new()),
+             start: 0,
+             byte_classes: vec![0; 256],
+             only_utf8: true,
+             is_bytes: false,
+             is_dfa: false,
+             is_reverse: false,
+             is_anchored_start: false,
+             is_anchored_end: false,
+             has_unicode_word_boundary: false,
+             prefixes: LiteralSearcher::empty(),
+             dfa_size_limit: 2 * (1 << 20),
+         }
+     }
+
+     /// If pc is an index to a no-op instruction (like Save), then return the
+     /// next pc that is not a no-op instruction.
+     pub fn skip(&self, mut pc: usize) -> usize {
+         // Hop over consecutive Save instructions.
+         while let Inst::Save(ref save) = self[pc] {
+             pc = save.goto;
+         }
+         pc
+     }
+
+     /// Return true if and only if an execution engine at instruction `pc` will
+     /// always lead to a match.
+     pub fn leads_to_match(&self, pc: usize) -> bool {
+         // If we have a regex set, then we have more than one ending state,
+         // so leading to one of those states is generally meaningless.
+         if self.matches.len() > 1 {
+             return false;
+         }
+         if let Inst::Match(_) = self[self.skip(pc)] {
+             true
+         } else {
+             false
+         }
+     }
+
+     /// Returns true if the current configuration demands that an implicit
+     /// `.*?` be prepended to the instruction sequence.
+     pub fn needs_dotstar(&self) -> bool {
+         self.is_dfa && !(self.is_reverse || self.is_anchored_start)
+     }
+
+     /// Returns true if this program uses Byte instructions instead of
+     /// Char/Range instructions.
+     pub fn uses_bytes(&self) -> bool {
+         self.is_dfa || self.is_bytes
+     }
+
+     /// Returns true if this program exclusively matches valid UTF-8 bytes.
+     ///
+     /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
+     pub fn only_utf8(&self) -> bool {
+         self.only_utf8
+     }
+
+     /// Return the approximate heap usage of this instruction sequence in
+     /// bytes.
+     pub fn approximate_size(&self) -> usize {
+         // The only instruction that uses heap space is Ranges (for
+         // Unicode codepoint programs) to store non-overlapping codepoint
+         // ranges. To keep this operation constant time, we ignore them.
+         let insts = self.len() * mem::size_of::<Inst>();
+         let matches = self.matches.len() * mem::size_of::<InstPtr>();
+         let captures =
+             self.captures.len() * mem::size_of::<Option<String>>();
+         let names = self.capture_name_idx.len()
+             * (mem::size_of::<String>() + mem::size_of::<usize>());
+         let classes = self.byte_classes.len() * mem::size_of::<u8>();
+         insts
+             + matches
+             + captures
+             + names
+             + classes
+             + self.prefixes.approximate_size()
+     }
+ }
+
+ /// A program dereferences to its instruction slice, so it can be indexed and
+ /// iterated like `[Inst]`.
+ impl Deref for Program {
+     type Target = [Inst];
+
+     #[cfg_attr(feature = "perf-inline", inline(always))]
+     fn deref(&self) -> &Self::Target {
+         self.insts.as_slice()
+     }
+ }
+
+ /// Prints one line per instruction: a zero-padded program counter followed by
+ /// a description of the instruction. The start instruction is marked with
+ /// ` (start)`.
+ impl fmt::Debug for Program {
+     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+         use self::Inst::*;
+
+         // Appends the jump target unless the instruction simply falls
+         // through to the one after it.
+         fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
+             if goto != cur + 1 {
+                 format!("{} (goto: {})", fmtd, goto)
+             } else {
+                 fmtd
+             }
+         }
+
+         // Renders a byte using its ASCII escape sequence.
+         fn visible_byte(b: u8) -> String {
+             let escaped: Vec<u8> = std::ascii::escape_default(b).collect();
+             String::from_utf8_lossy(&escaped).into_owned()
+         }
+
+         for (pc, inst) in self.iter().enumerate() {
+             // Everything that follows the program counter on this line.
+             let text = match *inst {
+                 Match(slot) => format!("Match({:?})", slot),
+                 Save(ref save) => with_goto(
+                     pc,
+                     save.goto,
+                     format!("Save({})", save.slot),
+                 ),
+                 Split(ref split) => {
+                     format!("Split({}, {})", split.goto1, split.goto2)
+                 }
+                 EmptyLook(ref look) => {
+                     with_goto(pc, look.goto, format!("{:?}", look.look))
+                 }
+                 Char(ref ch) => {
+                     with_goto(pc, ch.goto, format!("{:?}", ch.c))
+                 }
+                 Ranges(ref class) => {
+                     let ranges = class
+                         .ranges
+                         .iter()
+                         .map(|r| format!("{:?}-{:?}", r.0, r.1))
+                         .collect::<Vec<String>>()
+                         .join(", ");
+                     with_goto(pc, class.goto, ranges)
+                 }
+                 Bytes(ref bytes) => {
+                     let desc = format!(
+                         "Bytes({}, {})",
+                         visible_byte(bytes.start),
+                         visible_byte(bytes.end)
+                     );
+                     with_goto(pc, bytes.goto, desc)
+                 }
+             };
+             write!(f, "{:04} {}", pc, text)?;
+             if pc == self.start {
+                 write!(f, " (start)")?;
+             }
+             writeln!(f)?;
+         }
+         Ok(())
+     }
+ }
+
+ /// Iterating over a borrowed program yields its instructions in order.
+ impl<'a> IntoIterator for &'a Program {
+     type Item = &'a Inst;
+     type IntoIter = slice::Iter<'a, Inst>;
+
+     fn into_iter(self) -> Self::IntoIter {
+         self.insts.iter()
+     }
+ }
+
+/// Inst is an instruction code in a Regex program.
+///
+/// Regrettably, a regex program either contains Unicode codepoint
+/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
+/// A regex program can never contain both.
+///
+/// It would be worth investigating splitting this into two distinct types and
+/// then figuring out how to make the matching engines polymorphic over those
+/// types without sacrificing performance.
+///
+/// Other than the benefit of moving invariants into the type system, another
+/// benefit is the decreased size. If we remove the `Char` and `Ranges`
+/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
+/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
+/// variant.) Given that byte based machines are typically much bigger than
+/// their Unicode analogues (because they can decode UTF-8 directly), this ends
+/// up being a pretty significant savings.
+#[derive(Clone, Debug)]
+pub enum Inst {
+ /// Match indicates that the program has reached a match state.
+ ///
+ /// The number in the match corresponds to the Nth logical regular
+ /// expression in this program. This index is always 0 for normal regex
+ /// programs. Values greater than 0 appear when compiling regex sets, and
+ /// each match instruction gets its own unique value. The value corresponds
+ /// to the Nth regex in the set.
+ Match(usize),
+ /// Save causes the program to save the current location of the input in
+ /// the slot indicated by InstSave.
+ Save(InstSave),
+ /// Split causes the program to diverge to one of two paths in the
+ /// program, preferring goto1 over goto2 in InstSplit.
+ Split(InstSplit),
+ /// EmptyLook represents a zero-width assertion in a regex program. A
+ /// zero-width assertion does not consume any of the input text.
+ EmptyLook(InstEmptyLook),
+ /// Char requires the regex program to match the character in InstChar at
+ /// the current position in the input.
+ Char(InstChar),
+ /// Ranges requires the regex program to match the character at the current
+ /// position in the input with one of the ranges specified in InstRanges.
+ Ranges(InstRanges),
+ /// Bytes is like Ranges, except it expresses a single inclusive byte
+ /// range. It is used in conjunction with Split instructions to implement
+ /// multi-byte character classes.
+ Bytes(InstBytes),
+}
+
+impl Inst {
+    /// Reports whether this instruction is a `Match`; every other variant
+    /// yields `false`.
+    pub fn is_match(&self) -> bool {
+        if let Inst::Match(_) = *self {
+            true
+        } else {
+            false
+        }
+    }
+}
+
+/// Representation of the `Save` instruction (records the input position in a slot).
+#[derive(Clone, Debug)]
+pub struct InstSave {
+ /// The next location to execute in the program.
+ pub goto: InstPtr,
+ /// The capture slot (there are two slots for every capture in a regex,
+ /// including the zeroth capture for the entire match).
+ pub slot: usize,
+}
+
+/// Representation of the `Split` instruction (a prioritized two-way branch).
+#[derive(Clone, Debug)]
+pub struct InstSplit {
+ /// The first instruction to try. A match resulting from following goto1
+ /// has precedence over a match resulting from following goto2.
+ pub goto1: InstPtr,
+ /// The second instruction to try. This is the lower-priority branch: a
+ /// match via goto1 has precedence over a match via goto2.
+ pub goto2: InstPtr,
+}
+
+/// Representation of the `EmptyLook` instruction.
+#[derive(Clone, Debug)]
+pub struct InstEmptyLook {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The type of zero-width assertion to check (one of the `EmptyLook` variants).
+ pub look: EmptyLook,
+}
+
+/// The set of zero-width assertions that an `EmptyLook` instruction can check.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum EmptyLook {
+ /// Start of line or input.
+ StartLine,
+ /// End of line or input.
+ EndLine,
+ /// Start of input.
+ StartText,
+ /// End of input.
+ EndText,
+ /// Word character on one side and non-word character on the other.
+ WordBoundary,
+ /// Word character on both sides or non-word character on both sides.
+ NotWordBoundary,
+ /// ASCII word boundary.
+ WordBoundaryAscii,
+ /// Not ASCII word boundary.
+ NotWordBoundaryAscii,
+}
+
+/// Representation of the `Char` instruction.
+#[derive(Clone, Debug)]
+pub struct InstChar {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The character that must appear at the current position in the input.
+ pub c: char,
+}
+
+/// Representation of the `Ranges` instruction.
+#[derive(Clone, Debug)]
+pub struct InstRanges {
+    /// Where execution continues once this instruction succeeds.
+    pub goto: InstPtr,
+    /// The Unicode scalar value ranges to test; both ends are inclusive.
+    pub ranges: Box<[(char, char)]>,
+}
+
+impl InstRanges {
+    /// Reports whether the input character `c` falls inside any range.
+    pub fn matches(&self, c: Char) -> bool {
+        // Fast path: probe the first few ranges linearly before paying for
+        // a binary search. This helps the `match_class_unicode` benchmark,
+        // e.g. a Unicode class applied to predominantly ASCII text.
+        for &(lo, hi) in self.ranges.iter().take(4) {
+            if c < lo {
+                return false;
+            }
+            if c <= hi {
+                return true;
+            }
+        }
+        // Slow path: locate a range containing `c` by binary search.
+        let found = self.ranges.binary_search_by(|&(lo, hi)| {
+            if hi < c {
+                return Ordering::Less;
+            }
+            if lo > c {
+                return Ordering::Greater;
+            }
+            Ordering::Equal
+        });
+        found.is_ok()
+    }
+
+    /// Counts the distinct characters covered by all of the ranges.
+    pub fn num_chars(&self) -> usize {
+        let mut total: u32 = 0;
+        for &(start, end) in self.ranges.iter() {
+            // Both ends are inclusive, hence the `1 +`.
+            total += 1 + (end as u32) - (start as u32);
+        }
+        total as usize
+    }
+}
+
+/// Representation of the `Bytes` instruction.
+#[derive(Clone, Debug)]
+pub struct InstBytes {
+    /// Where execution continues once this instruction succeeds.
+    pub goto: InstPtr,
+    /// First byte of the range (inclusive).
+    pub start: u8,
+    /// Last byte of the range (inclusive).
+    pub end: u8,
+}
+
+impl InstBytes {
+    /// Reports whether `byte` lies within `start..=end`.
+    pub fn matches(&self, byte: u8) -> bool {
+        (self.start..=self.end).contains(&byte)
+    }
+}
+
+#[cfg(test)]
+mod test {
+ #[test]
+ #[cfg(target_pointer_width = "64")] // the size asserted below is only checked on 64-bit targets
+ fn test_size_of_inst() {
+ use std::mem::size_of;
+
+ use super::Inst;
+
+ assert_eq!(32, size_of::<Inst>()); // catches accidental growth of `Inst`
+ }
+}
diff --git a/third_party/rust/regex/src/re_builder.rs b/third_party/rust/regex/src/re_builder.rs
new file mode 100644
index 0000000000..ee6383690d
--- /dev/null
+++ b/third_party/rust/regex/src/re_builder.rs
@@ -0,0 +1,421 @@
+/// The set of user configurable options for compiling zero or more regexes.
+#[derive(Clone, Debug)]
+#[allow(missing_docs)]
+pub struct RegexOptions {
+ pub pats: Vec<String>, // patterns to compile; the builders push one entry per pattern
+ pub size_limit: usize, // approximate size limit of a compiled program
+ pub dfa_size_limit: usize, // approximate size of the cache used by the DFA
+ pub nest_limit: u32, // nesting limit handed to the parser
+ pub case_insensitive: bool, // the `i` flag
+ pub multi_line: bool, // the `m` flag
+ pub dot_matches_new_line: bool, // the `s` flag
+ pub swap_greed: bool, // the `U` flag
+ pub ignore_whitespace: bool, // the `x` flag
+ pub unicode: bool, // the `u` flag
+ pub octal: bool, // whether octal syntax is supported
+}
+
+impl Default for RegexOptions {
+    /// Produces the baseline options: no patterns, every flag off except
+    /// Unicode, and the stock size and nesting limits.
+    fn default() -> Self {
+        // Unit used for the two size limits below.
+        const MIB: usize = 1 << 20;
+
+        Self {
+            pats: Vec::new(),
+            size_limit: 10 * MIB,
+            dfa_size_limit: 2 * MIB,
+            nest_limit: 250,
+            case_insensitive: false,
+            multi_line: false,
+            dot_matches_new_line: false,
+            swap_greed: false,
+            ignore_whitespace: false,
+            unicode: true,
+            octal: false,
+        }
+    }
+}
+
+macro_rules! define_builder {
+ ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
+ pub mod $name {
+ use super::RegexOptions;
+ use crate::error::Error;
+ use crate::exec::ExecBuilder;
+
+ use crate::$regex_mod::Regex;
+
+ /// A configurable builder for a regular expression.
+ ///
+ /// A builder can be used to configure how the regex is built, for example, by
+ /// setting the default flags (which can be overridden in the expression
+ /// itself) or setting various limits.
+ #[derive(Debug)]
+ pub struct RegexBuilder(RegexOptions); // newtype over the shared option set
+
+ impl RegexBuilder {
+ /// Create a new regular expression builder with the given pattern.
+ ///
+ /// If the pattern is invalid, then an error will be returned when
+ /// `build` is called.
+ pub fn new(pattern: &str) -> RegexBuilder {
+ let mut builder = RegexBuilder(RegexOptions::default());
+ builder.0.pats.push(pattern.to_owned());
+ builder
+ }
+
+ /// Compile the regular expression from a copy of this builder's options.
+ ///
+ /// Note that calling `as_str` on the resulting `Regex` will produce the
+ /// pattern given to `new` verbatim. Notably, it will not incorporate any
+ /// of the flags set on this builder.
+ pub fn build(&self) -> Result<Regex, Error> {
+ ExecBuilder::new_options(self.0.clone()) // clone: the builder is only borrowed
+ .only_utf8($only_utf8)
+ .build()
+ .map(Regex::from)
+ }
+
+ /// Set the value for the case insensitive (`i`) flag.
+ ///
+ /// When enabled, letters in the pattern will match both upper case and
+ /// lower case variants.
+ pub fn case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.0.case_insensitive = yes;
+ self
+ }
+
+ /// Set the value for the multi-line matching (`m`) flag.
+ ///
+ /// When enabled, `^` matches the beginning of lines and `$` matches the
+ /// end of lines.
+ ///
+ /// By default, they match beginning/end of the input.
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.multi_line = yes;
+ self
+ }
+
+ /// Set the value for the any character (`s`) flag, wherein `.` matches
+ /// anything when `s` is set and matches anything except for new line when
+ /// it is not set (the default).
+ ///
+ /// N.B. "matches anything" means "any byte" when Unicode is disabled and
+ /// means "any valid UTF-8 encoding of any Unicode scalar value" when
+ /// Unicode is enabled.
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.0.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Set the value for the greedy swap (`U`) flag.
+ ///
+ /// When enabled, a pattern like `a*` is lazy (tries to find shortest
+ /// match) and `a*?` is greedy (tries to find longest match).
+ ///
+ /// By default, `a*` is greedy and `a*?` is lazy.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.swap_greed = yes;
+ self
+ }
+
+ /// Set the value for the ignore whitespace (`x`) flag.
+ ///
+ /// When enabled, whitespace such as new lines and spaces will be ignored
+ /// between expressions of the pattern, and `#` can be used to start a
+ /// comment until the next new line.
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.0.ignore_whitespace = yes;
+ self
+ }
+
+ /// Set the value for the Unicode (`u`) flag.
+ ///
+ /// Enabled by default. When disabled, character classes such as `\w` only
+ /// match ASCII word characters instead of all Unicode word characters.
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.unicode = yes;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\0` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.octal = yes;
+ self
+ }
+
+ /// Set the approximate size limit of the compiled regular expression.
+ ///
+ /// This roughly corresponds to the number of bytes occupied by a single
+ /// compiled program. If the program exceeds this number, then a
+ /// compilation error is returned.
+ pub fn size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexBuilder {
+ self.0.size_limit = limit;
+ self
+ }
+
+ /// Set the approximate size of the cache used by the DFA.
+ ///
+ /// This roughly corresponds to the number of bytes that the DFA will
+ /// use while searching.
+ ///
+ /// Note that this is a *per thread* limit. There is no way to set a global
+ /// limit. In particular, if a regex is used from multiple threads
+ /// simultaneously, then each thread may use up to the number of bytes
+ /// specified here.
+ pub fn dfa_size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexBuilder {
+ self.0.dfa_size_limit = limit;
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an `Ast` using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire Ast is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since this parser
+ /// implementation will limit itself to heap space proportional to the
+ /// length of the pattern string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation, which results in a nest
+ /// depth of `1`. In general, a nest limit is not something that manifests
+ /// in an obvious way in the concrete syntax, therefore, it should not be
+ /// used in a granular way.
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+ self.0.nest_limit = limit;
+ self
+ }
+ }
+ }
+ };
+}
+
+define_builder!(bytes, re_bytes, false); // builder for crate::re_bytes::Regex; only_utf8 = false
+define_builder!(unicode, re_unicode, true); // builder for crate::re_unicode::Regex; only_utf8 = true
+
+macro_rules! define_set_builder {
+ ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
+ pub mod $name {
+ use super::RegexOptions;
+ use crate::error::Error;
+ use crate::exec::ExecBuilder;
+
+ use crate::re_set::$regex_mod::RegexSet;
+
+ /// A configurable builder for a set of regular expressions.
+ ///
+ /// A builder can be used to configure how the regexes are built, for example,
+ /// by setting the default flags (which can be overridden in the expression
+ /// itself) or setting various limits.
+ #[derive(Debug)]
+ pub struct RegexSetBuilder(RegexOptions); // newtype over the shared option set
+
+ impl RegexSetBuilder {
+ /// Create a new regular expression set builder with the given patterns.
+ ///
+ /// If any pattern is invalid, then an error will be returned when
+ /// `build` is called.
+ pub fn new<I, S>(patterns: I) -> RegexSetBuilder
+ where
+ S: AsRef<str>,
+ I: IntoIterator<Item = S>,
+ {
+ let mut builder = RegexSetBuilder(RegexOptions::default());
+ for pat in patterns {
+ builder.0.pats.push(pat.as_ref().to_owned());
+ }
+ builder
+ }
+
+ /// Compile the regular expressions into a set from a copy of this builder's options.
+ pub fn build(&self) -> Result<RegexSet, Error> {
+ ExecBuilder::new_options(self.0.clone()) // clone: the builder is only borrowed
+ .only_utf8($only_utf8)
+ .build()
+ .map(RegexSet::from)
+ }
+
+ /// Set the value for the case insensitive (`i`) flag.
+ pub fn case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.case_insensitive = yes;
+ self
+ }
+
+ /// Set the value for the multi-line matching (`m`) flag.
+ pub fn multi_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.multi_line = yes;
+ self
+ }
+
+ /// Set the value for the any character (`s`) flag, wherein `.` matches
+ /// anything when `s` is set and matches anything except for new line when
+ /// it is not set (the default).
+ ///
+ /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
+ /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
+ /// expressions.
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Set the value for the greedy swap (`U`) flag.
+ pub fn swap_greed(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.swap_greed = yes;
+ self
+ }
+
+ /// Set the value for the ignore whitespace (`x`) flag.
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.ignore_whitespace = yes;
+ self
+ }
+
+ /// Set the value for the Unicode (`u`) flag.
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.0.unicode = yes;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\0` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.0.octal = yes;
+ self
+ }
+
+ /// Set the approximate size limit of the compiled regular expression.
+ ///
+ /// This roughly corresponds to the number of bytes occupied by a single
+ /// compiled program. If the program exceeds this number, then a
+ /// compilation error is returned.
+ pub fn size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexSetBuilder {
+ self.0.size_limit = limit;
+ self
+ }
+
+ /// Set the approximate size of the cache used by the DFA.
+ ///
+ /// This roughly corresponds to the number of bytes that the DFA will
+ /// use while searching.
+ ///
+ /// Note that this is a *per thread* limit. There is no way to set a global
+ /// limit. In particular, if a regex is used from multiple threads
+ /// simultaneously, then each thread may use up to the number of bytes
+ /// specified here.
+ pub fn dfa_size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexSetBuilder {
+ self.0.dfa_size_limit = limit;
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an `Ast` using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire Ast is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since this parser
+ /// implementation will limit itself to heap space proportional to the
+ /// length of the pattern string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation, which results in a nest
+ /// depth of `1`. In general, a nest limit is not something that manifests
+ /// in an obvious way in the concrete syntax, therefore, it should not be
+ /// used in a granular way.
+ pub fn nest_limit(
+ &mut self,
+ limit: u32,
+ ) -> &mut RegexSetBuilder {
+ self.0.nest_limit = limit;
+ self
+ }
+ }
+ }
+ };
+}
+
+define_set_builder!(set_bytes, bytes, false); // builder for crate::re_set::bytes::RegexSet; only_utf8 = false
+define_set_builder!(set_unicode, unicode, true); // builder for crate::re_set::unicode::RegexSet; only_utf8 = true
diff --git a/third_party/rust/regex/src/re_bytes.rs b/third_party/rust/regex/src/re_bytes.rs
new file mode 100644
index 0000000000..07e9f98acc
--- /dev/null
+++ b/third_party/rust/regex/src/re_bytes.rs
@@ -0,0 +1,1260 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::FusedIterator;
+use std::ops::{Index, Range};
+use std::str::FromStr;
+use std::sync::Arc;
+
+use crate::find_byte::find_byte;
+
+use crate::error::Error;
+use crate::exec::{Exec, ExecNoSync};
+use crate::expand::expand_bytes;
+use crate::re_builder::bytes::RegexBuilder;
+use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
+
+/// A single match of a regex within a haystack.
+///
+/// The lifetime parameter `'t` is the lifetime of the matched text.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub struct Match<'t> {
+    text: &'t [u8],
+    start: usize,
+    end: usize,
+}
+
+impl<'t> Match<'t> {
+    /// Byte offset in the haystack at which the match begins.
+    #[inline]
+    pub fn start(&self) -> usize {
+        let Match { start, .. } = *self;
+        start
+    }
+
+    /// Byte offset in the haystack at which the match ends.
+    #[inline]
+    pub fn end(&self) -> usize {
+        let Match { end, .. } = *self;
+        end
+    }
+
+    /// The `start..end` byte-offset range of the match in the haystack.
+    #[inline]
+    pub fn range(&self) -> Range<usize> {
+        Range { start: self.start, end: self.end }
+    }
+
+    /// The slice of the haystack covered by the match.
+    #[inline]
+    pub fn as_bytes(&self) -> &'t [u8] {
+        let Range { start, end } = self.range();
+        &self.text[start..end]
+    }
+
+    /// Builds a match over `haystack` from its start and end byte offsets.
+    #[inline]
+    fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
+        let text = haystack;
+        Match { start, end, text }
+    }
+}
+
+impl<'t> From<Match<'t>> for Range<usize> {
+    /// Converts a match into its byte-offset range within the haystack.
+    fn from(m: Match<'t>) -> Range<usize> {
+        let (start, end) = (m.start(), m.end());
+        start..end
+    }
+}
+
+/// A compiled regular expression for matching arbitrary bytes.
+///
+/// It can be used to search, split or replace text. All searching is done with
+/// an implicit `.*?` at the beginning and end of an expression. To force an
+/// expression to match the whole string (or a prefix or a suffix), you must
+/// use an anchor like `^` or `$` (or `\A` and `\z`).
+///
+/// Like the `Regex` type in the parent module, matches with this regex return
+/// byte offsets into the search text. **Unlike** the parent `Regex` type,
+/// these byte offsets may not correspond to UTF-8 sequence boundaries since
+/// the regexes in this module can match arbitrary bytes.
+#[derive(Clone)]
+pub struct Regex(Exec); // newtype over the crate-internal `Exec`
+
+impl fmt::Display for Regex {
+    /// Writes the original pattern string, exactly as returned by `as_str`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let pattern = self.as_str();
+        f.write_str(pattern)
+    }
+}
+
+impl fmt::Debug for Regex {
+    /// Debug output is the same as `Display`: the original pattern string.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self)
+    }
+}
+
+/// Builds a `Regex` directly from an `Exec`.
+///
+/// Hidden from the docs because `Exec` is not part of the public API.
+#[doc(hidden)]
+impl From<Exec> for Regex {
+    fn from(exec: Exec) -> Regex {
+        // `Regex` is a plain newtype, so wrapping is all that is needed.
+        Self(exec)
+    }
+}
+
+impl FromStr for Regex {
+    type Err = Error;
+
+    /// Attempts to compile `s` as a regular expression, so that
+    /// `"pattern".parse::<Regex>()` works.
+    fn from_str(s: &str) -> Result<Regex, Error> {
+        // Same steps as `Regex::new`: a default builder, then `build`.
+        let builder = RegexBuilder::new(s);
+        builder.build()
+    }
+}
+
+/// Core regular expression methods.
+impl Regex {
+ /// Compiles a regular expression. Once compiled, it can be used repeatedly
+ /// to search, split or replace text in a string.
+ ///
+ /// If an invalid expression is given, then an error is returned.
+ pub fn new(re: &str) -> Result<Regex, Error> {
+     // A plain `new` is a builder with every option left at its default.
+     let builder = RegexBuilder::new(re);
+     builder.build()
+ }
+
+ /// Returns true if and only if there is a match for the regex in the
+ /// string given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// a match, since the underlying matching engine may be able to do less
+ /// work.
+ ///
+ /// # Example
+ ///
+ /// Test if some text contains at least one word with exactly 13 ASCII word
+ /// bytes:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"I categorically deny having triskaidekaphobia.";
+ /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
+ /// # }
+ /// ```
+ pub fn is_match(&self, text: &[u8]) -> bool {
+     // Delegates to `is_match_at` with 0 as its second argument
+     // (presumably the offset at which the search starts).
+     let start = 0;
+     self.is_match_at(text, start)
+ }
+
+ /// Returns the start and end byte range of the leftmost-first match in
+ /// `text`. If no match exists, then `None` is returned.
+ ///
+ /// Note that this should only be used if you want to discover the position
+ /// of the match. Testing the existence of a match is faster if you use
+ /// `is_match`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of the first word with exactly 13
+ /// ASCII word bytes:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"I categorically deny having triskaidekaphobia.";
+ /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
+ /// assert_eq!((mat.start(), mat.end()), (2, 15));
+ /// # }
+ /// ```
+ pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> {
+     // Delegates to `find_at` with 0 as its second argument
+     // (presumably the offset at which the search starts).
+     let start = 0;
+     self.find_at(text, start)
+ }
+
+ /// Returns an iterator for each successive non-overlapping match in
+ /// `text`, returning the start and end byte indices with respect to
+ /// `text`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of every word with exactly 13 ASCII
+ /// word bytes:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"Retroactively relinquishing remunerations is reprehensible.";
+ /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
+ /// println!("{:?}", mat);
+ /// }
+ /// # }
+ /// ```
+ pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> {
+     // Wrap the searcher's match iterator in the public `Matches` type.
+     let searcher = self.0.searcher();
+     let inner = searcher.find_iter(text);
+     Matches(inner)
+ }
+
+ /// Returns the capture groups corresponding to the leftmost-first
+ /// match in `text`. Capture group `0` always corresponds to the entire
+ /// match. If no match is found, then `None` is returned.
+ ///
+ /// You should only use `captures` if you need access to the location of
+ /// capturing group matches. Otherwise, `find` is faster for discovering
+ /// the location of the overall match.
+ ///
+ /// # Examples
+ ///
+ /// Say you have some text with movie names and their release years,
+ /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
+ /// looking like that, while also extracting the movie name and its release
+ /// year separately.
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]);
+ /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]);
+ /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]);
+ /// // You can also access the groups by index using the Index notation.
+ /// // Note that this will panic on an invalid index.
+ /// assert_eq!(&caps[1], b"Citizen Kane");
+ /// assert_eq!(&caps[2], b"1941");
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ /// # }
+ /// ```
+ ///
+ /// Note that the full match is at capture group `0`. Each subsequent
+ /// capture group is indexed by the order of its opening `(`.
+ ///
+ /// We can make this example a bit clearer by using *named* capture groups:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane");
+ /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941");
+ /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]);
+ /// // You can also access the groups by name using the Index notation.
+ /// // Note that this will panic on an invalid group name.
+ /// assert_eq!(&caps["title"], b"Citizen Kane");
+ /// assert_eq!(&caps["year"], b"1941");
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ ///
+ /// # }
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
+     let mut locs = self.capture_locations();
+     // `None` from `captures_read_at` is returned as-is; on success only
+     // the filled-in `locs` are needed, so the returned value is dropped.
+     self.captures_read_at(&mut locs, text, 0)?;
+     let named_groups = self.0.capture_name_idx().clone();
+     Some(Captures { text, locs: locs.0, named_groups })
+ }
+
+ /// Returns an iterator over all the non-overlapping capture groups matched
+ /// in `text`. This is operationally the same as `find_iter`, except it
+ /// yields information about capturing group matches.
+ ///
+ /// # Example
+ ///
+ /// We can use this to find all movie titles and their release years in
+ /// some text, where the movie is formatted like "'Title' (xxxx)":
+ ///
+ /// ```rust
+ /// # use std::str; use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// for caps in re.captures_iter(text) {
+ /// let title = str::from_utf8(&caps["title"]).unwrap();
+ /// let year = str::from_utf8(&caps["year"]).unwrap();
+ /// println!("Movie: {:?}, Released: {:?}", title, year);
+ /// }
+ /// // Output:
+ /// // Movie: Citizen Kane, Released: 1941
+ /// // Movie: The Wizard of Oz, Released: 1939
+ /// // Movie: M, Released: 1931
+ /// # }
+ /// ```
+ pub fn captures_iter<'r, 't>(
+     &'r self,
+     text: &'t [u8],
+ ) -> CaptureMatches<'r, 't> {
+     // Wrap the searcher's capture iterator in the public `CaptureMatches` type.
+     let searcher = self.0.searcher();
+     let inner = searcher.captures_iter(text);
+     CaptureMatches(inner)
+ }
+
+ /// Returns an iterator of substrings of `text` delimited by a match of the
+ /// regular expression. Namely, each element of the iterator corresponds to
+ /// text that *isn't* matched by the regular expression.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"[ \t]+").unwrap();
+ /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect();
+ /// assert_eq!(fields, vec![
+ /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
+ /// ]);
+ /// # }
+ /// ```
+ pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> {
+     // The splitter is driven by the match iterator; `last` starts at 0.
+     let finder = self.find_iter(text);
+     Split { last: 0, finder }
+ }
+
+ /// Returns an iterator of at most `limit` substrings of `text` delimited
+ /// by a match of the regular expression. (A `limit` of `0` will return no
+ /// substrings.) Namely, each element of the iterator corresponds to text
+ /// that *isn't* matched by the regular expression. The remainder of the
+ /// string that is not split will be the last element in the iterator.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some text:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect();
+ /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
+ /// # }
+ /// ```
+ pub fn splitn<'r, 't>(
+     &'r self,
+     text: &'t [u8],
+     limit: usize,
+ ) -> SplitN<'r, 't> {
+     // A bounded split is an ordinary split plus the caller's `limit`.
+     let splits = self.split(text);
+     SplitN { n: limit, splits }
+ }
+
+ /// Replaces the leftmost-first match with the replacement provided. The
+ /// replacement can be a regular byte string (where `$N` and `$name` are
+ /// expanded to match capture groups) or a function that takes the matches'
+ /// `Captures` and returns the replaced byte string.
+ ///
+ /// If no match is found, then a copy of the byte string is returned
+ /// unchanged.
+ ///
+ /// # Replacement string syntax
+ ///
+ /// All instances of `$name` in the replacement text are replaced with the
+ /// corresponding capture group `name`.
+ ///
+ /// `name` may be an integer corresponding to the index of the
+ /// capture group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// If `name` isn't a valid capture group (whether the name doesn't exist
+ /// or isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name is used. e.g., `$1a` looks up the capture
+ /// group named `1a` and not the capture group at index `1`. To exert more
+ /// precise control over the name, use braces, e.g., `${1}a`.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Examples
+ ///
+ /// Note that this function is polymorphic with respect to the replacement.
+ /// In typical usage, this can just be a normal byte string:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new("[^01]+").unwrap();
+ /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]);
+ /// # }
+ /// ```
+ ///
+ /// But anything satisfying the `Replacer` trait will work. For example, a
+ /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the
+ /// captures corresponding to a match. This allows one to access capturing
+ /// group matches easily:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # use regex::bytes::Captures; fn main() {
+ /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
+ /// let mut replacement = caps[2].to_owned();
+ /// replacement.push(b' ');
+ /// replacement.extend(&caps[1]);
+ /// replacement
+ /// });
+ /// assert_eq!(result, &b"Bruce Springsteen"[..]);
+ /// # }
+ /// ```
+ ///
+ /// But this is a bit cumbersome to use all the time. Instead, a simple
+ /// syntax is supported that expands `$name` into the corresponding capture
+ /// group. Here's the last example, but using this expansion technique
+ /// with named capture groups:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]);
+ /// assert_eq!(result, &b"Bruce Springsteen"[..]);
+ /// # }
+ /// ```
+ ///
+ /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
+ /// would produce the same result. To write a literal `$` use `$$`.
+ ///
+ /// Sometimes the replacement string requires use of curly braces to
+ /// delineate a capture group replacement and surrounding literal text.
+ /// For example, if we wanted to join two words together with an
+ /// underscore:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
+ /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]);
+ /// assert_eq!(result, &b"deep_fried"[..]);
+ /// # }
+ /// ```
+ ///
+ /// Without the curly braces, the capture group name `first_` would be
+ /// used, and since it doesn't exist, it would be replaced with the empty
+ /// string.
+ ///
+ /// Finally, sometimes you just want to replace a literal string with no
+ /// regard for capturing group expansion. This can be done by wrapping a
+ /// byte string with `NoExpand`:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// use regex::bytes::NoExpand;
+ ///
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
+ /// assert_eq!(result, &b"$2 $last"[..]);
+ /// # }
+ /// ```
+ pub fn replace<'t, R: Replacer>(
+ &self,
+ text: &'t [u8],
+ rep: R,
+ ) -> Cow<'t, [u8]> {
+ self.replacen(text, 1, rep)
+ }
+
+ /// Replaces all non-overlapping matches in `text` with the replacement
+ /// provided. This is the same as calling `replacen` with `limit` set to
+ /// `0`.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement text.
+ pub fn replace_all<'t, R: Replacer>(
+ &self,
+ text: &'t [u8],
+ rep: R,
+ ) -> Cow<'t, [u8]> {
+ self.replacen(text, 0, rep)
+ }
+
+ /// Replaces at most `limit` non-overlapping matches in `text` with the
+ /// replacement provided. If `limit` is 0, then all non-overlapping matches
+ /// are replaced.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement text.
+ pub fn replacen<'t, R: Replacer>(
+ &self,
+ text: &'t [u8],
+ limit: usize,
+ mut rep: R,
+ ) -> Cow<'t, [u8]> {
+ if let Some(rep) = rep.no_expansion() {
+ let mut it = self.find_iter(text).enumerate().peekable();
+ if it.peek().is_none() {
+ return Cow::Borrowed(text);
+ }
+ let mut new = Vec::with_capacity(text.len());
+ let mut last_match = 0;
+ for (i, m) in it {
+ new.extend_from_slice(&text[last_match..m.start()]);
+ new.extend_from_slice(&rep);
+ last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
+ }
+ new.extend_from_slice(&text[last_match..]);
+ return Cow::Owned(new);
+ }
+
+ // The slower path, which we use if the replacement needs access to
+ // capture groups.
+ let mut it = self.captures_iter(text).enumerate().peekable();
+ if it.peek().is_none() {
+ return Cow::Borrowed(text);
+ }
+ let mut new = Vec::with_capacity(text.len());
+ let mut last_match = 0;
+ for (i, cap) in it {
+ // unwrap on 0 is OK because captures only reports matches
+ let m = cap.get(0).unwrap();
+ new.extend_from_slice(&text[last_match..m.start()]);
+ rep.replace_append(&cap, &mut new);
+ last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
+ }
+ new.extend_from_slice(&text[last_match..]);
+ Cow::Owned(new)
+ }
+}
+
+/// Advanced or "lower level" search methods.
+impl Regex {
+ /// Returns the end location of a match in the text given.
+ ///
+ /// This method may have the same performance characteristics as
+ /// `is_match`, except it provides an end location for a match. In
+ /// particular, the location returned *may be shorter* than the proper end
+ /// of the leftmost-first match.
+ ///
+ /// # Example
+ ///
+ /// Typically, `a+` would match the entire first sequence of `a` in some
+ /// text, but `shortest_match` can give up as soon as it sees the first
+ /// `a`.
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"aaaaa";
+ /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
+ /// assert_eq!(pos, Some(1));
+ /// # }
+ /// ```
+ pub fn shortest_match(&self, text: &[u8]) -> Option<usize> {
+ self.shortest_match_at(text, 0)
+ }
+
+ /// Returns the same as `shortest_match`, but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn shortest_match_at(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> Option<usize> {
+ self.0.searcher().shortest_match_at(text, start)
+ }
+
+ /// Returns the same as `is_match`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
+ self.0.searcher().is_match_at(text, start)
+ }
+
+ /// Returns the same as `find`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn find_at<'t>(
+ &self,
+ text: &'t [u8],
+ start: usize,
+ ) -> Option<Match<'t>> {
+ self.0
+ .searcher()
+ .find_at(text, start)
+ .map(|(s, e)| Match::new(text, s, e))
+ }
+
+ /// This is like `captures`, but uses
+ /// [`CaptureLocations`](struct.CaptureLocations.html)
+ /// instead of
+ /// [`Captures`](struct.Captures.html) in order to amortize allocations.
+ ///
+ /// To create a `CaptureLocations` value, use the
+ /// `Regex::capture_locations` method.
+ ///
+ /// This returns the overall match if the search was successful, which is
+ /// always equivalent to the `0`th capture group.
+ pub fn captures_read<'t>(
+ &self,
+ locs: &mut CaptureLocations,
+ text: &'t [u8],
+ ) -> Option<Match<'t>> {
+ self.captures_read_at(locs, text, 0)
+ }
+
+ /// Returns the same as `captures_read`, but starts the search at the given
+ /// offset and populates the capture locations given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn captures_read_at<'t>(
+ &self,
+ locs: &mut CaptureLocations,
+ text: &'t [u8],
+ start: usize,
+ ) -> Option<Match<'t>> {
+ self.0
+ .searcher()
+ .captures_read_at(&mut locs.0, text, start)
+ .map(|(s, e)| Match::new(text, s, e))
+ }
+
+ /// An undocumented alias for `captures_read_at`.
+ ///
+ /// The `regex-capi` crate previously used this routine, so to avoid
+ /// breaking that crate, we continue to provide the name as an undocumented
+ /// alias.
+ #[doc(hidden)]
+ pub fn read_captures_at<'t>(
+ &self,
+ locs: &mut CaptureLocations,
+ text: &'t [u8],
+ start: usize,
+ ) -> Option<Match<'t>> {
+ self.captures_read_at(locs, text, start)
+ }
+}
+
+/// Auxiliary methods.
+impl Regex {
+ /// Returns the original string of this regex.
+ pub fn as_str(&self) -> &str {
+ &self.0.regex_strings()[0]
+ }
+
+ /// Returns an iterator over the capture names.
+ pub fn capture_names(&self) -> CaptureNames<'_> {
+ CaptureNames(self.0.capture_names().iter())
+ }
+
+ /// Returns the number of captures.
+ pub fn captures_len(&self) -> usize {
+ self.0.capture_names().len()
+ }
+
+ /// Returns an empty set of capture locations that can be reused in
+ /// multiple calls to `captures_read` or `captures_read_at`.
+ pub fn capture_locations(&self) -> CaptureLocations {
+ CaptureLocations(self.0.searcher().locations())
+ }
+
+ /// An alias for `capture_locations` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate uses this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ pub fn locations(&self) -> CaptureLocations {
+ CaptureLocations(self.0.searcher().locations())
+ }
+}
+
+/// An iterator over all non-overlapping matches for a particular string.
+///
+/// The iterator yields a `Match` value describing the start and end of each
+/// match. The indices are byte offsets. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the matched byte string.
+#[derive(Debug)]
+pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>);
+
+impl<'r, 't> Iterator for Matches<'r, 't> {
+ type Item = Match<'t>;
+
+ fn next(&mut self) -> Option<Match<'t>> {
+ let text = self.0.text();
+ self.0.next().map(|(s, e)| Match::new(text, s, e))
+ }
+}
+
+impl<'r, 't> FusedIterator for Matches<'r, 't> {}
+
+/// An iterator that yields all non-overlapping capture groups matching a
+/// particular regular expression.
+///
+/// The iterator stops when no more matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the matched byte string.
+#[derive(Debug)]
+pub struct CaptureMatches<'r, 't>(
+ re_trait::CaptureMatches<'t, ExecNoSync<'r>>,
+);
+
+impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
+ type Item = Captures<'t>;
+
+ fn next(&mut self) -> Option<Captures<'t>> {
+ self.0.next().map(|locs| Captures {
+ text: self.0.text(),
+ locs,
+ named_groups: self.0.regex().capture_name_idx().clone(),
+ })
+ }
+}
+
+impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
+
+/// Yields all substrings delimited by a regular expression match.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the byte string being split.
+#[derive(Debug)]
+pub struct Split<'r, 't> {
+ finder: Matches<'r, 't>,
+ last: usize,
+}
+
+impl<'r, 't> Iterator for Split<'r, 't> {
+ type Item = &'t [u8];
+
+ fn next(&mut self) -> Option<&'t [u8]> {
+ let text = self.finder.0.text();
+ match self.finder.next() {
+ None => {
+ if self.last > text.len() {
+ None
+ } else {
+ let s = &text[self.last..];
+ self.last = text.len() + 1; // Next call will return None
+ Some(s)
+ }
+ }
+ Some(m) => {
+ let matched = &text[self.last..m.start()];
+ self.last = m.end();
+ Some(matched)
+ }
+ }
+ }
+}
+
+impl<'r, 't> FusedIterator for Split<'r, 't> {}
+
+/// Yields at most `N` substrings delimited by a regular expression match.
+///
+/// The last substring will be whatever remains after splitting.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the byte string being split.
+#[derive(Debug)]
+pub struct SplitN<'r, 't> {
+ splits: Split<'r, 't>,
+ n: usize,
+}
+
+impl<'r, 't> Iterator for SplitN<'r, 't> {
+ type Item = &'t [u8];
+
+ fn next(&mut self) -> Option<&'t [u8]> {
+ if self.n == 0 {
+ return None;
+ }
+
+ self.n -= 1;
+ if self.n > 0 {
+ return self.splits.next();
+ }
+
+ let text = self.splits.finder.0.text();
+ if self.splits.last > text.len() {
+ // We've already returned all substrings.
+ None
+ } else {
+ // self.n == 0, so future calls will return None immediately
+ Some(&text[self.splits.last..])
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ (0, Some(self.n))
+ }
+}
+
+impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
+
+/// An iterator over the names of all possible captures.
+///
+/// `None` indicates an unnamed capture; the first element (capture 0, the
+/// whole matched region) is always unnamed.
+///
+/// `'r` is the lifetime of the compiled regular expression.
+#[derive(Clone, Debug)]
+pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
+
+impl<'r> Iterator for CaptureNames<'r> {
+ type Item = Option<&'r str>;
+
+ fn next(&mut self) -> Option<Option<&'r str>> {
+ self.0
+ .next()
+ .as_ref()
+ .map(|slot| slot.as_ref().map(|name| name.as_ref()))
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.0.size_hint()
+ }
+
+ fn count(self) -> usize {
+ self.0.count()
+ }
+}
+
+impl<'r> ExactSizeIterator for CaptureNames<'r> {}
+
+impl<'r> FusedIterator for CaptureNames<'r> {}
+
+/// CaptureLocations is a low level representation of the raw offsets of each
+/// submatch.
+///
+/// You can think of this as a lower level
+/// [`Captures`](struct.Captures.html), where this type does not support
+/// named capturing groups directly and it does not borrow the text that these
+/// offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs
+/// such as `captures_read`, which permits amortizing the allocation in which
+/// capture match locations are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// `capture_locations` method on the `Regex` being used to execute the search.
+/// The value returned can then be reused in subsequent searches.
+#[derive(Clone, Debug)]
+pub struct CaptureLocations(re_trait::Locations);
+
+/// A type alias for `CaptureLocations` for backwards compatibility.
+///
+/// Previously, we exported `CaptureLocations` as `Locations` in an
+/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
+/// we continue re-exporting the same undocumented API.
+#[doc(hidden)]
+pub type Locations = CaptureLocations;
+
+impl CaptureLocations {
+ /// Returns the start and end positions of the Nth capture group. Returns
+ /// `None` if `i` is not a valid capture group or if the capture group did
+ /// not match anything. The positions returned are *always* byte indices
+ /// with respect to the original string matched.
+ #[inline]
+ pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+ self.0.pos(i)
+ }
+
+ /// Returns the total number of capture groups (even if they didn't match).
+ ///
+ /// This is always at least `1` since every regex has at least `1`
+ /// capturing group that corresponds to the entire match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// An alias for the `get` method for backwards compatibility.
+ ///
+ /// Previously, we exported `get` as `pos` in an undocumented API. To
+ /// prevent breaking that code (e.g., in `regex-capi`), we continue
+ /// re-exporting the same undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+ self.get(i)
+ }
+}
+
+/// Captures represents a group of captured byte strings for a single match.
+///
+/// The 0th capture always corresponds to the entire match. Each subsequent
+/// index corresponds to the next capture group in the regex. If a capture
+/// group is named, then the matched byte string is *also* available via the
+/// `name` method. (Note that the 0th capture is always unnamed and so must be
+/// accessed with the `get` method.)
+///
+/// Positions returned from a capture group are always byte indices.
+///
+/// `'t` is the lifetime of the matched text.
+pub struct Captures<'t> {
+ text: &'t [u8],
+ locs: re_trait::Locations,
+ named_groups: Arc<HashMap<String, usize>>,
+}
+
+impl<'t> Captures<'t> {
+ /// Returns the match associated with the capture group at index `i`. If
+ /// `i` does not correspond to a capture group, or if the capture group
+ /// did not participate in the match, then `None` is returned.
+ ///
+ /// # Examples
+ ///
+ /// Get the text of the match with a default of an empty string if this
+ /// group didn't participate in the match:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
+ /// let caps = re.captures(b"abc123").unwrap();
+ ///
+ /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
+ /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
+ /// assert_eq!(text1, &b"123"[..]);
+ /// assert_eq!(text2, &b""[..]);
+ /// ```
+ pub fn get(&self, i: usize) -> Option<Match<'t>> {
+ self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
+ }
+
+ /// Returns the match for the capture group named `name`. If `name` isn't a
+ /// valid capture group or didn't match anything, then `None` is returned.
+ pub fn name(&self, name: &str) -> Option<Match<'t>> {
+ self.named_groups.get(name).and_then(|&i| self.get(i))
+ }
+
+ /// An iterator that yields all capturing matches in the order in which
+ /// they appear in the regex. If a particular capture group didn't
+ /// participate in the match, then `None` is yielded for that capture.
+ ///
+ /// The first match always corresponds to the overall match of the regex.
+ pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
+ SubCaptureMatches { caps: self, it: self.locs.iter() }
+ }
+
+ /// Expands all instances of `$name` in `replacement` to the corresponding
+ /// capture group `name`, and writes them to the `dst` buffer given.
+ ///
+ /// `name` may be an integer corresponding to the index of the capture
+ /// group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// If `name` isn't a valid capture group (whether the name doesn't exist
+ /// or isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+ /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+ /// capture group at index `1`. To exert more precise control over the
+ /// name, or to refer to a capture group name that uses characters outside
+ /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+ /// using braces, any sequence of valid UTF-8 bytes is permitted. If the
+ /// sequence does not refer to a capture group name in the corresponding
+ /// regex, then it is replaced with an empty string.
+ ///
+ /// To write a literal `$` use `$$`.
+ pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
+ expand_bytes(self, replacement, dst)
+ }
+
+ /// Returns the total number of capture groups (even if they didn't match).
+ ///
+ /// This is always at least `1`, since every regex has at least one capture
+ /// group that corresponds to the full match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.locs.len()
+ }
+}
+
+impl<'t> fmt::Debug for Captures<'t> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
+ }
+}
+
+struct CapturesDebug<'c, 't>(&'c Captures<'t>);
+
+impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fn escape_bytes(bytes: &[u8]) -> String {
+ let mut s = String::new();
+ for &b in bytes {
+ s.push_str(&escape_byte(b));
+ }
+ s
+ }
+
+ fn escape_byte(byte: u8) -> String {
+ use std::ascii::escape_default;
+
+ let escaped: Vec<u8> = escape_default(byte).collect();
+ String::from_utf8_lossy(&escaped).into_owned()
+ }
+
+ // We'd like to show something nice here, even if it means an
+ // allocation to build a reverse index.
+ let slot_to_name: HashMap<&usize, &String> =
+ self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
+ let mut map = f.debug_map();
+ for (slot, m) in self.0.locs.iter().enumerate() {
+ let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e]));
+ if let Some(name) = slot_to_name.get(&slot) {
+ map.entry(&name, &m);
+ } else {
+ map.entry(&slot, &m);
+ }
+ }
+ map.finish()
+ }
+}
+
+/// Get a group by index.
+///
+/// `'t` is the lifetime of the matched text.
+///
+/// The text can't outlive the `Captures` object if this method is
+/// used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it); to do that, use `get()` instead.
+///
+/// # Panics
+///
+/// If there is no group at the given index.
+impl<'t> Index<usize> for Captures<'t> {
+ type Output = [u8];
+
+ fn index(&self, i: usize) -> &[u8] {
+ self.get(i)
+ .map(|m| m.as_bytes())
+ .unwrap_or_else(|| panic!("no group at index '{}'", i))
+ }
+}
+
+/// Get a group by name.
+///
+/// `'t` is the lifetime of the matched text and `'i` is the lifetime
+/// of the group name (the index).
+///
+/// The text can't outlive the `Captures` object if this method is
+/// used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it); to do that, use `name` instead.
+///
+/// # Panics
+///
+/// If there is no group named by the given value.
+impl<'t, 'i> Index<&'i str> for Captures<'t> {
+ type Output = [u8];
+
+ fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
+ self.name(name)
+ .map(|m| m.as_bytes())
+ .unwrap_or_else(|| panic!("no group named '{}'", name))
+ }
+}
+
+/// An iterator that yields all capturing matches in the order in which they
+/// appear in the regex.
+///
+/// If a particular capture group didn't participate in the match, then `None`
+/// is yielded for that capture. The first match always corresponds to the
+/// overall match of the regex.
+///
+/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
+/// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone, Debug)]
+pub struct SubCaptureMatches<'c, 't> {
+ caps: &'c Captures<'t>,
+ it: SubCapturesPosIter<'c>,
+}
+
+impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
+ type Item = Option<Match<'t>>;
+
+ fn next(&mut self) -> Option<Option<Match<'t>>> {
+ self.it
+ .next()
+ .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
+ }
+}
+
+impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
+
+/// Replacer describes types that can be used to replace matches in a byte
+/// string.
+///
+/// In general, users of this crate shouldn't need to implement this trait,
+/// since implementations are already provided for `&[u8]` along with other
+/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any
+/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases.
+pub trait Replacer {
+ /// Appends text to `dst` to replace the current match.
+ ///
+ /// The current match is represented by `caps`, which is guaranteed to
+ /// have a match at capture group `0`.
+ ///
+ /// For example, a no-op replacement would be
+ /// `dst.extend(&caps[0])`.
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);
+
+ /// Return a fixed unchanging replacement byte string.
+ ///
+ /// When doing replacements, if access to `Captures` is not needed (e.g.,
+ /// the replacement byte string does not need `$` expansion), then it can
+ /// be beneficial to avoid finding sub-captures.
+ ///
+ /// In general, this is called once for every call to `replacen`.
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
+ None
+ }
+
+ /// Return a `Replacer` that borrows and wraps this `Replacer`.
+ ///
+ /// This is useful when you want to take a generic `Replacer` (which might
+ /// not be cloneable) and use it without consuming it, so it can be used
+ /// more than once.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::{Regex, Replacer};
+ ///
+ /// fn replace_all_twice<R: Replacer>(
+ /// re: Regex,
+ /// src: &[u8],
+ /// mut rep: R,
+ /// ) -> Vec<u8> {
+ /// let dst = re.replace_all(src, rep.by_ref());
+ /// let dst = re.replace_all(&dst, rep.by_ref());
+ /// dst.into_owned()
+ /// }
+ /// ```
+ fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
+ ReplacerRef(self)
+ }
+}
+
+/// By-reference adaptor for a `Replacer`.
+///
+/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
+#[derive(Debug)]
+pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
+
+impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ self.0.replace_append(caps, dst)
+ }
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
+ self.0.no_expansion()
+ }
+}
+
+impl<'a> Replacer for &'a [u8] {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Vec<u8> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl Replacer for Vec<u8> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for Cow<'a, [u8]> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(self.as_ref(), dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Cow<'a, [u8]> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(self.as_ref(), dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> {
+ let s = t.as_ref();
+ match find_byte(b'$', s) {
+ Some(_) => None,
+ None => Some(Cow::Borrowed(s)),
+ }
+}
+
+impl<F, T> Replacer for F
+where
+ F: FnMut(&Captures<'_>) -> T,
+ T: AsRef<[u8]>,
+{
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ dst.extend_from_slice((*self)(caps).as_ref());
+ }
+}
+
+/// `NoExpand` indicates literal byte string replacement.
+///
+/// It can be used with `replace` and `replace_all` to do a literal byte string
+/// replacement without expanding `$name` references to their corresponding
+/// capture groups. This can be both convenient (to avoid escaping `$`, for
+/// example) and performant (since capture groups don't need to be found).
+///
+/// `'t` is the lifetime of the literal text.
+#[derive(Clone, Debug)]
+pub struct NoExpand<'t>(pub &'t [u8]);
+
+impl<'t> Replacer for NoExpand<'t> {
+ fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
+ dst.extend_from_slice(self.0);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ Some(Cow::Borrowed(self.0))
+ }
+}
diff --git a/third_party/rust/regex/src/re_set.rs b/third_party/rust/regex/src/re_set.rs
new file mode 100644
index 0000000000..a6d886d761
--- /dev/null
+++ b/third_party/rust/regex/src/re_set.rs
@@ -0,0 +1,507 @@
+macro_rules! define_set {
+ ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
+ $(#[$doc_regexset_example:meta])* ) => {
+ pub mod $name {
+ use std::fmt;
+ use std::iter;
+ use std::slice;
+ use std::vec;
+
+ use crate::error::Error;
+ use crate::exec::Exec;
+ use crate::re_builder::$builder_mod::RegexSetBuilder;
+ use crate::re_trait::RegularExpression;
+
+/// Match multiple (possibly overlapping) regular expressions in a single scan.
+///
+/// A regex set corresponds to the union of two or more regular expressions.
+/// That is, a regex set will match text where at least one of its
+/// constituent regular expressions matches. A regex set as it is formulated here
+/// provides a touch more power: it will also report *which* regular
+/// expressions in the set match. Indeed, this is the key difference between
+/// regex sets and a single `Regex` with many alternates, since only one
+/// alternate can match at a time.
+///
+/// For example, consider regular expressions to match email addresses and
+/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
+/// regex set is constructed from those regexes, then searching the text
+/// `foo@example.com` will report both regexes as matching. Of course, one
+/// could accomplish this by compiling each regex on its own and doing two
+/// searches over the text. The key advantage of using a regex set is that it
+/// will report the matching regexes using a *single pass through the text*.
+/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
+/// router for a complex web application or a user agent matcher), then a regex
+/// set can realize huge performance gains.
+///
+/// # Example
+///
+/// This shows how the above two regexes (for matching email addresses and
+/// domains) might work:
+///
+$(#[$doc_regexset_example])*
+///
+/// Note that it would be possible to adapt the above example to using `Regex`
+/// with an expression like:
+///
+/// ```text
+/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
+/// ```
+///
+/// After a match, one could then inspect the capture groups to figure out
+/// which alternates matched. The problem is that it is hard to make this
+/// approach scale when there are many regexes since the overlap between each
+/// alternate isn't always obvious to reason about.
+///
+/// # Limitations
+///
+/// Regex sets are limited to answering the following two questions:
+///
+/// 1. Does any regex in the set match?
+/// 2. If so, which regexes in the set match?
+///
+/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
+/// instead of (2) since the matching engines can stop after the first match
+/// is found.
+///
+/// You cannot directly extract [`Match`][crate::Match] or
+/// [`Captures`][crate::Captures] objects from a regex set. If you need these
+/// operations, the recommended approach is to compile each pattern in the set
+/// independently and scan the exact same input a second time with those
+/// independently compiled patterns:
+///
+/// ```rust
+/// use regex::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let text = "barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(&patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set.patterns().iter()
+/// .map(|pat| Regex::new(pat).unwrap())
+/// .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&str> = set.matches(text).into_iter()
+/// // Dereference the match index to get the corresponding
+/// // compiled pattern.
+/// .map(|match_idx| &regexes[match_idx])
+/// // To get match locations or any other info, we then have to search
+/// // the exact same text again, using our separately-compiled pattern.
+/// .map(|pat| pat.find(text).unwrap().as_str())
+/// .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the input.
+/// assert_eq!(vec!["foo", "bar"], matches);
+/// ```
+///
+/// # Performance
+///
+/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
+/// search takes `O(mn)` time, where `m` is proportional to the size of the
+/// regex set and `n` is proportional to the length of the search text.
+#[derive(Clone)]
+pub struct RegexSet(Exec); // newtype over `Exec`; searches and pattern queries go through `self.0`
+
+impl RegexSet {
+ /// Compiles the given patterns into a regex set.
+ ///
+ /// `exprs` is any iterable whose items can be viewed as `&str`. The
+ /// patterns are handed to a `RegexSetBuilder` left at its defaults and
+ /// the outcome of its `build` step is returned as-is, so an error is
+ /// returned if any pattern is not a valid regular expression.
+ ///
+ /// # Example
+ ///
+ /// Create a new regex set from an iterator of strings:
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match("foo"));
+ /// ```
+ pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
+ where
+     I: IntoIterator<Item = S>,
+     S: AsRef<str>,
+ {
+     let builder = RegexSetBuilder::new(exprs);
+     builder.build()
+ }
+
+ /// Builds a regex set that contains no patterns at all.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::empty();
+ /// assert!(set.is_empty());
+ /// ```
+ pub fn empty() -> RegexSet {
+     // A zero-length array of patterns.
+     let no_patterns: [&str; 0] = [];
+     RegexSetBuilder::new(&no_patterns).build().unwrap()
+ }
+
+ /// Reports whether at least one regex in this set matches `text`.
+ ///
+ /// Prefer this method when you only need to know whether any regex in
+ /// the set matches and don't care about *which* ones did: the underlying
+ /// matching engine quits right after seeing the first match instead of
+ /// continuing to find all of them.
+ ///
+ /// As with searches using `Regex`, the expression is unanchored by
+ /// default: a regex that does not start with `^` or `\A`, or end with
+ /// `$` or `\z`, is permitted to match anywhere in the text.
+ ///
+ /// # Example
+ ///
+ /// Tests whether a set matches some text:
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match("foo"));
+ /// assert!(!set.is_match("☃"));
+ /// ```
+ pub fn is_match(&self, text: $text_ty) -> bool {
+     // Same search as `is_match_at` with a starting offset of 0.
+     self.0.searcher().is_match_at($as_bytes(text), 0)
+ }
+
+ /// Like `is_match`, except that the search begins at the given offset.
+ ///
+ /// The starting point matters because the surrounding context is still
+ /// taken into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ #[doc(hidden)]
+ pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
+     let haystack = $as_bytes(text);
+     self.0.searcher().is_match_at(haystack, start)
+ }
+
+ /// Runs every regex in the set against `text` and returns which of them
+ /// matched.
+ ///
+ /// The returned set holds the index of each matching regex, where the
+ /// index corresponds to the order in which the regexes were given to
+ /// `RegexSet`'s constructor. It can also be iterated to visit the
+ /// matched indices.
+ ///
+ /// As with searches using `Regex`, the expression is unanchored by
+ /// default: a regex that does not start with `^` or `\A`, or end with
+ /// `$` or `\z`, is permitted to match anywhere in the text.
+ ///
+ /// # Example
+ ///
+ /// Tests which regular expressions match the given text:
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[
+ ///     r"\w+",
+ ///     r"\d+",
+ ///     r"\pL+",
+ ///     r"foo",
+ ///     r"bar",
+ ///     r"barfoo",
+ ///     r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+ /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+ ///
+ /// // You can also test whether a particular regex matched:
+ /// let matches = set.matches("foobar");
+ /// assert!(!matches.matched(5));
+ /// assert!(matches.matched(6));
+ /// ```
+ pub fn matches(&self, text: $text_ty) -> SetMatches {
+     // One flag per regex in the set, all initially unset.
+     let mut matches = vec![false; self.len()];
+     let matched_any = self.read_matches_at(&mut matches, text, 0);
+     SetMatches { matched_any, matches }
+ }
+
+ /// Like `matches`, except that the search begins at the given offset and
+ /// the per-regex results are written into the caller's slice.
+ ///
+ /// The starting point matters because the surrounding context is still
+ /// taken into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// `matches` must have a length that is at least the number of regexes
+ /// in this set.
+ ///
+ /// Returns true if and only if at least one member of `matches` is true
+ /// after executing the set against `text`.
+ #[doc(hidden)]
+ pub fn read_matches_at(
+     &self,
+     matches: &mut [bool],
+     text: $text_ty,
+     start: usize,
+ ) -> bool {
+     let haystack = $as_bytes(text);
+     self.0.searcher().many_matches_at(matches, haystack, start)
+ }
+
+ /// Returns how many regular expressions this set contains.
+ pub fn len(&self) -> usize {
+     let patterns = self.0.regex_strings();
+     patterns.len()
+ }
+
+ /// Returns `true` when the set holds no regular expressions.
+ pub fn is_empty(&self) -> bool {
+     let patterns = self.0.regex_strings();
+     patterns.is_empty()
+ }
+
+ /// Returns the patterns that this set will match on.
+ ///
+ /// This function can be used to determine the pattern for a match. The
+ /// slice returned has exactly as many patterns as were given to this regex set,
+ /// and the order of the slice is the same as the order of the patterns
+ /// provided to the set.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set
+ /// .matches("foobar")
+ /// .into_iter()
+ /// .map(|match_idx| &set.patterns()[match_idx])
+ /// .collect();
+ /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
+ /// ```
+ pub fn patterns(&self) -> &[String] {
+ self.0.regex_strings()
+ }
+}
+
+/// The result of running a regex set against a haystack: one match flag per
+/// regex in the set.
+#[derive(Clone, Debug)]
+pub struct SetMatches {
+    matched_any: bool,
+    matches: Vec<bool>,
+}
+
+impl SetMatches {
+    /// Whether this set contains any matches.
+    pub fn matched_any(&self) -> bool {
+        self.matched_any
+    }
+
+    /// Whether the regex at the given index matched.
+    ///
+    /// A regex's index is its insertion position, counted from `0`, when
+    /// the `RegexSet` was initially constructed.
+    ///
+    /// # Panics
+    ///
+    /// If `regex_index` is greater than or equal to `self.len()`.
+    pub fn matched(&self, regex_index: usize) -> bool {
+        let flags: &[bool] = &self.matches;
+        flags[regex_index]
+    }
+
+    /// The total number of regexes in the set that created these matches.
+    pub fn len(&self) -> usize {
+        self.matches.len()
+    }
+
+    /// Returns an iterator over the indexes of the regexes that matched.
+    ///
+    /// Indexes are always produced in ascending order, where an index is
+    /// the position of the matching regex when the set was initially built.
+    pub fn iter(&self) -> SetMatchesIter<'_> {
+        SetMatchesIter(self.matches.iter().enumerate())
+    }
+}
+
+impl IntoIterator for SetMatches {
+    type Item = usize;
+    type IntoIter = SetMatchesIntoIter;
+
+    /// Consumes the match set and hands its flags to an owned iterator over
+    /// the matching indexes.
+    fn into_iter(self) -> Self::IntoIter {
+        let SetMatches { matches, .. } = self;
+        SetMatchesIntoIter(matches.into_iter().enumerate())
+    }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+    type Item = usize;
+    type IntoIter = SetMatchesIter<'a>;
+
+    /// Borrowing counterpart of the owned conversion; identical to calling
+    /// `SetMatches::iter`.
+    fn into_iter(self) -> Self::IntoIter {
+        SetMatches::iter(self)
+    }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+///
+/// Forward iteration always produces matches in ascending order of index,
+/// where the index corresponds to the position of the matching regex when
+/// the set was initially built.
+#[derive(Debug)]
+pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
+
+impl Iterator for SetMatchesIntoIter {
+    type Item = usize;
+
+    /// Returns the index of the next regex whose flag is set, skipping the
+    /// ones that did not match.
+    fn next(&mut self) -> Option<usize> {
+        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
+    }
+
+    /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
+    /// bound.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // Unset flags are skipped, so the number of remaining flags is only
+        // an upper bound on what will be yielded; the one lower bound that
+        // always holds is zero.
+        let (_, upper) = self.0.size_hint();
+        (0, upper)
+    }
+}
+
+impl DoubleEndedIterator for SetMatchesIntoIter {
+    /// Returns the index of the last remaining regex whose flag is set.
+    fn next_back(&mut self) -> Option<usize> {
+        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
+    }
+}
+
+impl iter::FusedIterator for SetMatchesIntoIter {}
+
+/// A borrowed iterator over the set of matches from a regex set.
+///
+/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
+///
+/// Forward iteration always produces matches in ascending order of index,
+/// where the index corresponds to the position of the matching regex when
+/// the set was initially built.
+#[derive(Clone, Debug)]
+pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
+
+impl<'a> Iterator for SetMatchesIter<'a> {
+    type Item = usize;
+
+    /// Returns the index of the next regex whose flag is set, skipping the
+    /// ones that did not match.
+    fn next(&mut self) -> Option<usize> {
+        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
+    }
+
+    /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
+    /// bound.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // Unset flags are skipped, so the number of remaining flags is only
+        // an upper bound on what will be yielded; the one lower bound that
+        // always holds is zero.
+        let (_, upper) = self.0.size_hint();
+        (0, upper)
+    }
+}
+
+impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
+    /// Returns the index of the last remaining regex whose flag is set.
+    fn next_back(&mut self) -> Option<usize> {
+        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
+    }
+}
+
+impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
+
+#[doc(hidden)]
+impl From<Exec> for RegexSet {
+    /// Wraps an already compiled `Exec` without any further processing.
+    fn from(exec: Exec) -> Self {
+        Self(exec)
+    }
+}
+
+impl fmt::Debug for RegexSet {
+    /// Writes `RegexSet(...)`, where the parentheses hold the debug form of
+    /// the set's pattern strings.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let patterns = self.0.regex_strings();
+        write!(f, "RegexSet({:?})", patterns)
+    }
+}
+
+// Both adapters are emitted into every module generated by `define_set!`,
+// but each invocation names only one of them, hence the `dead_code` allowances.
+
+/// Views a `&str` haystack as its underlying bytes.
+#[allow(dead_code)]
+fn as_bytes_str(text: &str) -> &[u8] {
+    str::as_bytes(text)
+}
+
+/// Identity adapter for haystacks that already are byte slices.
+#[allow(dead_code)]
+fn as_bytes_bytes(text: &[u8]) -> &[u8] {
+    text
+}
+ }
+ }
+}
+
+define_set! {
+ unicode, // name of the generated module (`pub mod unicode`)
+ set_unicode, // builder module: `crate::re_builder::set_unicode::RegexSetBuilder`
+ &str, // haystack type accepted by the search methods
+ as_bytes_str, // adapter that views the haystack as `&[u8]`
+/// ```rust
+/// # use regex::RegexSet;
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match("foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with text that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with text that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches("example").into_iter().collect();
+/// assert!(matches.is_empty());
+/// ```
+}
+
+define_set! {
+ bytes, // name of the generated module (`pub mod bytes`)
+ set_bytes, // builder module: `crate::re_builder::set_bytes::RegexSetBuilder`
+ &[u8], // haystack type accepted by the search methods
+ as_bytes_bytes, // identity adapter: the haystack already is `&[u8]`
+/// ```rust
+/// # use regex::bytes::RegexSet;
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match(b"foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with text that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with text that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
+/// assert!(matches.is_empty());
+/// ```
+}
diff --git a/third_party/rust/regex/src/re_trait.rs b/third_party/rust/regex/src/re_trait.rs
new file mode 100644
index 0000000000..d0c717df5a
--- /dev/null
+++ b/third_party/rust/regex/src/re_trait.rs
@@ -0,0 +1,294 @@
+use std::fmt;
+use std::iter::FusedIterator;
+
+/// Slot is a single saved capture location. Note that there are two slots for
+/// every capture in a regular expression (one slot each for the start and end
+/// of the capture).
+pub type Slot = Option<usize>;
+
+/// Locations represents the offsets of each capturing group in a regex for
+/// a single match.
+///
+/// Unlike `Captures`, a `Locations` value only stores offsets.
+#[doc(hidden)]
+#[derive(Clone, Debug)]
+pub struct Locations(Vec<Slot>);
+
+impl Locations {
+    /// Returns the start and end positions of the Nth capture group. Returns
+    /// `None` if `i` is not a valid capture group or if the capture group did
+    /// not match anything. The positions returned are *always* byte indices
+    /// with respect to the original string matched.
+    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+        // Group `i` occupies slots `2 * i` and `2 * i + 1`. The arithmetic is
+        // checked so that an out-of-range `i` is reported as `None` instead
+        // of panicking on overflow or wrapping around onto the slots of a
+        // different group.
+        let start_slot = i.checked_mul(2)?;
+        let end_slot = start_slot.checked_add(1)?;
+        match (self.0.get(start_slot), self.0.get(end_slot)) {
+            (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
+            _ => None,
+        }
+    }
+
+    /// Creates an iterator of all the capture group positions in order of
+    /// appearance in the regular expression. Positions are byte indices
+    /// in terms of the original string matched.
+    pub fn iter(&self) -> SubCapturesPosIter<'_> {
+        SubCapturesPosIter { idx: 0, locs: self }
+    }
+
+    /// Returns the total number of capturing groups, i.e. half the number
+    /// of stored slots.
+    ///
+    /// This is always at least `1` since every regex has at least `1`
+    /// capturing group that corresponds to the entire match.
+    pub fn len(&self) -> usize {
+        self.0.len() / 2
+    }
+
+    /// Return the individual slots as a mutable slice.
+    pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
+        &mut self.0
+    }
+}
+
+/// An iterator over the positions of the capture groups of one particular
+/// match of a regular expression.
+///
+/// Positions are byte indices in terms of the original string matched.
+///
+/// `'c` is the lifetime of the captures.
+#[derive(Clone, Debug)]
+pub struct SubCapturesPosIter<'c> {
+    idx: usize,
+    locs: &'c Locations,
+}
+
+impl<'c> Iterator for SubCapturesPosIter<'c> {
+    type Item = Option<(usize, usize)>;
+
+    /// Yields the position of the next group, or an inner `None` for a
+    /// group without a position; the outer `None` marks the end.
+    fn next(&mut self) -> Option<Option<(usize, usize)>> {
+        if self.idx < self.locs.len() {
+            let span = self.locs.pos(self.idx);
+            self.idx += 1;
+            Some(span)
+        } else {
+            None
+        }
+    }
+
+    /// Exact: one item remains for every group not yet visited.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = self.locs.len() - self.idx;
+        (remaining, Some(remaining))
+    }
+
+    /// Answers from the exact length instead of stepping through the items.
+    fn count(self) -> usize {
+        ExactSizeIterator::len(&self)
+    }
+}
+
+impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
+
+impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
+
+/// `RegularExpression` describes types that can implement regex searching.
+///
+/// This trait is my attempt at reducing code duplication and to standardize
+/// the internal API. Specific duplication that is avoided are the `find`
+/// and `capture` iterators, which are slightly tricky.
+///
+/// It's not clear whether this trait is worth it, and it also isn't
+/// clear whether it's useful as a public trait or not. Methods like
+/// `next_after_empty` reek of bad design, but the rest of the methods seem
+/// somewhat reasonable. One particular thing this trait would expose would be
+/// the ability to start the search of a regex anywhere in a haystack, which
+/// isn't possible in the current public API.
+pub trait RegularExpression: Sized + fmt::Debug {
+ /// The type of the haystack.
+ type Text: ?Sized + fmt::Debug;
+
+ /// The number of capture slots in the compiled regular expression. This is
+ /// always two times the number of capture groups (two slots per group).
+ fn slots_len(&self) -> usize;
+
+ /// Allocates fresh space for all capturing groups in this regex.
+ fn locations(&self) -> Locations {
+ Locations(vec![None; self.slots_len()]) // every slot starts out unset
+ }
+
+ /// Returns the position of the next character after `i`.
+ ///
+ /// For example, a haystack with type `&[u8]` probably returns `i+1`,
+ /// whereas a haystack with type `&str` probably returns `i` plus the
+ /// length of the next UTF-8 sequence.
+ fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
+
+ /// Returns the location of the shortest match.
+ fn shortest_match_at(
+ &self,
+ text: &Self::Text,
+ start: usize,
+ ) -> Option<usize>;
+
+ /// Returns whether the regex matches the text given.
+ fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
+
+ /// Returns the leftmost-first match location if one exists.
+ fn find_at(
+ &self,
+ text: &Self::Text,
+ start: usize,
+ ) -> Option<(usize, usize)>;
+
+ /// Returns the leftmost-first match location if one exists, and also
+ /// fills in any matching capture slot locations.
+ fn captures_read_at(
+ &self,
+ locs: &mut Locations,
+ text: &Self::Text,
+ start: usize,
+ ) -> Option<(usize, usize)>;
+
+ /// Returns an iterator over all non-overlapping successive leftmost-first
+ /// matches.
+ fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
+ Matches { re: self, text, last_end: 0, last_match: None } // takes ownership of `self`; starts at offset 0 with no previous match
+ }
+
+ /// Returns an iterator over all non-overlapping successive leftmost-first
+ /// matches with captures.
+ fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
+ CaptureMatches(self.find_iter(text)) // reuses the search state kept by `Matches`
+ }
+}
+
+/// An iterator over all non-overlapping successive leftmost-first matches.
+#[derive(Debug)]
+pub struct Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't,
+{
+ re: R,
+ text: &'t R::Text,
+ last_end: usize, // offset at which the next search starts
+ last_match: Option<usize>, // end offset of the most recently yielded match
+}
+
+impl<'t, R> Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't,
+{
+ /// Return the text being searched.
+ pub fn text(&self) -> &'t R::Text {
+ self.text
+ }
+
+ /// Return the underlying regex.
+ pub fn regex(&self) -> &R {
+ &self.re
+ }
+}
+
+impl<'t, R> Iterator for Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't + AsRef<[u8]>,
+{
+ type Item = (usize, usize);
+
+ fn next(&mut self) -> Option<(usize, usize)> {
+ if self.last_end > self.text.as_ref().len() { // search position is past the end of the haystack
+ return None;
+ }
+ let (s, e) = match self.re.find_at(self.text, self.last_end) {
+ None => return None,
+ Some((s, e)) => (s, e),
+ };
+ if s == e {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = self.re.next_after_empty(self.text, e);
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(e) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = e; // non-empty match: resume right after it
+ }
+ self.last_match = Some(e);
+ Some((s, e))
+ }
+}
+
+impl<'t, R> FusedIterator for Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't + AsRef<[u8]>,
+{
+}
+
+/// An iterator over all non-overlapping successive leftmost-first matches with
+/// captures.
+#[derive(Debug)]
+pub struct CaptureMatches<'t, R>(Matches<'t, R>)
+where
+ R: RegularExpression,
+ R::Text: 't;
+
+impl<'t, R> CaptureMatches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't,
+{
+ /// Return the text being searched.
+ pub fn text(&self) -> &'t R::Text {
+ self.0.text()
+ }
+
+ /// Return the underlying regex.
+ pub fn regex(&self) -> &R {
+ self.0.regex()
+ }
+}
+
+impl<'t, R> Iterator for CaptureMatches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't + AsRef<[u8]>,
+{
+ type Item = Locations;
+
+ fn next(&mut self) -> Option<Locations> {
+ if self.0.last_end > self.0.text.as_ref().len() { // search position is past the end of the haystack
+ return None;
+ }
+ let mut locs = self.0.re.locations(); // fresh slot storage for this match
+ let (s, e) = match self.0.re.captures_read_at(
+ &mut locs,
+ self.0.text,
+ self.0.last_end,
+ ) {
+ None => return None,
+ Some((s, e)) => (s, e),
+ };
+ if s == e {
+ self.0.last_end = self.0.re.next_after_empty(self.0.text, e); // empty match: step past it so the next search makes progress
+ if Some(e) == self.0.last_match { // skip an empty match that ends where the previous match ended
+ return self.next();
+ }
+ } else {
+ self.0.last_end = e; // non-empty match: resume right after it
+ }
+ self.0.last_match = Some(e);
+ Some(locs)
+ }
+}
+
+impl<'t, R> FusedIterator for CaptureMatches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't + AsRef<[u8]>,
+{
+}
diff --git a/third_party/rust/regex/src/re_unicode.rs b/third_party/rust/regex/src/re_unicode.rs
new file mode 100644
index 0000000000..197510ea0d
--- /dev/null
+++ b/third_party/rust/regex/src/re_unicode.rs
@@ -0,0 +1,1311 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::FusedIterator;
+use std::ops::{Index, Range};
+use std::str::FromStr;
+use std::sync::Arc;
+
+use crate::find_byte::find_byte;
+
+use crate::error::Error;
+use crate::exec::{Exec, ExecNoSyncStr};
+use crate::expand::expand_str;
+use crate::re_builder::unicode::RegexBuilder;
+use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
+
+/// Escapes all regular expression meta characters in `text`.
+///
+/// The string returned may be safely used as a literal in a regular
+/// expression.
+pub fn escape(text: &str) -> String {
+ regex_syntax::escape(text)
+}
+
+/// Match represents a single match of a regex in a haystack.
+///
+/// The lifetime parameter `'t` refers to the lifetime of the matched text.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub struct Match<'t> {
+ text: &'t str,
+ start: usize,
+ end: usize,
+}
+
+impl<'t> Match<'t> {
+ /// Returns the starting byte offset of the match in the haystack.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// Returns the ending byte offset of the match in the haystack.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns the range over the starting and ending byte offsets of the
+ /// match in the haystack.
+ #[inline]
+ pub fn range(&self) -> Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns the matched text.
+ #[inline]
+ pub fn as_str(&self) -> &'t str {
+ &self.text[self.range()]
+ }
+
+ /// Creates a new match from the given haystack and byte offsets.
+ #[inline]
+ fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
+ Match { text: haystack, start, end }
+ }
+}
+
+impl<'t> From<Match<'t>> for &'t str {
+ fn from(m: Match<'t>) -> &'t str {
+ m.as_str()
+ }
+}
+
+impl<'t> From<Match<'t>> for Range<usize> {
+ fn from(m: Match<'t>) -> Range<usize> {
+ m.range()
+ }
+}
+
+/// A compiled regular expression for matching Unicode strings.
+///
+/// It is represented as either a sequence of bytecode instructions (dynamic)
+/// or as a specialized Rust function (native). It can be used to search, split
+/// or replace text. All searching is done with an implicit `.*?` at the
+/// beginning and end of an expression. To force an expression to match the
+/// whole string (or a prefix or a suffix), you must use an anchor like `^` or
+/// `$` (or `\A` and `\z`).
+///
+/// While this crate will handle Unicode strings (whether in the regular
+/// expression or in the search text), all positions returned are **byte
+/// indices**. Every byte index is guaranteed to be at a Unicode code point
+/// boundary.
+///
+/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
+/// compiled regular expression and text to search, respectively.
+///
+/// The only methods that allocate new strings are the string replacement
+/// methods. All other methods (searching and splitting) return borrowed
+/// pointers into the string given.
+///
+/// # Examples
+///
+/// Find the location of a US phone number:
+///
+/// ```rust
+/// # use regex::Regex;
+/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
+/// let mat = re.find("phone: 111-222-3333").unwrap();
+/// assert_eq!((mat.start(), mat.end()), (7, 19));
+/// ```
+///
+/// # Using the `std::str::pattern` methods with `Regex`
+///
+/// > **Note**: This section requires that this crate is compiled with the
+/// > `pattern` Cargo feature enabled, which **requires nightly Rust**.
+///
+/// Since `Regex` implements `Pattern`, you can use regexes with methods
+/// defined on `&str`. For example, `is_match`, `find`, `find_iter`
+/// and `split` can be replaced with `str::contains`, `str::find`,
+/// `str::match_indices` and `str::split`.
+///
+/// Here are some examples:
+///
+/// ```rust,ignore
+/// # use regex::Regex;
+/// let re = Regex::new(r"\d+").unwrap();
+/// let haystack = "a111b222c";
+///
+/// assert!(haystack.contains(&re));
+/// assert_eq!(haystack.find(&re), Some(1));
+/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
+/// vec![(1, "111"), (5, "222")]);
+/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
+/// ```
+#[derive(Clone)]
+pub struct Regex(Exec);
+
+impl fmt::Display for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{}", self.as_str())
+ }
+}
+
+impl fmt::Debug for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt::Display::fmt(self, f)
+ }
+}
+
+#[doc(hidden)]
+impl From<Exec> for Regex {
+ fn from(exec: Exec) -> Regex {
+ Regex(exec)
+ }
+}
+
+impl FromStr for Regex {
+ type Err = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn from_str(s: &str) -> Result<Regex, Error> {
+ Regex::new(s)
+ }
+}
+
+/// Core regular expression methods.
+impl Regex {
+ /// Compiles a regular expression. Once compiled, it can be used repeatedly
+ /// to search, split or replace text in a string.
+ ///
+ /// If an invalid expression is given, then an error is returned.
+ pub fn new(re: &str) -> Result<Regex, Error> {
+ RegexBuilder::new(re).build()
+ }
+
+ /// Returns true if and only if there is a match for the regex in the
+ /// string given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// a match, since the underlying matching engine may be able to do less
+ /// work.
+ ///
+ /// # Example
+ ///
+ /// Test if some text contains at least one word with exactly 13
+ /// Unicode word characters:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "I categorically deny having triskaidekaphobia.";
+ /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
+ /// # }
+ /// ```
+ pub fn is_match(&self, text: &str) -> bool {
+ self.is_match_at(text, 0)
+ }
+
+ /// Returns the start and end byte range of the leftmost-first match in
+ /// `text`. If no match exists, then `None` is returned.
+ ///
+ /// Note that this should only be used if you want to discover the position
+ /// of the match. Testing the existence of a match is faster if you use
+ /// `is_match`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of the first word with exactly 13
+ /// Unicode word characters:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "I categorically deny having triskaidekaphobia.";
+ /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
+ /// assert_eq!(mat.start(), 2);
+ /// assert_eq!(mat.end(), 15);
+ /// # }
+ /// ```
+ pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
+ self.find_at(text, 0)
+ }
+
+ /// Returns an iterator for each successive non-overlapping match in
+ /// `text`, returning the start and end byte indices with respect to
+ /// `text`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of every word with exactly 13 Unicode
+ /// word characters:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "Retroactively relinquishing remunerations is reprehensible.";
+ /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
+ /// println!("{:?}", mat);
+ /// }
+ /// # }
+ /// ```
+ pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
+ Matches(self.0.searcher_str().find_iter(text))
+ }
+
+ /// Returns the capture groups corresponding to the leftmost-first
+ /// match in `text`. Capture group `0` always corresponds to the entire
+ /// match. If no match is found, then `None` is returned.
+ ///
+ /// You should only use `captures` if you need access to the location of
+ /// capturing group matches. Otherwise, `find` is faster for discovering
+ /// the location of the overall match.
+ ///
+ /// # Examples
+ ///
+ /// Say you have some text with movie names and their release years,
+ /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
+ /// looking like that, while also extracting the movie name and its release
+ /// year separately.
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
+ /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
+ /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
+ /// // You can also access the groups by index using the Index notation.
+ /// // Note that this will panic on an invalid index.
+ /// assert_eq!(&caps[1], "Citizen Kane");
+ /// assert_eq!(&caps[2], "1941");
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
+ /// # }
+ /// ```
+ ///
+ /// Note that the full match is at capture group `0`. Each subsequent
+ /// capture group is indexed by the order of its opening `(`.
+ ///
+ /// We can make this example a bit clearer by using *named* capture groups:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane");
+ /// assert_eq!(caps.name("year").unwrap().as_str(), "1941");
+ /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
+ /// // You can also access the groups by name using the Index notation.
+ /// // Note that this will panic on an invalid group name.
+ /// assert_eq!(&caps["title"], "Citizen Kane");
+ /// assert_eq!(&caps["year"], "1941");
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
+ ///
+ /// # }
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
+ let mut locs = self.capture_locations();
+ self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
+ text,
+ locs: locs.0,
+ named_groups: self.0.capture_name_idx().clone(),
+ })
+ }
+
+ /// Returns an iterator over all the non-overlapping capture groups matched
+ /// in `text`. This is operationally the same as `find_iter`, except it
+ /// yields information about capturing group matches.
+ ///
+ /// # Example
+ ///
+ /// We can use this to find all movie titles and their release years in
+ /// some text, where the movie is formatted like "'Title' (xxxx)":
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// for caps in re.captures_iter(text) {
+ /// println!("Movie: {:?}, Released: {:?}",
+ /// &caps["title"], &caps["year"]);
+ /// }
+ /// // Output:
+ /// // Movie: Citizen Kane, Released: 1941
+ /// // Movie: The Wizard of Oz, Released: 1939
+ /// // Movie: M, Released: 1931
+ /// # }
+ /// ```
+ pub fn captures_iter<'r, 't>(
+ &'r self,
+ text: &'t str,
+ ) -> CaptureMatches<'r, 't> {
+ CaptureMatches(self.0.searcher_str().captures_iter(text))
+ }
+
+ /// Returns an iterator of substrings of `text` delimited by a match of the
+ /// regular expression. Namely, each element of the iterator corresponds to
+ /// text that *isn't* matched by the regular expression.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"[ \t]+").unwrap();
+ /// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
+ /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
+ /// # }
+ /// ```
+ pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
+ Split { finder: self.find_iter(text), last: 0 }
+ }
+
+ /// Returns an iterator of at most `limit` substrings of `text` delimited
+ /// by a match of the regular expression. (A `limit` of `0` will return no
+ /// substrings.) Namely, each element of the iterator corresponds to text
+ /// that *isn't* matched by the regular expression. The remainder of the
+ /// string that is not split will be the last element in the iterator.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some text:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
+ /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
+ /// # }
+ /// ```
+ pub fn splitn<'r, 't>(
+ &'r self,
+ text: &'t str,
+ limit: usize,
+ ) -> SplitN<'r, 't> {
+ SplitN { splits: self.split(text), n: limit }
+ }
+
+ /// Replaces the leftmost-first match with the replacement provided.
+ /// The replacement can be a regular string (where `$N` and `$name` are
+ /// expanded to match capture groups) or a function that takes the matches'
+ /// `Captures` and returns the replaced string.
+ ///
+ /// If no match is found, then a copy of the string is returned unchanged.
+ ///
+ /// # Replacement string syntax
+ ///
+ /// All instances of `$name` in the replacement text are replaced with the
+ /// corresponding capture group `name`.
+ ///
+ /// `name` may be an integer corresponding to the index of the
+ /// capture group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// If `name` isn't a valid capture group (whether the name doesn't exist
+ /// or isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name is used. e.g., `$1a` looks up the capture
+ /// group named `1a` and not the capture group at index `1`. To exert more
+ /// precise control over the name, use braces, e.g., `${1}a`.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Examples
+ ///
+ /// Note that this function is polymorphic with respect to the replacement.
+ /// In typical usage, this can just be a normal string:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new("[^01]+").unwrap();
+ /// assert_eq!(re.replace("1078910", ""), "1010");
+ /// # }
+ /// ```
+ ///
+ /// But anything satisfying the `Replacer` trait will work. For example,
+ /// a closure of type `|&Captures| -> String` provides direct access to the
+ /// captures corresponding to a match. This allows one to access
+ /// capturing group matches easily:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # use regex::Captures; fn main() {
+ /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
+ /// format!("{} {}", &caps[2], &caps[1])
+ /// });
+ /// assert_eq!(result, "Bruce Springsteen");
+ /// # }
+ /// ```
+ ///
+ /// But this is a bit cumbersome to use all the time. Instead, a simple
+ /// syntax is supported that expands `$name` into the corresponding capture
+ /// group. Here's the last example, but using this expansion technique
+ /// with named capture groups:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", "$first $last");
+ /// assert_eq!(result, "Bruce Springsteen");
+ /// # }
+ /// ```
+ ///
+ /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
+ /// would produce the same result. To write a literal `$` use `$$`.
+ ///
+ /// Sometimes the replacement string requires use of curly braces to
+ /// delineate a capture group replacement and surrounding literal text.
+ /// For example, if we wanted to join two words together with an
+ /// underscore:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
+ /// let result = re.replace("deep fried", "${first}_$second");
+ /// assert_eq!(result, "deep_fried");
+ /// # }
+ /// ```
+ ///
+ /// Without the curly braces, the capture group name `first_` would be
+ /// used, and since it doesn't exist, it would be replaced with the empty
+ /// string.
+ ///
+ /// Finally, sometimes you just want to replace a literal string with no
+ /// regard for capturing group expansion. This can be done by wrapping a
+ /// string with `NoExpand`:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// use regex::NoExpand;
+ ///
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
+ /// assert_eq!(result, "$2 $last");
+ /// # }
+ /// ```
+ pub fn replace<'t, R: Replacer>(
+ &self,
+ text: &'t str,
+ rep: R,
+ ) -> Cow<'t, str> {
+ self.replacen(text, 1, rep)
+ }
+
+ /// Replaces all non-overlapping matches in `text` with the replacement
+ /// provided. This is the same as calling `replacen` with `limit` set to
+ /// `0`.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement string.
+ pub fn replace_all<'t, R: Replacer>(
+ &self,
+ text: &'t str,
+ rep: R,
+ ) -> Cow<'t, str> {
+ self.replacen(text, 0, rep)
+ }
+
+ /// Replaces at most `limit` non-overlapping matches in `text` with the
+ /// replacement provided. If `limit` is 0, then all non-overlapping matches
+ /// are replaced.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement string.
+ pub fn replacen<'t, R: Replacer>(
+ &self,
+ text: &'t str,
+ limit: usize,
+ mut rep: R,
+ ) -> Cow<'t, str> {
+ // If we know that the replacement doesn't have any capture expansions,
+ // then we can use the fast path. The fast path can make a tremendous
+ // difference:
+ //
+ // 1) We use `find_iter` instead of `captures_iter`. Not asking for
+ // captures generally makes the regex engines faster.
+ // 2) We don't need to look up all of the capture groups and do
+ // replacements inside the replacement string. We just push it
+ // at each match and be done with it.
+ if let Some(rep) = rep.no_expansion() {
+ let mut it = self.find_iter(text).enumerate().peekable();
+ if it.peek().is_none() {
+ return Cow::Borrowed(text);
+ }
+ let mut new = String::with_capacity(text.len());
+ let mut last_match = 0;
+ for (i, m) in it {
+ new.push_str(&text[last_match..m.start()]);
+ new.push_str(&rep);
+ last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
+ }
+ new.push_str(&text[last_match..]);
+ return Cow::Owned(new);
+ }
+
+ // The slower path, which we use if the replacement needs access to
+ // capture groups.
+ let mut it = self.captures_iter(text).enumerate().peekable();
+ if it.peek().is_none() {
+ return Cow::Borrowed(text);
+ }
+ let mut new = String::with_capacity(text.len());
+ let mut last_match = 0;
+ for (i, cap) in it {
+ // unwrap on 0 is OK because captures only reports matches
+ let m = cap.get(0).unwrap();
+ new.push_str(&text[last_match..m.start()]);
+ rep.replace_append(&cap, &mut new);
+ last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
+ }
+ new.push_str(&text[last_match..]);
+ Cow::Owned(new)
+ }
+}
+
+/// Advanced or "lower level" search methods.
+impl Regex {
+ /// Returns the end location of a match in the text given.
+ ///
+ /// This method may have the same performance characteristics as
+ /// `is_match`, except it provides an end location for a match. In
+ /// particular, the location returned *may be shorter* than the proper end
+ /// of the leftmost-first match.
+ ///
+ /// # Example
+ ///
+ /// Typically, `a+` would match the entire first sequence of `a` in some
+ /// text, but `shortest_match` can give up as soon as it sees the first
+ /// `a`.
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "aaaaa";
+ /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
+ /// assert_eq!(pos, Some(1));
+ /// # }
+ /// ```
+ pub fn shortest_match(&self, text: &str) -> Option<usize> {
+ self.shortest_match_at(text, 0)
+ }
+
+ /// Returns the same as shortest_match, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn shortest_match_at(
+ &self,
+ text: &str,
+ start: usize,
+ ) -> Option<usize> {
+ self.0.searcher_str().shortest_match_at(text, start)
+ }
+
+ /// Returns the same as is_match, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn is_match_at(&self, text: &str, start: usize) -> bool {
+ self.0.searcher_str().is_match_at(text, start)
+ }
+
+ /// Returns the same as find, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn find_at<'t>(
+ &self,
+ text: &'t str,
+ start: usize,
+ ) -> Option<Match<'t>> {
+ self.0
+ .searcher_str()
+ .find_at(text, start)
+ .map(|(s, e)| Match::new(text, s, e))
+ }
+
+ /// This is like `captures`, but uses
+ /// [`CaptureLocations`](struct.CaptureLocations.html)
+ /// instead of
+ /// [`Captures`](struct.Captures.html) in order to amortize allocations.
+ ///
+ /// To create a `CaptureLocations` value, use the
+ /// `Regex::capture_locations` method.
+ ///
+ /// This returns the overall match if this was successful, which is always
+ /// equivalent to the `0`th capture group.
+ pub fn captures_read<'t>(
+ &self,
+ locs: &mut CaptureLocations,
+ text: &'t str,
+ ) -> Option<Match<'t>> {
+ self.captures_read_at(locs, text, 0)
+ }
+
+ /// Returns the same as captures, but starts the search at the given
+ /// offset and populates the capture locations given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn captures_read_at<'t>(
+ &self,
+ locs: &mut CaptureLocations,
+ text: &'t str,
+ start: usize,
+ ) -> Option<Match<'t>> {
+ self.0
+ .searcher_str()
+ .captures_read_at(&mut locs.0, text, start)
+ .map(|(s, e)| Match::new(text, s, e))
+ }
+
+ /// An undocumented alias for `captures_read_at`.
+ ///
+ /// The `regex-capi` crate previously used this routine, so to avoid
+ /// breaking that crate, we continue to provide the name as an undocumented
+ /// alias.
+ #[doc(hidden)]
+ pub fn read_captures_at<'t>(
+ &self,
+ locs: &mut CaptureLocations,
+ text: &'t str,
+ start: usize,
+ ) -> Option<Match<'t>> {
+ self.captures_read_at(locs, text, start)
+ }
+}
+
+/// Auxiliary methods.
+impl Regex {
+ /// Returns the original string of this regex.
+ pub fn as_str(&self) -> &str {
+ &self.0.regex_strings()[0]
+ }
+
+ /// Returns an iterator over the capture names.
+ pub fn capture_names(&self) -> CaptureNames<'_> {
+ CaptureNames(self.0.capture_names().iter())
+ }
+
+ /// Returns the number of captures.
+ pub fn captures_len(&self) -> usize {
+ self.0.capture_names().len()
+ }
+
+ /// Returns an empty set of capture locations that can be reused in
+ /// multiple calls to `captures_read` or `captures_read_at`.
+ pub fn capture_locations(&self) -> CaptureLocations {
+ CaptureLocations(self.0.searcher_str().locations())
+ }
+
+ /// An alias for `capture_locations` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate uses this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ pub fn locations(&self) -> CaptureLocations {
+ CaptureLocations(self.0.searcher_str().locations())
+ }
+}
+
+/// An iterator over the names of all possible captures.
+///
+/// `None` indicates an unnamed capture; the first element (capture 0, the
+/// whole matched region) is always unnamed.
+///
+/// `'r` is the lifetime of the compiled regular expression.
+#[derive(Clone, Debug)]
+pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
+
+impl<'r> Iterator for CaptureNames<'r> {
+ type Item = Option<&'r str>;
+
+ fn next(&mut self) -> Option<Option<&'r str>> {
+ self.0
+ .next()
+ .as_ref()
+ .map(|slot| slot.as_ref().map(|name| name.as_ref()))
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.0.size_hint()
+ }
+
+ fn count(self) -> usize {
+ self.0.count()
+ }
+}
+
+impl<'r> ExactSizeIterator for CaptureNames<'r> {}
+
+impl<'r> FusedIterator for CaptureNames<'r> {}
+
+/// Yields all substrings delimited by a regular expression match.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the string being split.
+#[derive(Debug)]
+pub struct Split<'r, 't> {
+ finder: Matches<'r, 't>,
+ last: usize,
+}
+
+impl<'r, 't> Iterator for Split<'r, 't> {
+ type Item = &'t str;
+
+ fn next(&mut self) -> Option<&'t str> {
+ let text = self.finder.0.text();
+ match self.finder.next() {
+ None => {
+ if self.last > text.len() {
+ None
+ } else {
+ let s = &text[self.last..];
+ self.last = text.len() + 1; // Next call will return None
+ Some(s)
+ }
+ }
+ Some(m) => {
+ let matched = &text[self.last..m.start()];
+ self.last = m.end();
+ Some(matched)
+ }
+ }
+ }
+}
+
+impl<'r, 't> FusedIterator for Split<'r, 't> {}
+
+/// Yields at most `N` substrings delimited by a regular expression match.
+///
+/// The last substring will be whatever remains after splitting.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the string being split.
+#[derive(Debug)]
+pub struct SplitN<'r, 't> {
+ splits: Split<'r, 't>,
+ n: usize,
+}
+
+impl<'r, 't> Iterator for SplitN<'r, 't> {
+ type Item = &'t str;
+
+ fn next(&mut self) -> Option<&'t str> {
+ if self.n == 0 {
+ return None;
+ }
+
+ self.n -= 1;
+ if self.n > 0 {
+ return self.splits.next();
+ }
+
+ let text = self.splits.finder.0.text();
+ if self.splits.last > text.len() {
+ // We've already returned all substrings.
+ None
+ } else {
+ // self.n == 0, so future calls will return None immediately
+ Some(&text[self.splits.last..])
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ (0, Some(self.n))
+ }
+}
+
+impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
+
+/// CaptureLocations is a low level representation of the raw offsets of each
+/// submatch.
+///
+/// You can think of this as a lower level
+/// [`Captures`](struct.Captures.html), where this type does not support
+/// named capturing groups directly and it does not borrow the text that these
+/// offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs
+ /// such as `captures_read`, which permits amortizing the allocation in which
+/// capture match locations are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// `capture_locations` method on the `Regex` being used to execute the search.
+/// The value returned can then be reused in subsequent searches.
+#[derive(Clone, Debug)]
+pub struct CaptureLocations(re_trait::Locations);
+
+/// A type alias for `CaptureLocations` for backwards compatibility.
+///
+/// Previously, we exported `CaptureLocations` as `Locations` in an
+/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
+/// we continue re-exporting the same undocumented API.
+#[doc(hidden)]
+pub type Locations = CaptureLocations;
+
+impl CaptureLocations {
+ /// Returns the start and end positions of the Nth capture group. Returns
+ /// `None` if `i` is not a valid capture group or if the capture group did
+ /// not match anything. The positions returned are *always* byte indices
+ /// with respect to the original string matched.
+ #[inline]
+ pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+ self.0.pos(i)
+ }
+
+ /// Returns the total number of capture groups (even if they didn't match).
+ ///
+ /// This is always at least `1` since every regex has at least `1`
+ /// capturing group that corresponds to the entire match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// An alias for the `get` method for backwards compatibility.
+ ///
+ /// Previously, we exported `get` as `pos` in an undocumented API. To
+ /// prevent breaking that code (e.g., in `regex-capi`), we continue
+ /// re-exporting the same undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+ self.get(i)
+ }
+}
+
+/// Captures represents a group of captured strings for a single match.
+///
+/// The 0th capture always corresponds to the entire match. Each subsequent
+/// index corresponds to the next capture group in the regex. If a capture
+/// group is named, then the matched string is *also* available via the `name`
+/// method. (Note that the 0th capture is always unnamed and so must be
+/// accessed with the `get` method.)
+///
+/// Positions returned from a capture group are always byte indices.
+///
+/// `'t` is the lifetime of the matched text.
+pub struct Captures<'t> {
+ text: &'t str,
+ locs: re_trait::Locations,
+ named_groups: Arc<HashMap<String, usize>>,
+}
+
+impl<'t> Captures<'t> {
+ /// Returns the match associated with the capture group at index `i`. If
+ /// `i` does not correspond to a capture group, or if the capture group
+ /// did not participate in the match, then `None` is returned.
+ ///
+ /// # Examples
+ ///
+ /// Get the text of the match with a default of an empty string if this
+ /// group didn't participate in the match:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
+ /// let caps = re.captures("abc123").unwrap();
+ ///
+ /// let text1 = caps.get(1).map_or("", |m| m.as_str());
+ /// let text2 = caps.get(2).map_or("", |m| m.as_str());
+ /// assert_eq!(text1, "123");
+ /// assert_eq!(text2, "");
+ /// ```
+ pub fn get(&self, i: usize) -> Option<Match<'t>> {
+ self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
+ }
+
+ /// Returns the match for the capture group named `name`. If `name` isn't a
+ /// valid capture group or didn't match anything, then `None` is returned.
+ pub fn name(&self, name: &str) -> Option<Match<'t>> {
+ self.named_groups.get(name).and_then(|&i| self.get(i))
+ }
+
+ /// An iterator that yields all capturing matches in the order in which
+ /// they appear in the regex. If a particular capture group didn't
+ /// participate in the match, then `None` is yielded for that capture.
+ ///
+ /// The first match always corresponds to the overall match of the regex.
+ pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
+ SubCaptureMatches { caps: self, it: self.locs.iter() }
+ }
+
+ /// Expands all instances of `$name` in `replacement` to the corresponding
+ /// capture group `name`, and writes them to the `dst` buffer given.
+ ///
+ /// `name` may be an integer corresponding to the index of the capture
+ /// group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// If `name` isn't a valid capture group (whether the name doesn't exist
+ /// or isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+ /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+ /// capture group at index `1`. To exert more precise control over the
+ /// name, or to refer to a capture group name that uses characters outside
+ /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+ /// using braces, any sequence of characters is permitted. If the sequence
+ /// does not refer to a capture group name in the corresponding regex, then
+ /// it is replaced with an empty string.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})").unwrap();
+ /// let caps = re.captures("2010-03").unwrap();
+ ///
+ /// let mut dst = String::new();
+ /// caps.expand("$m/$y", &mut dst);
+ /// assert_eq!(dst, "03/2010");
+ /// ```
+ pub fn expand(&self, replacement: &str, dst: &mut String) {
+ // All of the template parsing lives in the shared `expand_str` helper.
+ expand_str(self, replacement, dst)
+ }
+
+ /// Returns the total number of capture groups (even if they didn't match).
+ ///
+ /// This is always at least `1`, since every regex has at least one capture
+ /// group that corresponds to the full match.
+ ///
+ /// The value is the length reported by the underlying capture locations.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.locs.len()
+ }
+}
+
+/// Formats as `Captures(..)`, wrapping a map of every group to its text.
+impl<'t> fmt::Debug for Captures<'t> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ // The per-group rendering is handled by the `CapturesDebug` adaptor.
+ let groups = CapturesDebug(self);
+ let mut out = f.debug_tuple("Captures");
+ out.field(&groups);
+ out.finish()
+ }
+}
+
+/// Private adaptor that renders the groups of a `Captures` as a map from
+/// group name (or group index, for unnamed groups) to the matched text.
+struct CapturesDebug<'c, 't>(&'c Captures<'t>);
+
+impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let caps = self.0;
+ // The name table maps name -> index; invert it so that each slot can
+ // be labelled by its name. This allocates, which is an acceptable
+ // price for readable debug output.
+ let slot_to_name: HashMap<&usize, &String> = caps
+ .named_groups
+ .iter()
+ .map(|(name, slot)| (slot, name))
+ .collect();
+ let mut map = f.debug_map();
+ for (slot, loc) in caps.locs.iter().enumerate() {
+ // `None` marks a group that did not participate in the match.
+ let text = loc.map(|(start, end)| &caps.text[start..end]);
+ match slot_to_name.get(&slot) {
+ Some(name) => map.entry(&name, &text),
+ None => map.entry(&slot, &text),
+ };
+ }
+ map.finish()
+ }
+}
+
+/// Access the text of a capture group by its index.
+///
+/// `'t` is the lifetime of the matched text.
+///
+/// Because of how `Index` is defined (normally `a[i]` is part of `a` and
+/// can't outlive it), the returned text can't outlive the `Captures` object
+/// when this syntax is used. Use `get()` instead to obtain text that can.
+///
+/// # Panics
+///
+/// If there is no group at the given index.
+impl<'t> Index<usize> for Captures<'t> {
+ type Output = str;
+
+ fn index(&self, i: usize) -> &str {
+ match self.get(i) {
+ Some(m) => m.as_str(),
+ None => panic!("no group at index '{}'", i),
+ }
+ }
+}
+
+/// Access the text of a capture group by its name.
+///
+/// `'t` is the lifetime of the matched text and `'i` is the lifetime
+/// of the group name (the index).
+///
+/// Because of how `Index` is defined (normally `a[i]` is part of `a` and
+/// can't outlive it), the returned text can't outlive the `Captures` object
+/// when this syntax is used. Use `name` instead to obtain text that can.
+///
+/// # Panics
+///
+/// If there is no group named by the given value.
+impl<'t, 'i> Index<&'i str> for Captures<'t> {
+ type Output = str;
+
+ fn index<'a>(&'a self, name: &'i str) -> &'a str {
+ match self.name(name) {
+ Some(m) => m.as_str(),
+ None => panic!("no group named '{}'", name),
+ }
+ }
+}
+
+/// Iterator over every capture group of a single match, in the order in
+/// which the groups appear in the regex.
+///
+/// A capture group that didn't participate in the match is yielded as
+/// `None`. The first item always corresponds to the overall match of the
+/// regex.
+///
+/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
+/// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone, Debug)]
+pub struct SubCaptureMatches<'c, 't> {
+ caps: &'c Captures<'t>,
+ it: SubCapturesPosIter<'c>,
+}
+
+impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
+ type Item = Option<Match<'t>>;
+
+ fn next(&mut self) -> Option<Option<Match<'t>>> {
+ let text = self.caps.text;
+ // The outer `Option` signals exhaustion; the inner one signals
+ // whether this particular group participated in the match.
+ let slot = self.it.next()?;
+ Some(slot.map(|(start, end)| Match::new(text, start, end)))
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+
+ fn count(self) -> usize {
+ self.it.count()
+ }
+}
+
+impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {}
+
+impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
+
+/// Iterator over the capture groups of each successive non-overlapping match
+/// of a particular regular expression.
+///
+/// The iterator stops when no more matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the matched string.
+#[derive(Debug)]
+pub struct CaptureMatches<'r, 't>(
+ re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>,
+);
+
+impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
+ type Item = Captures<'t>;
+
+ fn next(&mut self) -> Option<Captures<'t>> {
+ let locs = self.0.next()?;
+ // Pair the raw locations with the searched text and the regex's
+ // name-to-index table to form a full `Captures` value.
+ Some(Captures {
+ text: self.0.text(),
+ locs,
+ named_groups: self.0.regex().capture_name_idx().clone(),
+ })
+ }
+}
+
+impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
+
+/// Iterator over every non-overlapping match in a particular string.
+///
+/// Each item is a `Match` value. The iterator stops when no more matches
+/// can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the matched string.
+#[derive(Debug)]
+pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>);
+
+impl<'r, 't> Iterator for Matches<'r, 't> {
+ type Item = Match<'t>;
+
+ fn next(&mut self) -> Option<Match<'t>> {
+ let text = self.0.text();
+ // The inner iterator yields byte offsets; wrap them with the text.
+ let (start, end) = self.0.next()?;
+ Some(Match::new(text, start, end))
+ }
+}
+
+impl<'r, 't> FusedIterator for Matches<'r, 't> {}
+
+/// Replacer describes types that can be used to replace matches in a string.
+///
+/// In general, users of this crate shouldn't need to implement this trait,
+/// since implementations are already provided for `&str` along with other
+/// variants of string types and `FnMut(&Captures) -> String` (or any
+/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases.
+pub trait Replacer {
+ /// Appends text to `dst` to replace the current match.
+ ///
+ /// The current match is represented by `caps`, which is guaranteed to
+ /// have a match at capture group `0`.
+ ///
+ /// For example, a no-op replacement would be
+ /// `dst.push_str(caps.get(0).unwrap().as_str())`.
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String);
+
+ /// Return a fixed unchanging replacement string.
+ ///
+ /// When doing replacements, if access to `Captures` is not needed (e.g.,
+ /// the replacement string does not need `$` expansion), then it can
+ /// be beneficial to avoid finding sub-captures.
+ ///
+ /// In general, this is called once for every call to `replacen`.
+ ///
+ /// The default implementation returns `None`, meaning that no fixed
+ /// replacement string is available.
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
+ None
+ }
+
+ /// Return a `Replacer` that borrows and wraps this `Replacer`.
+ ///
+ /// This is useful when you want to take a generic `Replacer` (which might
+ /// not be cloneable) and use it without consuming it, so it can be used
+ /// more than once.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::{Regex, Replacer};
+ ///
+ /// fn replace_all_twice<R: Replacer>(
+ /// re: Regex,
+ /// src: &str,
+ /// mut rep: R,
+ /// ) -> String {
+ /// let dst = re.replace_all(src, rep.by_ref());
+ /// let dst = re.replace_all(&dst, rep.by_ref());
+ /// dst.into_owned()
+ /// }
+ /// ```
+ fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
+ ReplacerRef(self)
+ }
+}
+
+/// By-reference adaptor for a `Replacer`
+///
+/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
+///
+/// It holds a mutable borrow of the wrapped replacer for `'a`.
+#[derive(Debug)]
+pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
+
+/// Both methods forward straight to the borrowed replacer.
+impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ self.0.replace_append(caps, dst)
+ }
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ self.0.no_expansion()
+ }
+}
+
+/// A string slice acts as a replacement template: it is expanded against the
+/// current captures with `Captures::expand`.
+impl<'a> Replacer for &'a str {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ caps.expand(*self, dst);
+ }
+
+ // Yields the template itself when it contains no `$`, otherwise `None`.
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+/// A borrowed `String` is treated as a replacement template, exactly like
+/// the string slice it derefs to.
+impl<'a> Replacer for &'a String {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ // Reuse the `&str` implementation.
+ self.as_str().replace_append(caps, dst)
+ }
+
+ // Yields the template itself when it contains no `$`, otherwise `None`.
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+/// An owned `String` is treated as a replacement template, exactly like a
+/// string slice of its contents.
+impl Replacer for String {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ // Reuse the `&str` implementation.
+ self.as_str().replace_append(caps, dst)
+ }
+
+ // Yields the template itself when it contains no `$`, otherwise `None`.
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+/// A `Cow<str>` is treated as a replacement template, whether it is borrowed
+/// or owned.
+impl<'a> Replacer for Cow<'a, str> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ // View the contents as a string and expand that.
+ self.as_ref().replace_append(caps, dst)
+ }
+
+ // Yields the template itself when it contains no `$`, otherwise `None`.
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+/// A reference to a `Cow<str>` is treated as a replacement template, whether
+/// the `Cow` is borrowed or owned.
+impl<'a> Replacer for &'a Cow<'a, str> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ // View the contents as a string and expand that.
+ self.as_ref().replace_append(caps, dst)
+ }
+
+ // Yields the template itself when it contains no `$`, otherwise `None`.
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+/// Shared `no_expansion` logic for the string-like replacers.
+///
+/// Returns the string borrowed as-is when it contains no `$` byte, and
+/// `None` as soon as a `$` is present anywhere in it.
+fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> {
+ let s = t.as_ref();
+ if find_byte(b'$', s.as_bytes()).is_some() {
+ return None;
+ }
+ Some(Cow::Borrowed(s))
+}
+
+/// Closures and functions act as replacers: the callback is invoked with the
+/// captures of the current match and the string it returns is appended.
+impl<F, T> Replacer for F
+where
+ F: FnMut(&Captures<'_>) -> T,
+ T: AsRef<str>,
+{
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ let produced = (*self)(caps);
+ dst.push_str(produced.as_ref());
+ }
+}
+
+/// `NoExpand` indicates literal string replacement.
+///
+/// It can be used with `replace` and `replace_all` to do a literal string
+/// replacement without expanding `$name` to their corresponding capture
+/// groups. This can be both convenient (to avoid escaping `$`, for example)
+/// and performant (since capture groups don't need to be found).
+///
+/// `'t` is the lifetime of the literal text.
+#[derive(Clone, Debug)]
+pub struct NoExpand<'t>(pub &'t str);
+
+impl<'t> Replacer for NoExpand<'t> {
+ // The captures are ignored; the wrapped text is appended verbatim.
+ fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) {
+ dst.push_str(self.0);
+ }
+
+ // Always reports the wrapped text as a fixed replacement, even when it
+ // contains `$`.
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ Some(Cow::Borrowed(self.0))
+ }
+}
diff --git a/third_party/rust/regex/src/sparse.rs b/third_party/rust/regex/src/sparse.rs
new file mode 100644
index 0000000000..98b726613d
--- /dev/null
+++ b/third_party/rust/regex/src/sparse.rs
@@ -0,0 +1,84 @@
+use std::fmt;
+use std::ops::Deref;
+use std::slice;
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: https://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse allocations, so the initial allocation cost is bearable. However,
+/// its other properties listed above are extremely useful.
+#[derive(Clone)]
+pub struct SparseSet {
+ /// Dense contains the instruction pointers in the order in which they
+ /// were inserted.
+ dense: Vec<usize>,
+ /// Sparse maps instruction pointers to their location in dense.
+ ///
+ /// An instruction pointer is in the set if and only if
+ /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
+ sparse: Box<[usize]>,
+}
+
+impl SparseSet {
+ /// Creates an empty set with room for `size` elements.
+ ///
+ /// The sparse table has exactly `size` entries, so only values in
+ /// `0..size` can be stored or queried.
+ pub fn new(size: usize) -> SparseSet {
+ let dense = Vec::with_capacity(size);
+ let sparse = vec![0; size].into_boxed_slice();
+ SparseSet { dense, sparse }
+ }
+
+ /// Returns the number of elements currently in the set.
+ pub fn len(&self) -> usize {
+ self.dense.len()
+ }
+
+ /// Returns true if the set holds no elements.
+ pub fn is_empty(&self) -> bool {
+ self.dense.is_empty()
+ }
+
+ /// Returns the capacity of the dense vector, which bounds how many
+ /// elements `insert` will accept.
+ pub fn capacity(&self) -> usize {
+ self.dense.capacity()
+ }
+
+ /// Appends `value` to the set.
+ ///
+ /// No membership check is made here, so callers that need uniqueness
+ /// should consult `contains` first.
+ ///
+ /// # Panics
+ ///
+ /// If the set is already at capacity, or if `value` is outside the
+ /// bounds of the sparse table.
+ pub fn insert(&mut self, value: usize) {
+ let i = self.len();
+ // Guarantees that the push below never reallocates.
+ assert!(i < self.capacity());
+ self.dense.push(value);
+ self.sparse[value] = i;
+ }
+
+ /// Returns true if `value` is in the set.
+ ///
+ /// # Panics
+ ///
+ /// If `value` is outside the bounds of the sparse table.
+ pub fn contains(&self, value: usize) -> bool {
+ // A stale sparse entry either points past the end of `dense` or at
+ // a slot that holds a different value.
+ match self.dense.get(self.sparse[value]) {
+ Some(&found) => found == value,
+ None => false,
+ }
+ }
+
+ /// Removes every element. Only the dense vector is reset; the sparse
+ /// table is left untouched.
+ pub fn clear(&mut self) {
+ self.dense.clear();
+ }
+}
+
+/// Shows only the dense vector, i.e. the members in insertion order.
+impl fmt::Debug for SparseSet {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "SparseSet({:?})", self.dense)
+ }
+}
+
+/// Exposes the members as a slice, in insertion order.
+impl Deref for SparseSet {
+ type Target = [usize];
+
+ fn deref(&self) -> &Self::Target {
+ &self.dense
+ }
+}
+
+/// Allows `for x in &set`, visiting the members in insertion order.
+impl<'a> IntoIterator for &'a SparseSet {
+ type Item = &'a usize;
+ type IntoIter = slice::Iter<'a, usize>;
+ fn into_iter(self) -> Self::IntoIter {
+ // Resolves to the slice iterator through the `Deref` impl.
+ self.iter()
+ }
+}
diff --git a/third_party/rust/regex/src/testdata/LICENSE b/third_party/rust/regex/src/testdata/LICENSE
new file mode 100644
index 0000000000..f47dbf4c44
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/LICENSE
@@ -0,0 +1,19 @@
+The following license covers testregex.c and all associated test data.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following disclaimer:
+
+THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/rust/regex/src/testdata/README b/third_party/rust/regex/src/testdata/README
new file mode 100644
index 0000000000..6efc2dad33
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/README
@@ -0,0 +1,17 @@
+Test data was taken from the Go distribution, which was in turn taken from the
+testregex test suite:
+
+ http://www2.research.att.com/~astopen/testregex/testregex.html
+
+The LICENSE in this directory corresponds to the LICENSE that the data was
+released under.
+
+The tests themselves were modified for RE2/Go. A couple were modified further
+by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
+(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
+have been a bad idea, but I think being consistent with an established Regex
+library is worth something.
+
+Note that these files are read by 'scripts/regex-match-tests.py' and turned
+into Rust tests found in 'regex_macros/tests/matches.rs'.
+
diff --git a/third_party/rust/regex/src/testdata/basic.dat b/third_party/rust/regex/src/testdata/basic.dat
new file mode 100644
index 0000000000..632e1bb416
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/basic.dat
@@ -0,0 +1,221 @@
+NOTE all standard compliant implementations should pass these : 2002-05-31
+
+BE abracadabra$ abracadabracadabra (7,18)
+BE a...b abababbb (2,7)
+BE XXXXXX ..XXXXXX (2,8)
+E \) () (1,2)
+BE a] a]a (0,2)
+B } } (0,1)
+E \} } (0,1)
+BE \] ] (0,1)
+B ] ] (0,1)
+E ] ] (0,1)
+B { { (0,1)
+B } } (0,1)
+BE ^a ax (0,1)
+BE \^a a^a (1,3)
+BE a\^ a^ (0,2)
+BE a$ aa (1,2)
+BE a\$ a$ (0,2)
+BE ^$ NULL (0,0)
+E $^ NULL (0,0)
+E a($) aa (1,2)(2,2)
+E a*(^a) aa (0,1)(0,1)
+E (..)*(...)* a (0,0)
+E (..)*(...)* abcd (0,4)(2,4)
+E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
+E (ab)c|abc abc (0,3)(0,2)
+E a{0}b ab (1,2)
+E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E a{9876543210} NULL BADBR
+E ((a|a)|a) a (0,1)(0,1)(0,1)
+E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
+E a*(a.|aa) aaaa (0,4)(2,4)
+E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
+E (a|b)?.* b (0,1)(0,1)
+E (a|b)c|a(b|c) ac (0,2)(0,1)
+E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
+E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
+E (a|b)*c|(a|ab)*c xc (1,2)
+E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
+E a?(ab|ba)ab abab (0,4)(0,2)
+E a?(ac{0}b|ba)ab abab (0,4)(0,2)
+E ab|abab abbabab (0,2)
+E aba|bab|bba baaabbbaba (5,8)
+E aba|bab baaabbbaba (6,9)
+E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
+E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
+E ab|a xabc (1,3)
+E ab|a xxabc (2,4)
+Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4)
+BE [^-] --a (2,3)
+BE [a-]* --a (0,3)
+BE [a-m-]* --amoma-- (0,4)
+E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
+E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
+{E [[:upper:]] A (0,1) [[<element>]] not supported
+E [[:lower:]]+ `az{ (1,3)
+E [[:upper:]]+ @AZ[ (1,3)
+# No collation in Go
+#BE [[-]] [[-]] (2,4)
+#BE [[.NIL.]] NULL ECOLLATE
+#BE [[=aleph=]] NULL ECOLLATE
+}
+BE$ \n \n (0,1)
+BEn$ \n \n (0,1)
+BE$ [^a] \n (0,1)
+BE$ \na \na (0,2)
+E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
+BE xxx xxx (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
+E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
+E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
+E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
+E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
+BE$ .* \x01\x7f (0,2)
+E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
+L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
+E a*a*a*a*a*b aaaaaaaaab (0,10)
+BE ^ NULL (0,0)
+BE $ NULL (0,0)
+BE ^$ NULL (0,0)
+BE ^a$ a (0,1)
+BE abc abc (0,3)
+BE abc xabcy (1,4)
+BE abc ababc (2,5)
+BE ab*c abc (0,3)
+BE ab*bc abc (0,3)
+BE ab*bc abbc (0,4)
+BE ab*bc abbbbc (0,6)
+E ab+bc abbc (0,4)
+E ab+bc abbbbc (0,6)
+E ab?bc abbc (0,4)
+E ab?bc abc (0,3)
+E ab?c abc (0,3)
+BE ^abc$ abc (0,3)
+BE ^abc abcc (0,3)
+BE abc$ aabc (1,4)
+BE ^ abc (0,0)
+BE $ abc (3,3)
+BE a.c abc (0,3)
+BE a.c axc (0,3)
+BE a.*c axyzc (0,5)
+BE a[bc]d abd (0,3)
+BE a[b-d]e ace (0,3)
+BE a[b-d] aac (1,3)
+BE a[-b] a- (0,2)
+BE a[b-] a- (0,2)
+BE a] a] (0,2)
+BE a[]]b a]b (0,3)
+BE a[^bc]d aed (0,3)
+BE a[^-b]c adc (0,3)
+BE a[^]b]c adc (0,3)
+E ab|cd abc (0,2)
+E ab|cd abcd (0,2)
+E a\(b a(b (0,3)
+E a\(*b ab (0,2)
+E a\(*b a((b (0,4)
+E ((a)) abc (0,1)(0,1)(0,1)
+E (a)b(c) abc (0,3)(0,1)(2,3)
+E a+b+c aabbabc (4,7)
+E a* aaa (0,3)
+#E (a*)* - (0,0)(0,0)
+E (a*)* - (0,0)(?,?) RE2/Go
+E (a*)+ - (0,0)(0,0)
+#E (a*|b)* - (0,0)(0,0)
+E (a*|b)* - (0,0)(?,?) RE2/Go
+E (a+|b)* ab (0,2)(1,2)
+E (a+|b)+ ab (0,2)(1,2)
+E (a+|b)? ab (0,1)(0,1)
+BE [^ab]* cde (0,3)
+#E (^)* - (0,0)(0,0)
+E (^)* - (0,0)(?,?) RE2/Go
+BE a* NULL (0,0)
+E ([abc])*d abbbcd (0,6)(4,5)
+E ([abc])*bcd abcd (0,4)(0,1)
+E a|b|c|d|e e (0,1)
+E (a|b|c|d|e)f ef (0,2)(0,1)
+#E ((a*|b))* - (0,0)(0,0)(0,0)
+E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
+BE abcd*efg abcdefg (0,7)
+BE ab* xabyabbbz (1,3)
+BE ab* xayabbbz (1,2)
+E (ab|cd)e abcde (2,5)(2,4)
+BE [abhgefdc]ij hij (0,3)
+E (a|b)c*d abcd (1,4)(1,2)
+E (ab|ab*)bc abc (0,3)(0,1)
+E a([bc]*)c* abc (0,3)(1,3)
+E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
+E a[bcd]*dcdcde adcdcde (0,7)
+E (ab|a)b*c abc (0,3)(0,2)
+E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
+BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
+E ^a(bc+|b[eh])g|.h$ abh (1,3)
+E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
+E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
+E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
+E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
+BE multiple words multiple words yeah (0,14)
+E (.*)c(.*) abcde (0,5)(0,2)(3,5)
+BE abcd abcd (0,4)
+E a(bc)d abcd (0,4)(1,3)
+E a[-]?c ac (0,3)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
+E a+(b|c)*d+ aabcdd (0,6)(3,4)
+E ^.+$ vivi (0,4)
+E ^(.+)$ vivi (0,4)(0,4)
+E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
+E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
+E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
+E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
+E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
+E ((foo)|bar)!bas bar!bas (0,7)(0,3)
+E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
+E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
+E (foo|(bar))!bas foo!bas (0,7)(0,3)
+E (foo|bar)!bas bar!bas (0,7)(0,3)
+E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
+E (foo|bar)!bas foo!bas (0,7)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
+E .*(/XXX).* /XXX (0,4)(0,4)
+E .*(\\XXX).* \XXX (0,4)(0,4)
+E \\XXX \XXX (0,4)
+E .*(/000).* /000 (0,4)(0,4)
+E .*(\\000).* \000 (0,4)(0,4)
+E \\000 \000 (0,4)
diff --git a/third_party/rust/regex/src/testdata/nullsubexpr.dat b/third_party/rust/regex/src/testdata/nullsubexpr.dat
new file mode 100644
index 0000000000..2e18fbb917
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/nullsubexpr.dat
@@ -0,0 +1,79 @@
+NOTE null subexpression matches : 2002-06-06
+
+E (a*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)* a (0,1)(0,1)
+E SAME x (0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)+ a (0,1)(0,1)
+E SAME x NOMATCH
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+
+E ([a]*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([a]*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([^b]*)* a (0,1)(0,1)
+#E SAME b (0,0)(0,0)
+E SAME b (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaab (0,6)(0,6)
+E ([ab]*)* a (0,1)(0,1)
+E SAME aaaaaa (0,6)(0,6)
+E SAME ababab (0,6)(0,6)
+E SAME bababa (0,6)(0,6)
+E SAME b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+E SAME aaaabcde (0,5)(0,5)
+E ([^a]*)* b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+#E SAME aaaaaa (0,0)(0,0)
+E SAME aaaaaa (0,0)(?,?) RE2/Go
+E ([^ab]*)* ccccxx (0,6)(0,6)
+#E SAME ababab (0,0)(0,0)
+E SAME ababab (0,0)(?,?) RE2/Go
+
+E ((z)+|a)* zabcde (0,2)(1,2)
+
+#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
+#E (a) aaa (0,1)(0,1)
+#E (a*?) aaa (0,0)(0,0)
+#E (a)*? aaa (0,0)
+#E (a*?)*? aaa (0,0)
+#}
+
+B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
+B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
+B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
+B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
+B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
+B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
+B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
+B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
+
+#E (a*)*(x) x (0,1)(0,0)(0,1)
+E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
+E (a*)*(x) ax (0,2)(0,1)(1,2)
+E (a*)*(x) axa (0,2)(0,1)(1,2)
+
+E (a*)+(x) x (0,1)(0,0)(0,1)
+E (a*)+(x) ax (0,2)(0,1)(1,2)
+E (a*)+(x) axa (0,2)(0,1)(1,2)
+
+E (a*){2}(x) x (0,1)(0,0)(0,1)
+E (a*){2}(x) ax (0,2)(1,1)(1,2)
+E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/third_party/rust/regex/src/testdata/repetition.dat b/third_party/rust/regex/src/testdata/repetition.dat
new file mode 100644
index 0000000000..3bb2121180
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/repetition.dat
@@ -0,0 +1,163 @@
+NOTE implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+# NOMATCH
+# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+E ((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
+
+E ((..)|(.)){1} NULL NOMATCH
+E ((..)|(.)){2} NULL NOMATCH
+E ((..)|(.)){3} NULL NOMATCH
+
+E ((..)|(.))* NULL (0,0)
+
+E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.))((..)|(.)) a NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
+
+E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.)){2} a NOMATCH
+E ((..)|(.)){3} a NOMATCH
+
+E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
+
+E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
+E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
+
+E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
+E ((..)|(.)){3} aa NOMATCH
+
+E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
+
+E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
+E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
+
+E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
+#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
+
+#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+
+E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
+
+E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
+E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
+
+E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
+
+E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
+
+E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
+
+E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
+E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
+
+# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
+# Linux/GLIBC gets the {8,} and {8,8} wrong.
+
+:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
+:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
+:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
+:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
+:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
+:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
+:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
+:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
+:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
+#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
+:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
+:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
+:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
+:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
+:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
+:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
+:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
+:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
+:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
+
+# These test a fixed bug in my regex-tdfa that did not keep the expanded
+# form properly grouped, so right association did the wrong thing with
+# these ambiguous patterns (crafted just to test my code when I became
+# suspicious of my implementation). The first subexpression should use
+# "ab" then "a" then "bcd".
+
+# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
+# results like (0,6)(4,5)(6,6).
+
+:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
+:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
+:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
+:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
+
+# The above worked on Linux/GLIBC but the following often fail.
+# They also trip up OS X / FreeBSD / NetBSD:
+
+#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
+#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
+#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
+:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
+:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/third_party/rust/regex/src/utf8.rs b/third_party/rust/regex/src/utf8.rs
new file mode 100644
index 0000000000..2dfd2c0d1d
--- /dev/null
+++ b/third_party/rust/regex/src/utf8.rs
@@ -0,0 +1,264 @@
+/// A few elementary UTF-8 encoding and decoding functions used by the matching
+/// engines.
+///
+/// In an ideal world, the matching engines operate on `&str` and we can just
+/// lean on the standard library for all our UTF-8 needs. However, to support
+/// byte based regexes (that can match on arbitrary bytes which may contain
+/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
+/// The standard library doesn't really recognize this use case, so we have
+/// to build it out ourselves.
+///
+/// Should this be factored out into a separate crate? It seems independently
+/// useful. There are other crates that already exist (e.g., `utf-8`) that have
+/// overlapping use cases. Not sure what to do.
+use std::char;
+
+const TAG_CONT: u8 = 0b1000_0000;
+const TAG_TWO: u8 = 0b1100_0000;
+const TAG_THREE: u8 = 0b1110_0000;
+const TAG_FOUR: u8 = 0b1111_0000;
+
+/// Returns the index at which the next UTF-8 sequence would begin, assuming the
+/// byte at `i` leads a sequence; returns `i + 1` when `i` is out of bounds.
+pub fn next_utf8(text: &[u8], i: usize) -> usize {
+ let b = match text.get(i) {
+ None => return i + 1,
+ Some(&b) => b,
+ };
+ let inc = if b <= 0x7F {
+ 1
+ } else if b <= 0b110_11111 {
+ 2
+ } else if b <= 0b1110_1111 {
+ 3
+ } else {
+ 4
+ };
+ i + inc
+}
+
+/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
+///
+/// If no valid UTF-8 sequence could be found, then `None` is returned.
+/// Otherwise, the decoded codepoint and the number of bytes read are returned.
+/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
+/// 1, 2, 3 or 4.
+///
+/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
+/// codepoint that is out of range (surrogate codepoints are out of range) or
+/// is not the shortest possible UTF-8 sequence for that codepoint.
+#[inline]
+pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
+ let b0 = match src.get(0) {
+ None => return None,
+ Some(&b) if b <= 0x7F => return Some((b as char, 1)),
+ Some(&b) => b,
+ };
+ match b0 {
+ 0b110_00000..=0b110_11111 => {
+ if src.len() < 2 {
+ return None;
+ }
+ let b1 = src[1];
+ if 0b11_000000 & b1 != TAG_CONT {
+ return None;
+ }
+ let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
+ match cp {
+ 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
+ _ => None,
+ }
+ }
+ 0b1110_0000..=0b1110_1111 => {
+ if src.len() < 3 {
+ return None;
+ }
+ let (b1, b2) = (src[1], src[2]);
+ if 0b11_000000 & b1 != TAG_CONT {
+ return None;
+ }
+ if 0b11_000000 & b2 != TAG_CONT {
+ return None;
+ }
+ let cp = ((b0 & !TAG_THREE) as u32) << 12
+ | ((b1 & !TAG_CONT) as u32) << 6
+ | ((b2 & !TAG_CONT) as u32);
+ match cp {
+ // char::from_u32 will disallow surrogate codepoints.
+ 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
+ _ => None,
+ }
+ }
+ 0b11110_000..=0b11110_111 => {
+ if src.len() < 4 {
+ return None;
+ }
+ let (b1, b2, b3) = (src[1], src[2], src[3]);
+ if 0b11_000000 & b1 != TAG_CONT {
+ return None;
+ }
+ if 0b11_000000 & b2 != TAG_CONT {
+ return None;
+ }
+ if 0b11_000000 & b3 != TAG_CONT {
+ return None;
+ }
+ let cp = ((b0 & !TAG_FOUR) as u32) << 18
+ | ((b1 & !TAG_CONT) as u32) << 12
+ | ((b2 & !TAG_CONT) as u32) << 6
+ | ((b3 & !TAG_CONT) as u32);
+ match cp {
+ 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
+ _ => None,
+ }
+ }
+ _ => None,
+ }
+}
+
+/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
+/// of the first.
+pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
+ if src.is_empty() {
+ return None;
+ }
+ let mut start = src.len() - 1;
+ if src[start] <= 0x7F {
+ return Some((src[start] as char, 1));
+ }
+ while start > src.len().saturating_sub(4) {
+ start -= 1;
+ if is_start_byte(src[start]) {
+ break;
+ }
+ }
+ match decode_utf8(&src[start..]) {
+ None => None,
+ Some((_, n)) if n < src.len() - start => None,
+ Some((cp, n)) => Some((cp, n)),
+ }
+}
+
+fn is_start_byte(b: u8) -> bool {
+ b & 0b11_000000 != 0b1_0000000
+}
+
+#[cfg(test)]
+mod tests {
+ use std::str;
+
+ use quickcheck::quickcheck;
+
+ use super::{
+ decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
+ };
+
+ #[test]
+ fn prop_roundtrip() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let encoded_len = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
+ encoded_len == got_len && given_cp == got_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_roundtrip_last() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let encoded_len = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, got_len) =
+ decode_last_utf8(&tmp[..encoded_len]).unwrap();
+ encoded_len == got_len && given_cp == got_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_encode_matches_std() {
+ fn p(cp: char) -> bool {
+ let mut got = [0; 4];
+ let n = cp.encode_utf8(&mut got).len();
+ let expected = cp.to_string();
+ &got[..n] == expected.as_bytes()
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_decode_matches_std() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let n = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
+ let expected_cp =
+ str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
+ got_cp == expected_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_decode_last_matches_std() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let n = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
+ let expected_cp = str::from_utf8(&tmp[..n])
+ .unwrap()
+ .chars()
+ .rev()
+ .next()
+ .unwrap();
+ got_cp == expected_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn reject_invalid() {
+ // Invalid start byte
+ assert_eq!(decode_utf8(&[0xFF]), None);
+ // Surrogate codepoint (U+D801), which is not a valid Unicode scalar value
+ assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
+ // Invalid continuation byte.
+ assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
+ // Bad lengths
+ assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
+ assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
+ assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
+ // Not a minimal UTF-8 sequence
+ assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
+ assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
+ assert_eq!(
+ decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
+ None
+ );
+ }
+
+ #[test]
+ fn reject_invalid_last() {
+ // Invalid start byte
+ assert_eq!(decode_last_utf8(&[0xFF]), None);
+ // Surrogate codepoint (U+D801), which is not a valid Unicode scalar value
+ assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
+ // Bad lengths
+ assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
+ assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
+ assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
+ // Not a minimal UTF-8 sequence
+ assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
+ assert_eq!(
+ decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
+ None
+ );
+ assert_eq!(
+ decode_last_utf8(
+ &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
+ ),
+ None
+ );
+ }
+}
diff --git a/third_party/rust/regex/test b/third_party/rust/regex/test
new file mode 100755
index 0000000000..b10564f128
--- /dev/null
+++ b/third_party/rust/regex/test
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -e
+
+# This is a convenience script for running a broad swath of tests across
+# features. We don't test the complete space, since the complete space is quite
+# large. Hopefully once we migrate the test suite to better infrastructure
+# (like regex-automata), we'll be able to test more of the space.
+echo "===== DEFAULT FEATURES ==="
+cargo test
+
+echo "===== DOC TESTS ==="
+cargo test --doc
+
+features=(
+ "std"
+ "std unicode"
+ "std unicode-perl"
+ "std perf"
+ "std perf-cache"
+ "std perf-dfa"
+ "std perf-inline"
+ "std perf-literal"
+)
+for f in "${features[@]}"; do
+ echo "===== FEATURE: $f (default) ==="
+ cargo test --test default --no-default-features --features "$f"
+ echo "===== FEATURE: $f (default-bytes) ==="
+ cargo test --test default-bytes --no-default-features --features "$f"
+done
diff --git a/third_party/rust/regex/tests/api.rs b/third_party/rust/regex/tests/api.rs
new file mode 100644
index 0000000000..c7250a8a3a
--- /dev/null
+++ b/third_party/rust/regex/tests/api.rs
@@ -0,0 +1,234 @@
+#[test]
+fn empty_regex_empty_match() {
+ let re = regex!("");
+ assert_eq!(vec![(0, 0)], findall!(re, ""));
+}
+
+#[test]
+fn empty_regex_nonempty_match() {
+ let re = regex!("");
+ assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc"));
+}
+
+#[test]
+fn one_zero_length_match() {
+ let re = regex!(r"[0-9]*");
+ assert_eq!(vec![(0, 0), (1, 2), (3, 4)], findall!(re, "a1b2"));
+}
+
+#[test]
+fn many_zero_length_match() {
+ let re = regex!(r"[0-9]*");
+ assert_eq!(
+ vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)],
+ findall!(re, "a1bbb2")
+ );
+}
+
+#[test]
+fn many_sequential_zero_length_match() {
+ let re = regex!(r"[0-9]?");
+ assert_eq!(
+ vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)],
+ findall!(re, "a12b3c")
+ );
+}
+
+#[test]
+fn quoted_bracket_set() {
+ let re = regex!(r"([\x{5b}\x{5d}])");
+ assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
+ let re = regex!(r"([\[\]])");
+ assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
+}
+
+#[test]
+fn first_range_starts_with_left_bracket() {
+ let re = regex!(r"([\[-z])");
+ assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
+}
+
+#[test]
+fn range_ends_with_escape() {
+ let re = regex!(r"([\[-\x{5d}])");
+ assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
+}
+
+#[test]
+fn empty_match_find_iter() {
+ let re = regex!(r".*?");
+ assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc"));
+}
+
+#[test]
+fn empty_match_captures_iter() {
+ let re = regex!(r".*?");
+ let ms: Vec<_> = re
+ .captures_iter(text!("abc"))
+ .map(|c| c.get(0).unwrap())
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]);
+}
+
+#[test]
+fn capture_names() {
+ let re = regex!(r"(.)(?P<a>.)");
+ assert_eq!(3, re.captures_len());
+ assert_eq!((3, Some(3)), re.capture_names().size_hint());
+ assert_eq!(
+ vec![None, None, Some("a")],
+ re.capture_names().collect::<Vec<_>>()
+ );
+}
+
+#[test]
+fn regex_string() {
+ assert_eq!(r"[a-zA-Z0-9]+", regex!(r"[a-zA-Z0-9]+").as_str());
+ assert_eq!(r"[a-zA-Z0-9]+", &format!("{}", regex!(r"[a-zA-Z0-9]+")));
+ assert_eq!(r"[a-zA-Z0-9]+", &format!("{:?}", regex!(r"[a-zA-Z0-9]+")));
+}
+
+#[test]
+fn capture_index() {
+ let re = regex!(r"^(?P<name>.+)$");
+ let cap = re.captures(t!("abc")).unwrap();
+ assert_eq!(&cap[0], t!("abc"));
+ assert_eq!(&cap[1], t!("abc"));
+ assert_eq!(&cap["name"], t!("abc"));
+}
+
+#[test]
+#[should_panic]
+#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)]
+fn capture_index_panic_usize() {
+ let re = regex!(r"^(?P<name>.+)$");
+ let cap = re.captures(t!("abc")).unwrap();
+ let _ = cap[2];
+}
+
+#[test]
+#[should_panic]
+#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)]
+fn capture_index_panic_name() {
+ let re = regex!(r"^(?P<name>.+)$");
+ let cap = re.captures(t!("abc")).unwrap();
+ let _ = cap["bad name"];
+}
+
+#[test]
+fn capture_index_lifetime() {
+ // This is a test of whether the types on `caps["..."]` are general
+ // enough. If not, this will fail to typecheck.
+ fn inner(s: &str) -> usize {
+ let re = regex!(r"(?P<number>[0-9]+)");
+ let caps = re.captures(t!(s)).unwrap();
+ caps["number"].len()
+ }
+ assert_eq!(3, inner("123"));
+}
+
+#[test]
+fn capture_misc() {
+ let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)");
+ let cap = re.captures(t!("abc")).unwrap();
+
+ assert_eq!(5, cap.len());
+
+ assert_eq!((0, 3), {
+ let m = cap.get(0).unwrap();
+ (m.start(), m.end())
+ });
+ assert_eq!(None, cap.get(2));
+ assert_eq!((2, 3), {
+ let m = cap.get(4).unwrap();
+ (m.start(), m.end())
+ });
+
+ assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap()));
+ assert_eq!(None, cap.get(2));
+ assert_eq!(t!("c"), match_text!(cap.get(4).unwrap()));
+
+ assert_eq!(None, cap.name("a"));
+ assert_eq!(t!("c"), match_text!(cap.name("b").unwrap()));
+}
+
+#[test]
+fn sub_capture_matches() {
+ let re = regex!(r"([a-z])(([a-z])|([0-9]))");
+ let cap = re.captures(t!("a5")).unwrap();
+ let subs: Vec<_> = cap.iter().collect();
+
+ assert_eq!(5, subs.len());
+ assert!(subs[0].is_some());
+ assert!(subs[1].is_some());
+ assert!(subs[2].is_some());
+ assert!(subs[3].is_none());
+ assert!(subs[4].is_some());
+
+ assert_eq!(t!("a5"), match_text!(subs[0].unwrap()));
+ assert_eq!(t!("a"), match_text!(subs[1].unwrap()));
+ assert_eq!(t!("5"), match_text!(subs[2].unwrap()));
+ assert_eq!(t!("5"), match_text!(subs[4].unwrap()));
+}
+
+expand!(expand1, r"(?-u)(?P<foo>\w+)", "abc", "$foo", "abc");
+expand!(expand2, r"(?-u)(?P<foo>\w+)", "abc", "$0", "abc");
+expand!(expand3, r"(?-u)(?P<foo>\w+)", "abc", "$1", "abc");
+expand!(expand4, r"(?-u)(?P<foo>\w+)", "abc", "$$1", "$1");
+expand!(expand5, r"(?-u)(?P<foo>\w+)", "abc", "$$foo", "$foo");
+expand!(expand6, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$b$a", "123abc");
+expand!(expand7, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "z$bz$az", "z");
+expand!(
+ expand8,
+ r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)",
+ "abc 123",
+ ".$b.$a.",
+ ".123.abc."
+);
+expand!(
+ expand9,
+ r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)",
+ "abc 123",
+ " $b $a ",
+ " 123 abc "
+);
+expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
+
+expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
+expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
+expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
+expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
+expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
+expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
+expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
+expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
+expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
+expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
+expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");
+
+split!(
+ split1,
+ r"(?-u)\s+",
+ "a b\nc\td\n\t e",
+ &[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]
+);
+split!(
+ split2,
+ r"(?-u)\b",
+ "a b c",
+ &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c"), t!("")]
+);
+split!(split3, r"a$", "a", &[t!(""), t!("")]);
+split!(split_none, r"-", r"a", &[t!("a")]);
+split!(split_trailing_blank, r"-", r"a-", &[t!("a"), t!("")]);
+split!(split_trailing_blanks, r"-", r"a--", &[t!("a"), t!(""), t!("")]);
+split!(split_empty, r"-", r"", &[t!("")]);
+
+splitn!(splitn_below_limit, r"-", r"a", 2, &[t!("a")]);
+splitn!(splitn_at_limit, r"-", r"a-b", 2, &[t!("a"), t!("b")]);
+splitn!(splitn_above_limit, r"-", r"a-b-c", 2, &[t!("a"), t!("b-c")]);
+splitn!(splitn_zero_limit, r"-", r"a-b", 0, empty_vec!());
+splitn!(splitn_trailing_blank, r"-", r"a-", 2, &[t!("a"), t!("")]);
+splitn!(splitn_trailing_separator, r"-", r"a--", 2, &[t!("a"), t!("-")]);
+splitn!(splitn_empty, r"-", r"", 1, &[t!("")]);
diff --git a/third_party/rust/regex/tests/api_str.rs b/third_party/rust/regex/tests/api_str.rs
new file mode 100644
index 0000000000..480116da73
--- /dev/null
+++ b/third_party/rust/regex/tests/api_str.rs
@@ -0,0 +1,34 @@
+// These tests don't really make sense with the bytes API, so we only test them
+// on the Unicode API.
+
+#[test]
+fn empty_match_unicode_find_iter() {
+ // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
+ // even when we're susceptible to empty width matches.
+ let re = regex!(r".*?");
+ assert_eq!(
+ vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
+ findall!(re, "Ⅰ1Ⅱ2")
+ );
+}
+
+#[test]
+fn empty_match_unicode_captures_iter() {
+ // Same as empty_match_unicode_find_iter, but tests capture iteration.
+ let re = regex!(r".*?");
+ let ms: Vec<_> = re
+ .captures_iter(text!("Ⅰ1Ⅱ2"))
+ .map(|c| c.get(0).unwrap())
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms);
+}
+
+#[test]
+fn match_as_str() {
+ let re = regex!(r"fo+");
+ let caps = re.captures("barfoobar").unwrap();
+ assert_eq!(caps.get(0).map(|m| m.as_str()), Some("foo"));
+ assert_eq!(caps.get(0).map(From::from), Some("foo"));
+ assert_eq!(caps.get(0).map(Into::into), Some("foo"));
+}
diff --git a/third_party/rust/regex/tests/bytes.rs b/third_party/rust/regex/tests/bytes.rs
new file mode 100644
index 0000000000..d05f138edf
--- /dev/null
+++ b/third_party/rust/regex/tests/bytes.rs
@@ -0,0 +1,107 @@
+// These are tests specifically crafted for regexes that can match arbitrary
+// bytes.
+
+// A silly wrapper to make it possible to write and match raw bytes.
+struct R<'a>(&'a [u8]);
+impl<'a> R<'a> {
+ fn as_bytes(&self) -> &'a [u8] {
+ self.0
+ }
+}
+
+mat!(word_boundary, r"(?-u) \b", " δ", None);
+#[cfg(feature = "unicode-perl")]
+mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
+mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
+#[cfg(feature = "unicode-perl")]
+mat!(word_not_boundary_unicode, r" \B", " δ", None);
+
+mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
+#[cfg(feature = "unicode-perl")]
+mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
+mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
+#[cfg(feature = "unicode-perl")]
+mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
+mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
+#[cfg(feature = "unicode-perl")]
+mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
+
+// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
+// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
+// matches.
+mat!(
+ mixed1,
+ r"(.+)(?-u)(.+)",
+ R(b"\xCE\x93\xCE\x94\xFF"),
+ Some((0, 5)),
+ Some((0, 4)),
+ Some((4, 5))
+);
+
+mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
+mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
+#[cfg(feature = "unicode-case")]
+mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
+mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
+
+mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
+mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
+
+// This doesn't match in a normal Unicode regex because the implicit preceding
+// `.*?` is Unicode aware.
+mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
+
+// Have fun with null bytes.
+mat!(
+ null_bytes,
+ r"(?-u)(?P<cstr>[^\x00]+)\x00",
+ R(b"foo\x00"),
+ Some((0, 4)),
+ Some((0, 3))
+);
+
+// Test that the `^` and `$` anchors work properly in the face of invalid UTF-8.
+// See: https://github.com/rust-lang/regex/issues/277
+matiter!(
+ invalidutf8_anchor1,
+ r"(?-u)\xcc?^",
+ R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
+ (0, 0)
+);
+matiter!(
+ invalidutf8_anchor2,
+ r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
+ R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
+ (22, 22)
+);
+matiter!(
+ invalidutf8_anchor3,
+ r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
+ R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
+ (0, 0)
+);
+
+// See https://github.com/rust-lang/regex/issues/303
+#[test]
+fn negated_full_byte_range() {
+ assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
+}
+
+matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(
+ word_boundary_ascii2,
+ r"(?-u:\B)",
+ "0\u{7EF5E}",
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5)
+);
+
+// See: https://github.com/rust-lang/regex/issues/264
+mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
+mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
+
+// See: https://github.com/rust-lang/regex/issues/271
+mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
diff --git a/third_party/rust/regex/tests/consistent.rs b/third_party/rust/regex/tests/consistent.rs
new file mode 100644
index 0000000000..722f2a51a0
--- /dev/null
+++ b/third_party/rust/regex/tests/consistent.rs
@@ -0,0 +1,238 @@
+use regex::internal::ExecBuilder;
+
+/// Given a regex, check if all of the backends produce the same
+/// results on a number of different inputs.
+///
+/// For now this just throws quickcheck at the problem, which
+/// is not very good because it only really tests half of the
+/// problem space. It is pretty unlikely that a random string
+/// will match any given regex, so this will probably just
+/// be checking that the different backends fail in the same
+/// way. This is still worthwhile to test, but is definitely not
+/// the whole story.
+///
+/// TODO(ethan): In order to cover the other half of the problem
+/// space, we should generate a random matching string by inspecting
+/// the AST of the input regex. The right way to do this probably
+/// involves adding a custom Arbitrary instance around a couple
+/// of newtypes. That way we can respect the quickcheck size hinting
+/// and shrinking and whatnot.
+pub fn backends_are_consistent(re: &str) -> Result<u64, String> {
+ let standard_backends = vec![
+ (
+ "bounded_backtracking_re",
+ ExecBuilder::new(re)
+ .bounded_backtracking()
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ (
+ "pikevm_re",
+ ExecBuilder::new(re)
+ .nfa()
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ (
+ "default_re",
+ ExecBuilder::new(re)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ ];
+
+ let utf8bytes_backends = vec![
+ (
+ "bounded_backtracking_utf8bytes_re",
+ ExecBuilder::new(re)
+ .bounded_backtracking()
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ (
+ "pikevm_utf8bytes_re",
+ ExecBuilder::new(re)
+ .nfa()
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ (
+ "default_utf8bytes_re",
+ ExecBuilder::new(re)
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ ];
+
+ let bytes_backends = vec![
+ (
+ "bounded_backtracking_bytes_re",
+ ExecBuilder::new(re)
+ .bounded_backtracking()
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ (
+ "pikevm_bytes_re",
+ ExecBuilder::new(re)
+ .nfa()
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ (
+ "default_bytes_re",
+ ExecBuilder::new(re)
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))?,
+ ),
+ ];
+
+ Ok(string_checker::check_backends(&standard_backends)?
+ + string_checker::check_backends(&utf8bytes_backends)?
+ + bytes_checker::check_backends(&bytes_backends)?)
+}
+
+//
+// A consistency checker parameterized by the input type (&str or &[u8]).
+//
+
+macro_rules! checker {
+ ($module_name:ident, $regex_type:path, $mk_input:expr) => {
+ mod $module_name {
+ use quickcheck;
+ use quickcheck::{Arbitrary, TestResult};
+
+ pub fn check_backends(
+ backends: &[(&str, $regex_type)],
+ ) -> Result<u64, String> {
+ let mut total_passed = 0;
+ for regex in backends[1..].iter() {
+ total_passed += quickcheck_regex_eq(&backends[0], regex)?;
+ }
+
+ Ok(total_passed)
+ }
+
+ fn quickcheck_regex_eq(
+ &(name1, ref re1): &(&str, $regex_type),
+ &(name2, ref re2): &(&str, $regex_type),
+ ) -> Result<u64, String> {
+ quickcheck::QuickCheck::new()
+ .quicktest(RegexEqualityTest::new(
+ re1.clone(),
+ re2.clone(),
+ ))
+ .map_err(|err| {
+ format!(
+ "{}(/{}/) and {}(/{}/) are inconsistent.\
+ QuickCheck Err: {:?}",
+ name1, re1, name2, re2, err
+ )
+ })
+ }
+
+ struct RegexEqualityTest {
+ re1: $regex_type,
+ re2: $regex_type,
+ }
+ impl RegexEqualityTest {
+ fn new(re1: $regex_type, re2: $regex_type) -> Self {
+ RegexEqualityTest { re1: re1, re2: re2 }
+ }
+ }
+
+ impl quickcheck::Testable for RegexEqualityTest {
+ fn result(&self, gen: &mut quickcheck::Gen) -> TestResult {
+ let input = $mk_input(gen);
+ let input = &input;
+
+ if self.re1.find(&input) != self.re2.find(input) {
+ return TestResult::error(format!(
+ "find mismatch input={:?}",
+ input
+ ));
+ }
+
+ let cap1 = self.re1.captures(input);
+ let cap2 = self.re2.captures(input);
+ match (cap1, cap2) {
+ (None, None) => {}
+ (Some(cap1), Some(cap2)) => {
+ for (c1, c2) in cap1.iter().zip(cap2.iter()) {
+ if c1 != c2 {
+ return TestResult::error(format!(
+ "captures mismatch input={:?}",
+ input
+ ));
+ }
+ }
+ }
+ _ => {
+ return TestResult::error(format!(
+ "captures mismatch input={:?}",
+ input
+ ))
+ }
+ }
+
+ let fi1 = self.re1.find_iter(input);
+ let fi2 = self.re2.find_iter(input);
+ for (m1, m2) in fi1.zip(fi2) {
+ if m1 != m2 {
+ return TestResult::error(format!(
+ "find_iter mismatch input={:?}",
+ input
+ ));
+ }
+ }
+
+ let ci1 = self.re1.captures_iter(input);
+ let ci2 = self.re2.captures_iter(input);
+ for (cap1, cap2) in ci1.zip(ci2) {
+ for (c1, c2) in cap1.iter().zip(cap2.iter()) {
+ if c1 != c2 {
+ return TestResult::error(format!(
+ "captures_iter mismatch input={:?}",
+ input
+ ));
+ }
+ }
+ }
+
+ let s1 = self.re1.split(input);
+ let s2 = self.re2.split(input);
+ for (chunk1, chunk2) in s1.zip(s2) {
+ if chunk1 != chunk2 {
+ return TestResult::error(format!(
+ "split mismatch input={:?}",
+ input
+ ));
+ }
+ }
+
+ TestResult::from_bool(true)
+ }
+ }
+ } // mod
+ }; // rule case
+} // macro_rules!
+
+checker!(string_checker, ::regex::Regex, |gen| String::arbitrary(gen));
+checker!(bytes_checker, ::regex::bytes::Regex, |gen| Vec::<u8>::arbitrary(
+ gen
+));
diff --git a/third_party/rust/regex/tests/crates_regex.rs b/third_party/rust/regex/tests/crates_regex.rs
new file mode 100644
index 0000000000..200ec27b2d
--- /dev/null
+++ b/third_party/rust/regex/tests/crates_regex.rs
@@ -0,0 +1,3287 @@
+// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
+// on 2018-06-20 09:56:32.820354.
+
+// autoshutdown-0.1.0: r"\s*(\d+)(\w)\s*"
+consistent!(autoshutdown_0, r"\s*(\d+)(\w)\s*");
+
+// epub-1.1.1: r"/"
+consistent!(epub_0, r"/");
+
+// rpi-info-0.2.0: "^Revision\t+: ([0-9a-fA-F]+)"
+consistent!(rpi_info_0, "^Revision\t+: ([0-9a-fA-F]+)");
+
+// rpi-info-0.2.0: "Serial\t+: ([0-9a-fA-F]+)"
+consistent!(rpi_info_1, "Serial\t+: ([0-9a-fA-F]+)");
+
+// pnet_macros-0.21.0: r"^u([0-9]+)(be|le|he)?$"
+consistent!(pnet_macros_0, r"^u([0-9]+)(be|le|he)?$");
+
+// iban_validate-1.0.3: r"^[A-Z]{2}\d{2}[A-Z\d]{1,30}$"
+consistent!(iban_validate_0, r"^[A-Z]{2}\d{2}[A-Z\d]{1,30}$");
+
+// markifier-0.1.0: r".*\[(?P<percent>.+)%.*\].*"
+consistent!(markifier_0, r".*\[(?P<percent>.+)%.*\].*");
+
+// mallumo-0.3.0: r"(#include) (\S*)(.*)"
+consistent!(mallumo_0, r"(#include) (\S*)(.*)");
+
+// mallumo-0.3.0: r"(ERROR: \d+:)(\d+)(: )(.+)"
+consistent!(mallumo_1, r"(ERROR: \d+:)(\d+)(: )(.+)");
+
+// mallumo-0.3.0: r"(\d+\()(\d+)(?:\) : )(.+)"
+consistent!(mallumo_2, r"(\d+\()(\d+)(?:\) : )(.+)");
+
+// magnet_more-0.0.1: r"(.+?)(\[.*?\])?"
+consistent!(magnet_more_0, r"(.+?)(\[.*?\])?");
+
+// magnet_app-0.0.1: r":(?P<k>[a-zA-Z_]+)"
+consistent!(magnet_app_0, r":(?P<k>[a-zA-Z_]+)");
+
+// yubibomb-0.2.0: r"^\d{6}(?:\s*,\s*\d{6})*$"
+consistent!(yubibomb_0, r"^\d{6}(?:\s*,\s*\d{6})*$");
+
+// multirust-rs-0.0.4: r"[\\/]([^\\/?]+)(\?.*)?$"
+consistent!(multirust_rs_0, r"[\\/]([^\\/?]+)(\?.*)?$");
+
+// hueclient-0.3.2: "\"[a-z]*\":null"
+consistent!(hueclient_0, "\"[a-z]*\":null");
+
+// hueclient-0.3.2: ",+"
+consistent!(hueclient_1, ",+");
+
+// hueclient-0.3.2: ",\\}"
+consistent!(hueclient_2, ",\\}");
+
+// hueclient-0.3.2: "\\{,"
+consistent!(hueclient_3, "\\{,");
+
+// aerial-0.1.0: r"[a-zA-Z_\$][a-zA-Z_0-9]*"
+consistent!(aerial_0, r"[a-zA-Z_\$][a-zA-Z_0-9]*");
+
+// aerial-0.1.0: r"thi[sng]+"
+consistent!(aerial_1, r"thi[sng]+");
+
+// rvue-0.1.0: r"(.+)\s+\((.+?)\)"
+consistent!(rvue_0, r"(.+)\s+\((.+?)\)");
+
+// rvue-0.1.0: r"([\d\.]+)\s*out\s*of\s*([\d\.]+)"
+consistent!(rvue_1, r"([\d\.]+)\s*out\s*of\s*([\d\.]+)");
+
+// rvue-0.1.0: r"^([\d\.]+)\s*(?:\(\))?$"
+consistent!(rvue_2, r"^([\d\.]+)\s*(?:\(\))?$");
+
+// rvue-0.1.0: r"([\d\.]+)\s*Points\s*Possible"
+consistent!(rvue_3, r"([\d\.]+)\s*Points\s*Possible");
+
+// rvue-0.1.0: r"([\d\.]+)\s*/\s*([\d\.]+)"
+consistent!(rvue_4, r"([\d\.]+)\s*/\s*([\d\.]+)");
+
+// rvsim-0.1.0: r"_?([_a-z0-9]+)\s*:\s*([_a-z0-9]+)\s*[,)]"
+consistent!(rvsim_0, r"_?([_a-z0-9]+)\s*:\s*([_a-z0-9]+)\s*[,)]");
+
+// nereon-0.1.4: "(.*[^\\\\])\\{\\}(.*)"
+consistent!(nereon_0, "(.*[^\\\\])\\{\\}(.*)");
+
+// next_episode-0.3.0: r"((?i)^(.+).s(\d+)e(\d+).*)$"
+consistent!(next_episode_0, r"((?i)^(.+).s(\d+)e(\d+).*)$");
+
+// migrant_lib-0.19.2: r"[^a-z0-9-]+"
+consistent!(migrant_lib_0, r"[^a-z0-9-]+");
+
+// migrant_lib-0.19.2: r"[0-9]{14}_[a-z0-9-]+"
+consistent!(migrant_lib_1, r"[0-9]{14}_[a-z0-9-]+");
+
+// migrant_lib-0.19.2: r"([0-9]{14}_)?[a-z0-9-]+"
+consistent!(migrant_lib_2, r"([0-9]{14}_)?[a-z0-9-]+");
+
+// minipre-0.2.0: "$_"
+consistent!(minipre_0, "$_");
+
+// minifier-0.0.13: r">\s+<"
+consistent!(minifier_0, r">\s+<");
+
+// minifier-0.0.13: r"\s{2,}|[\r\n]"
+consistent!(minifier_1, r"\s{2,}|[\r\n]");
+
+// minifier-0.0.13: r"<(style|script)[\w|\s].*?>"
+consistent!(minifier_2, r"<(style|script)[\w|\s].*?>");
+
+// minifier-0.0.13: "<!--(.|\n)*?-->"
+consistent!(minifier_3, "<!--(.|\n)*?-->");
+
+// minifier-0.0.13: r"<\w.*?>"
+consistent!(minifier_4, r"<\w.*?>");
+
+// minifier-0.0.13: r" \s+|\s +"
+consistent!(minifier_5, r" \s+|\s +");
+
+// minifier-0.0.13: r"\w\s+\w"
+consistent!(minifier_6, r"\w\s+\w");
+
+// minifier-0.0.13: r"'\s+>"
+consistent!(minifier_7, r"'\s+>");
+
+// minifier-0.0.13: r"\d\s+>"
+consistent!(minifier_8, r"\d\s+>");
+
+// ggp-rs-0.1.2: r"(?P<relation>\([^)]+\))|(?P<prop>[a-zA-Z0-9_]+)"
+consistent!(ggp_rs_0, r"(?P<relation>\([^)]+\))|(?P<prop>[a-zA-Z0-9_]+)");
+
+// ggp-rs-0.1.2: r"\((.*)\)."
+consistent!(ggp_rs_1, r"\((.*)\).");
+
+// poe-superfilter-0.2.0: "[A-Za-z0-9_]"
+consistent!(poe_superfilter_0, "[A-Za-z0-9_]");
+
+// poke-a-mango-0.5.0: r"(\d+)x(\d+)"
+consistent!(poke_a_mango_0, r"(\d+)x(\d+)");
+
+// pop3-rs-0.1.0: r"(?P<nmsg>\d+) (?P<size>\d+)"
+consistent!(pop3_rs_0, r"(?P<nmsg>\d+) (?P<size>\d+)");
+
+// pop3-rs-0.1.0: r"(?P<msgid>\d+) (?P<uidl>[\x21-\x7E]{1,70})"
+consistent!(pop3_rs_1, r"(?P<msgid>\d+) (?P<uidl>[\x21-\x7E]{1,70})");
+
+// pop3-rs-0.1.0: r"(<.*>)\r\n$"
+consistent!(pop3_rs_2, r"(<.*>)\r\n$");
+
+// pop3-rs-0.1.0: r"^(?P<status>\+OK|-ERR) (?P<statustext>.*)"
+consistent!(pop3_rs_3, r"^(?P<status>\+OK|-ERR) (?P<statustext>.*)");
+
+// pop3-1.0.6: r"^\.\r\n$"
+consistent!(pop3_0, r"^\.\r\n$");
+
+// pop3-1.0.6: r"\+OK(.*)"
+consistent!(pop3_1, r"\+OK(.*)");
+
+// pop3-1.0.6: r"-ERR(.*)"
+consistent!(pop3_2, r"-ERR(.*)");
+
+// pop3-1.0.6: r"\+OK (\d+) (\d+)\r\n"
+consistent!(pop3_3, r"\+OK (\d+) (\d+)\r\n");
+
+// pop3-1.0.6: r"(\d+) ([\x21-\x7e]+)\r\n"
+consistent!(pop3_4, r"(\d+) ([\x21-\x7e]+)\r\n");
+
+// pop3-1.0.6: r"\+OK (\d+) ([\x21-\x7e]+)\r\n"
+consistent!(pop3_5, r"\+OK (\d+) ([\x21-\x7e]+)\r\n");
+
+// pop3-1.0.6: r"(\d+) (\d+)\r\n"
+consistent!(pop3_6, r"(\d+) (\d+)\r\n");
+
+// pop3-1.0.6: r"\+OK (\d+) (\d+)\r\n"
+consistent!(pop3_7, r"\+OK (\d+) (\d+)\r\n");
+
+// polk-1.1.3: "github:(\\w+)/?(\\w+)?"
+consistent!(polk_0, "github:(\\w+)/?(\\w+)?");
+
+// geochunk-0.1.5: "^[0-9]{5}"
+consistent!(geochunk_0, "^[0-9]{5}");
+
+// generic-dns-update-1.1.4: r"((?:(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?)\.){3}(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?))"
+consistent!(generic_dns_update_0, r"((?:(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?)\.){3}(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?))");
+
+// generic-dns-update-1.1.4: r"((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(([0-9A-Fa-f]{1,4}:){0,5}:((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(::([0-9A-Fa-f]{1,4}:){0,5}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))"
+consistent!(generic_dns_update_1, r"((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(([0-9A-Fa-f]{1,4}:){0,5}:((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(::([0-9A-Fa-f]{1,4}:){0,5}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))");
+
+// generic-dns-update-1.1.4: r"<value><string>([0-9.]*)</string></value>"
+consistent!(
+ generic_dns_update_2,
+ r"<value><string>([0-9.]*)</string></value>"
+);
+
+// generic-dns-update-1.1.4: r"<int>([0-9]+)</int>"
+consistent!(generic_dns_update_3, r"<int>([0-9]+)</int>");
+
+// generic-dns-update-1.1.4: r"<int>([0-9]+)</int>"
+consistent!(generic_dns_update_4, r"<int>([0-9]+)</int>");
+
+// generic-dns-update-1.1.4: r"<boolean>([0-1]*)</boolean>"
+consistent!(generic_dns_update_5, r"<boolean>([0-1]*)</boolean>");
+
+// generate-nix-pkg-0.3.0: r"(\d*)\.(\d*)\.(\d*)(-(\S*))?"
+consistent!(generate_nix_pkg_0, r"(\d*)\.(\d*)\.(\d*)(-(\S*))?");
+
+// generate-nix-pkg-0.3.0: r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?"
+consistent!(generate_nix_pkg_1, r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?");
+
+// genact-0.6.0: r"arch/([a-z0-9_])+/"
+consistent!(genact_0, r"arch/([a-z0-9_])+/");
+
+// genact-0.6.0: r"arch/([a-z0-9_])+/"
+consistent!(genact_1, r"arch/([a-z0-9_])+/");
+
+// cron_rs-0.1.6: r"^\s*((\*(/\d+)?)|[0-9-,/]+)(\s+((\*(/\d+)?)|[0-9-,/]+)){4,5}\s*$"
+consistent!(
+ cron_rs_0,
+ r"^\s*((\*(/\d+)?)|[0-9-,/]+)(\s+((\*(/\d+)?)|[0-9-,/]+)){4,5}\s*$"
+);
+
+// systemfd-0.3.0: r"^([a-zA-Z]+)::(.+)$"
+consistent!(systemfd_0, r"^([a-zA-Z]+)::(.+)$");
+
+// symbolic-debuginfo-5.0.2: "__?hidden#\\d+_"
+consistent!(symbolic_debuginfo_0, "__?hidden#\\d+_");
+
+// symbolic-minidump-5.0.2: r"^Linux ([^ ]+) (.*) \w+(?: GNU/Linux)?$"
+consistent!(symbolic_minidump_0, r"^Linux ([^ ]+) (.*) \w+(?: GNU/Linux)?$");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\#)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+"
+consistent!(graphql_idl_parser_0, "^(?u:\\#)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+");
+
+// graphql-idl-parser-0.1.1: "^(?u:=)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+"
+consistent!(graphql_idl_parser_1, "^(?u:=)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+");
+
+// graphql-idl-parser-0.1.1: "^(?u:[A-Z_-_a-z])(?u:[0-9A-Z_-_a-z])*"
+consistent!(graphql_idl_parser_2, "^(?u:[A-Z_-_a-z])(?u:[0-9A-Z_-_a-z])*");
+
+// graphql-idl-parser-0.1.1: "^(?u:!)"
+consistent!(graphql_idl_parser_3, "^(?u:!)");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\()"
+consistent!(graphql_idl_parser_4, "^(?u:\\()");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\))"
+consistent!(graphql_idl_parser_5, "^(?u:\\))");
+
+// graphql-idl-parser-0.1.1: "^(?u:,)"
+consistent!(graphql_idl_parser_6, "^(?u:,)");
+
+// graphql-idl-parser-0.1.1: "^(?u::)"
+consistent!(graphql_idl_parser_7, "^(?u::)");
+
+// graphql-idl-parser-0.1.1: "^(?u:@)"
+consistent!(graphql_idl_parser_8, "^(?u:@)");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\[)"
+consistent!(graphql_idl_parser_9, "^(?u:\\[)");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\])"
+consistent!(graphql_idl_parser_10, "^(?u:\\])");
+
+// graphql-idl-parser-0.1.1: "^(?u:enum)"
+consistent!(graphql_idl_parser_11, "^(?u:enum)");
+
+// graphql-idl-parser-0.1.1: "^(?u:implements)"
+consistent!(graphql_idl_parser_12, "^(?u:implements)");
+
+// graphql-idl-parser-0.1.1: "^(?u:input)"
+consistent!(graphql_idl_parser_13, "^(?u:input)");
+
+// graphql-idl-parser-0.1.1: "^(?u:interface)"
+consistent!(graphql_idl_parser_14, "^(?u:interface)");
+
+// graphql-idl-parser-0.1.1: "^(?u:scalar)"
+consistent!(graphql_idl_parser_15, "^(?u:scalar)");
+
+// graphql-idl-parser-0.1.1: "^(?u:type)"
+consistent!(graphql_idl_parser_16, "^(?u:type)");
+
+// graphql-idl-parser-0.1.1: "^(?u:union)"
+consistent!(graphql_idl_parser_17, "^(?u:union)");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\{)"
+consistent!(graphql_idl_parser_18, "^(?u:\\{)");
+
+// graphql-idl-parser-0.1.1: "^(?u:\\})"
+consistent!(graphql_idl_parser_19, "^(?u:\\})");
+
+// grimoire-0.1.0: r"(?s)/\*(?P<config>.*?)\*/"
+consistent!(grimoire_0, r"(?s)/\*(?P<config>.*?)\*/");
+
+// phonenumber-0.2.0+8.9.0: r"[\d]+(?:[~\x{2053}\x{223C}\x{FF5E}][\d]+)?"
+consistent!(phonenumber_0, r"[\d]+(?:[~\x{2053}\x{223C}\x{FF5E}][\d]+)?");
+
+// phonenumber-0.2.0+8.9.0: r"[, \[\]]"
+consistent!(phonenumber_1, r"[, \[\]]");
+
+// phonenumber-0.2.0+8.9.0: r"[\\/] *x"
+consistent!(phonenumber_2, r"[\\/] *x");
+
+// phonenumber-0.2.0+8.9.0: r"[[\P{N}&&\P{L}]&&[^#]]+$"
+consistent!(phonenumber_3, r"[[\P{N}&&\P{L}]&&[^#]]+$");
+
+// phonenumber-0.2.0+8.9.0: r"(?:.*?[A-Za-z]){3}.*"
+consistent!(phonenumber_4, r"(?:.*?[A-Za-z]){3}.*");
+
+// phonenumber-0.2.0+8.9.0: r"(\D+)"
+consistent!(phonenumber_5, r"(\D+)");
+
+// phonenumber-0.2.0+8.9.0: r"(\$\d)"
+consistent!(phonenumber_6, r"(\$\d)");
+
+// phonenumber-0.2.0+8.9.0: r"\(?\$1\)?"
+consistent!(phonenumber_7, r"\(?\$1\)?");
+
+// phone_number-0.1.0: r"\D"
+consistent!(phone_number_0, r"\D");
+
+// phone_number-0.1.0: r"^0+"
+consistent!(phone_number_1, r"^0+");
+
+// phone_number-0.1.0: r"^89"
+consistent!(phone_number_2, r"^89");
+
+// phone_number-0.1.0: r"^8+"
+consistent!(phone_number_3, r"^8+");
+
+// phile-0.1.4: r"^ *(\^_*\^) *$"
+consistent!(phile_0, r"^ *(\^_*\^) *$");
+
+// phile-0.1.4: r"^[_\p{XID_Start}]$"
+consistent!(phile_1, r"^[_\p{XID_Start}]$");
+
+// phile-0.1.4: r"^\p{XID_Continue}$"
+consistent!(phile_2, r"^\p{XID_Continue}$");
+
+// uritemplate-0.1.2: "%25(?P<hex>[0-9a-fA-F][0-9a-fA-F])"
+consistent!(uritemplate_0, "%25(?P<hex>[0-9a-fA-F][0-9a-fA-F])");
+
+// urdf-rs-0.4.2: "^package://(\\w+)/"
+consistent!(urdf_rs_0, "^package://(\\w+)/");
+
+// url-match-0.1.7: r"(?P<key>[?&.])"
+consistent!(url_match_0, r"(?P<key>[?&.])");
+
+// url-match-0.1.7: r":(?P<key>[a-zA-Z0-9_-]+)"
+consistent!(url_match_1, r":(?P<key>[a-zA-Z0-9_-]+)");
+
+// tsm-sys-0.1.0: r"hello world"
+consistent!(tsm_sys_0, r"hello world");
+
+// deb-version-0.1.0: "^(?:(?:(?:\\d+:).+)|(?:[^:]+))$"
+consistent!(deb_version_0, "^(?:(?:(?:\\d+:).+)|(?:[^:]+))$");
+
+// debcargo-2.1.0: r"^(?i)(a|an|the)\s+"
+consistent!(debcargo_0, r"^(?i)(a|an|the)\s+");
+
+// debcargo-2.1.0: r"^(?i)(rust\s+)?(implementation|library|tool|crate)\s+(of|to|for)\s+"
+consistent!(
+ debcargo_1,
+ r"^(?i)(rust\s+)?(implementation|library|tool|crate)\s+(of|to|for)\s+"
+);
+
+// feaders-0.2.0: r"^.*\.h$"
+consistent!(feaders_0, r"^.*\.h$");
+
+// feaders-0.2.0: r"^.*\.c$"
+consistent!(feaders_1, r"^.*\.c$");
+
+// feaders-0.2.0: r"^.*\.hpp$"
+consistent!(feaders_2, r"^.*\.hpp$");
+
+// feaders-0.2.0: r"^.*\.cc$"
+consistent!(feaders_3, r"^.*\.cc$");
+
+// feaders-0.2.0: r"^.*\.cpp$"
+consistent!(feaders_4, r"^.*\.cpp$");
+
+// hyperscan-0.1.6: r"CPtr\(\w+\)"
+consistent!(hyperscan_0, r"CPtr\(\w+\)");
+
+// hyperscan-0.1.6: r"^Version:\s(\d\.\d\.\d)\sFeatures:\s+(\w+)?\sMode:\s(\w+)$"
+consistent!(
+ hyperscan_1,
+ r"^Version:\s(\d\.\d\.\d)\sFeatures:\s+(\w+)?\sMode:\s(\w+)$"
+);
+
+// hyperscan-0.1.6: r"RawDatabase<Block>\{db: \w+\}"
+consistent!(hyperscan_2, r"RawDatabase<Block>\{db: \w+\}");
+
+// hyperscan-0.1.6: r"RawSerializedDatabase\{p: \w+, len: \d+\}"
+consistent!(hyperscan_3, r"RawSerializedDatabase\{p: \w+, len: \d+\}");
+
+// ucd-parse-0.1.1: r"[0-9A-F]+"
+consistent!(ucd_parse_0, r"[0-9A-F]+");
+
+// afsort-0.2.0: r".*"
+consistent!(afsort_0, r".*");
+
+// afsort-0.2.0: r".*"
+consistent!(afsort_1, r".*");
+
+// afsort-0.2.0: r".*"
+consistent!(afsort_2, r".*");
+
+// afsort-0.2.0: r".*"
+consistent!(afsort_3, r".*");
+
+// afsort-0.2.0: r".*"
+consistent!(afsort_4, r".*");
+
+// afsort-0.2.0: r".*"
+consistent!(afsort_5, r".*");
+
+// afsort-0.2.0: r"^[a-z]+$"
+consistent!(afsort_6, r"^[a-z]+$");
+
+// afsort-0.2.0: r"^[a-z]+$"
+consistent!(afsort_7, r"^[a-z]+$");
+
+// tin-summer-1.21.4: r"(\.git|\.pijul|_darcs|\.hg)$"
+consistent!(tin_summer_0, r"(\.git|\.pijul|_darcs|\.hg)$");
+
+// tin-drummer-1.0.1: r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"
+consistent!(tin_drummer_0, r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$");
+
+// tin-drummer-1.0.1: r".*?\.(stats|conf|h|out|cache.*|dat|pc|info|\.js)$"
+consistent!(
+ tin_drummer_1,
+ r".*?\.(stats|conf|h|out|cache.*|dat|pc|info|\.js)$"
+);
+
+// tin-drummer-1.0.1: r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"
+consistent!(tin_drummer_2, r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$");
+
+// tin-drummer-1.0.1: r".*?\.(stats|conf|h|out|cache.*|\.js)$"
+consistent!(tin_drummer_3, r".*?\.(stats|conf|h|out|cache.*|\.js)$");
+
+// tin-drummer-1.0.1: r"(\.git|\.pijul|_darcs|\.hg)$"
+consistent!(tin_drummer_4, r"(\.git|\.pijul|_darcs|\.hg)$");
+
+// tin-drummer-1.0.1: r".*?\.(dyn_o|out|d|hi|dyn_hi|dump-.*|p_hi|p_o|prof|tix)$"
+consistent!(
+ tin_drummer_5,
+ r".*?\.(dyn_o|out|d|hi|dyn_hi|dump-.*|p_hi|p_o|prof|tix)$"
+);
+
+// tin-drummer-1.0.1: r".*?\.(ibc)$"
+consistent!(tin_drummer_6, r".*?\.(ibc)$");
+
+// tin-drummer-1.0.1: r"\.stack-work|dist-newstyle"
+consistent!(tin_drummer_7, r"\.stack-work|dist-newstyle");
+
+// timmy-0.3.0: r"_NET_WM_PID\(CARDINAL\) = (\d+)"
+consistent!(timmy_0, r"_NET_WM_PID\(CARDINAL\) = (\d+)");
+
+// timmy-0.3.0: r"today|yesterday|now"
+consistent!(timmy_1, r"today|yesterday|now");
+
+// timmy-0.3.0: r"(?P<day>\d{1,2})/(?P<month>\d{1,2})(/(?P<year>\d{4}|\d{2}))?"
+consistent!(
+ timmy_2,
+ r"(?P<day>\d{1,2})/(?P<month>\d{1,2})(/(?P<year>\d{4}|\d{2}))?"
+);
+
+// timmy-0.3.0: r"(?P<n>\d+) (days?|ds?)(?P<ago>( ago)?)"
+consistent!(timmy_3, r"(?P<n>\d+) (days?|ds?)(?P<ago>( ago)?)");
+
+// timmy-0.3.0: r"(?P<hr>\d{2}):(?P<mins>\d{2})"
+consistent!(timmy_4, r"(?P<hr>\d{2}):(?P<mins>\d{2})");
+
+// tinfo-0.5.0: r"^(\d+): \d+ windows \(.*\) \[\d+x\d+\]( \(attached\))?"
+consistent!(
+ tinfo_0,
+ r"^(\d+): \d+ windows \(.*\) \[\d+x\d+\]( \(attached\))?"
+);
+
+// tinfo-0.5.0: r"^(\d+):(\d+): (.*) \((\d+) panes\) \[(\d+)x(\d+)\]"
+consistent!(tinfo_1, r"^(\d+):(\d+): (.*) \((\d+) panes\) \[(\d+)x(\d+)\]");
+
+// timespan-0.0.4: r"(?:\\\{start\\\}|\\\{end\\\})"
+consistent!(timespan_0, r"(?:\\\{start\\\}|\\\{end\\\})");
+
+// timespan-0.0.4: r"(.*)\s+-\s+(.*)"
+consistent!(timespan_1, r"(.*)\s+-\s+(.*)");
+
+// timespan-0.0.4: r"(.*)\s+(\w+)$"
+consistent!(timespan_2, r"(.*)\s+(\w+)$");
+
+// timespan-0.0.4: r"(.*)\s+(\w+)$"
+consistent!(timespan_3, r"(.*)\s+(\w+)$");
+
+// timespan-0.0.4: r"(.*)\s+-\s+(.*)"
+consistent!(timespan_4, r"(.*)\s+-\s+(.*)");
+
+// titlecase-0.10.0: r"[[:lower:]]"
+consistent!(titlecase_0, r"[[:lower:]]");
+
+// tight-0.1.3: r"^\d+ (day|week|month|year)s?$"
+consistent!(tight_0, r"^\d+ (day|week|month|year)s?$");
+
+// tight-0.1.3: r"^\d+ (day|week|month|year)s?$"
+consistent!(tight_1, r"^\d+ (day|week|month|year)s?$");
+
+// yaml-0.2.1: r"^[-+]?(0|[1-9][0-9_]*)$"
+consistent!(yaml_0, r"^[-+]?(0|[1-9][0-9_]*)$");
+
+// yaml-0.2.1: r"^([-+]?)0o?([0-7_]+)$"
+consistent!(yaml_1, r"^([-+]?)0o?([0-7_]+)$");
+
+// yaml-0.2.1: r"^([-+]?)0x([0-9a-fA-F_]+)$"
+consistent!(yaml_2, r"^([-+]?)0x([0-9a-fA-F_]+)$");
+
+// yaml-0.2.1: r"^([-+]?)0b([0-1_]+)$"
+consistent!(yaml_3, r"^([-+]?)0b([0-1_]+)$");
+
+// yaml-0.2.1: r"^([-+]?)(\.[0-9]+|[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?)$"
+consistent!(
+ yaml_4,
+ r"^([-+]?)(\.[0-9]+|[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?)$"
+);
+
+// yaml-0.2.1: r"^[+]?(\.inf|\.Inf|\.INF)$"
+consistent!(yaml_5, r"^[+]?(\.inf|\.Inf|\.INF)$");
+
+// yaml-0.2.1: r"^-(\.inf|\.Inf|\.INF)$"
+consistent!(yaml_6, r"^-(\.inf|\.Inf|\.INF)$");
+
+// yaml-0.2.1: r"^(\.nan|\.NaN|\.NAN)$"
+consistent!(yaml_7, r"^(\.nan|\.NaN|\.NAN)$");
+
+// yaml-0.2.1: r"^(null|Null|NULL|~)$"
+consistent!(yaml_8, r"^(null|Null|NULL|~)$");
+
+// yaml-0.2.1: r"^(true|True|TRUE|yes|Yes|YES)$"
+consistent!(yaml_9, r"^(true|True|TRUE|yes|Yes|YES)$");
+
+// yaml-0.2.1: r"^(false|False|FALSE|no|No|NO)$"
+consistent!(yaml_10, r"^(false|False|FALSE|no|No|NO)$");
+
+// kefia-0.1.0: r"(?m)^(\S+)/(\S+) (\S+)(?: \((.*)\))?$"
+consistent!(kefia_0, r"(?m)^(\S+)/(\S+) (\S+)(?: \((.*)\))?$");
+
+// risp-0.7.0: "^(\\s+|;.*?(\n|$))+"
+consistent!(risp_0, "^(\\s+|;.*?(\n|$))+");
+
+// risp-0.7.0: "^\".*?\""
+consistent!(risp_1, "^\".*?\"");
+
+// risp-0.7.0: r"^[^\s\{\}()\[\]]+"
+consistent!(risp_2, r"^[^\s\{\}()\[\]]+");
+
+// risp-0.7.0: r"^-?\d+"
+consistent!(risp_3, r"^-?\d+");
+
+// ripgrep-0.8.1: "^([0-9]+)([KMG])?$"
+consistent!(ripgrep_0, "^([0-9]+)([KMG])?$");
+
+// riquid-0.0.1: r"^\w+"
+consistent!(riquid_0, r"^\w+");
+
+// riquid-0.0.1: r"^\d+"
+consistent!(riquid_1, r"^\d+");
+
+// recursive_disassembler-2.1.2: r"\A(0x)?([a-fA-F0-9]+)\z"
+consistent!(recursive_disassembler_0, r"\A(0x)?([a-fA-F0-9]+)\z");
+
+// remake-0.1.0: r"^[a-zA-Z_][a-zA-Z0-9_]*"
+consistent!(remake_0, r"^[a-zA-Z_][a-zA-Z0-9_]*");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_0, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_1, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_2, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_3, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_4, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_5, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{2})\)"
+consistent!(regex_decode_6, r"'(?P<title>[^']+)'\s+\((?P<year>\d{2})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_7, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"
+consistent!(regex_decode_8, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"
+consistent!(regex_decode_9, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"
+consistent!(regex_decode_10, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"
+consistent!(regex_decode_11, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"
+consistent!(regex_decode_12, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)");
+
+// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"
+consistent!(regex_decode_13, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)");
+
+// regex-cache-0.2.0: "[0-9]{3}-[0-9]{3}-[0-9]{4}"
+consistent!(regex_cache_0, "[0-9]{3}-[0-9]{3}-[0-9]{4}");
+
+// regex-cache-0.2.0: r"^\d+$"
+consistent!(regex_cache_1, r"^\d+$");
+
+// regex-cache-0.2.0: r"^[a-z]+$"
+consistent!(regex_cache_2, r"^[a-z]+$");
+
+// regex-cache-0.2.0: r"^\d+$"
+consistent!(regex_cache_3, r"^\d+$");
+
+// regex-cache-0.2.0: r"^\d+$"
+consistent!(regex_cache_4, r"^\d+$");
+
+// regex_dfa-0.5.0: r"\d{4}-\d{2}-\d{2}"
+consistent!(regex_dfa_0, r"\d{4}-\d{2}-\d{2}");
+
+// reaper-2.0.0: r"^[0-9\p{L} _\\.]{3,16}$"
+consistent!(reaper_0, r"^[0-9\p{L} _\\.]{3,16}$");
+
+// retdec-0.1.0: r"^attachment; filename=(.+)$"
+consistent!(retdec_0, r"^attachment; filename=(.+)$");
+
+// renvsubst-0.1.2: r"(\\)(?P<head>\$[0-9A-Za-z_{])"
+consistent!(renvsubst_0, r"(\\)(?P<head>\$[0-9A-Za-z_{])");
+
+// renvsubst-0.1.2: r"\$([[:word:]]+)"
+consistent!(renvsubst_1, r"\$([[:word:]]+)");
+
+// renvsubst-0.1.2: r"\$\{([[:word:]]+)\}"
+consistent!(renvsubst_2, r"\$\{([[:word:]]+)\}");
+
+// rexpect-0.3.0: r"'[a-z]+'"
+consistent!(rexpect_0, r"'[a-z]+'");
+
+// rexpect-0.3.0: r"^\d{4}-\d{2}-\d{2}$"
+consistent!(rexpect_1, r"^\d{4}-\d{2}-\d{2}$");
+
+// rexpect-0.3.0: r"-\d{2}-"
+consistent!(rexpect_2, r"-\d{2}-");
+
+// luther-0.1.0: "^a(b|c)c*$"
+consistent!(luther_0, "^a(b|c)c*$");
+
+// little_boxes-1.6.0: r"(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]"
+consistent!(little_boxes_0, r"(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]");
+
+// libimagentrytag-0.8.0: "^[a-zA-Z]([a-zA-Z0-9_-]*)$"
+consistent!(libimagentrytag_0, "^[a-zA-Z]([a-zA-Z0-9_-]*)$");
+
+// libimaginteraction-0.8.0: r"^[Yy](\n?)$"
+consistent!(libimaginteraction_0, r"^[Yy](\n?)$");
+
+// libimaginteraction-0.8.0: r"^[Nn](\n?)$"
+consistent!(libimaginteraction_1, r"^[Nn](\n?)$");
+
+// libimagutil-0.8.0: "^(?P<KEY>([^=]*))=(.*)$"
+consistent!(libimagutil_0, "^(?P<KEY>([^=]*))=(.*)$");
+
+// libimagutil-0.8.0: "(.*)=(\"(?P<QVALUE>([^\"]*))\"|(?P<VALUE>(.*)))$"
+consistent!(libimagutil_1, "(.*)=(\"(?P<QVALUE>([^\"]*))\"|(?P<VALUE>(.*)))$");
+
+// linux_ip-0.1.0: r"\s+"
+consistent!(linux_ip_0, r"\s+");
+
+// linux_ip-0.1.0: r"\s*[\n\r]+\s*"
+consistent!(linux_ip_1, r"\s*[\n\r]+\s*");
+
+// linux_ip-0.1.0: r"^([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$"
+consistent!(linux_ip_2, r"^([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$");
+
+// linux_ip-0.1.0: r"^([0-9a-fA-F\.:/]+|default)\s+via\s+([a-z0-9\.:]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$"
+consistent!(linux_ip_3, r"^([0-9a-fA-F\.:/]+|default)\s+via\s+([a-z0-9\.:]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$");
+
+// linux_ip-0.1.0: r"^(blackhole)\s+([0-9a-fA-F\.:/]+)$"
+consistent!(linux_ip_4, r"^(blackhole)\s+([0-9a-fA-F\.:/]+)$");
+
+// linux_ip-0.1.0: r"^(unreachable)\s+([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s+(.*)$"
+consistent!(
+ linux_ip_5,
+ r"^(unreachable)\s+([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s+(.*)$"
+);
+
+// linux_ip-0.1.0: r"\s*[\n\r]+\s*"
+consistent!(linux_ip_6, r"\s*[\n\r]+\s*");
+
+// linux_ip-0.1.0: r"^\d+:\s+([a-zA-Z0-9\.-]+)(@\S+)*:\s+(.*)$"
+consistent!(linux_ip_7, r"^\d+:\s+([a-zA-Z0-9\.-]+)(@\S+)*:\s+(.*)$");
+
+// linux_ip-0.1.0: r"\s*link/ether\s+([a-f0-9:]+)\s+.*"
+consistent!(linux_ip_8, r"\s*link/ether\s+([a-f0-9:]+)\s+.*");
+
+// linux_ip-0.1.0: r"\s*inet[6]*\s+([0-9a-f:\./]+)\s+.*"
+consistent!(linux_ip_9, r"\s*inet[6]*\s+([0-9a-f:\./]+)\s+.*");
+
+// linky-0.1.4: r"[^\w -]"
+consistent!(linky_0, r"[^\w -]");
+
+// linky-0.1.4: r"^(.*):(\d+): [^ ]* ([^ ]*)$"
+consistent!(linky_1, r"^(.*):(\d+): [^ ]* ([^ ]*)$");
+
+// limonite-0.2.1: r"^(\d{4}-\d{2}-\d{2})-(\d{3})-(.+)$"
+consistent!(limonite_0, r"^(\d{4}-\d{2}-\d{2})-(\d{3})-(.+)$");
+
+// process-queue-0.1.1: r"^[a-zA-Z]+$"
+consistent!(process_queue_0, r"^[a-zA-Z]+$");
+
+// pronghorn-0.1.2: r"^\{([a-zA-Z_]+)\}$"
+consistent!(pronghorn_0, r"^\{([a-zA-Z_]+)\}$");
+
+// protocol-ftp-client-0.1.1: "(?m:^(\\d{3}) (.+)\r$)"
+consistent!(protocol_ftp_client_0, "(?m:^(\\d{3}) (.+)\r$)");
+
+// protocol-ftp-client-0.1.1: "\"(.+)\""
+consistent!(protocol_ftp_client_1, "\"(.+)\"");
+
+// protocol-ftp-client-0.1.1: "(\\w+) [Tt]ype: (\\w+)"
+consistent!(protocol_ftp_client_2, "(\\w+) [Tt]ype: (\\w+)");
+
+// protocol-ftp-client-0.1.1: "(?m:^(\\d{3})-.+\r$)"
+consistent!(protocol_ftp_client_3, "(?m:^(\\d{3})-.+\r$)");
+
+// protocol-ftp-client-0.1.1: "Entering Passive Mode \\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+),(\\d+)\\)"
+consistent!(
+ protocol_ftp_client_4,
+ "Entering Passive Mode \\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+),(\\d+)\\)"
+);
+
+// protocol-ftp-client-0.1.1: "(?m:^(.+)\r$)"
+consistent!(protocol_ftp_client_5, "(?m:^(.+)\r$)");
+
+// protocol-ftp-client-0.1.1: "^([d-])(?:[rwx-]{3}){3} +\\d+ +\\w+ +\\w+ +(\\d+) +(.+) +(.+)$"
+consistent!(
+ protocol_ftp_client_6,
+ "^([d-])(?:[rwx-]{3}){3} +\\d+ +\\w+ +\\w+ +(\\d+) +(.+) +(.+)$"
+);
+
+// article-date-extractor-0.1.1: r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})"
+consistent!(article_date_extractor_0, r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})");
+
+// article-date-extractor-0.1.1: r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date"
+consistent!(
+ article_date_extractor_1,
+ r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date"
+);
+
+// arthas_plugin-0.1.1: r"type\((.*)\)"
+consistent!(arthas_plugin_0, r"type\((.*)\)");
+
+// arthas_plugin-0.1.1: r"Vec<(.*)>"
+consistent!(arthas_plugin_1, r"Vec<(.*)>");
+
+// arthas_plugin-0.1.1: r"Option<(.*)>"
+consistent!(arthas_plugin_2, r"Option<(.*)>");
+
+// arthas_plugin-0.1.1: r"HashMap<[a-z0-9A-Z]+, *(.*)>"
+consistent!(arthas_plugin_3, r"HashMap<[a-z0-9A-Z]+, *(.*)>");
+
+// arthas_derive-0.1.0: "Vec *< *(.*) *>"
+consistent!(arthas_derive_0, "Vec *< *(.*) *>");
+
+// arthas_derive-0.1.0: r"Option *< *(.*) *>"
+consistent!(arthas_derive_1, r"Option *< *(.*) *>");
+
+// arthas_derive-0.1.0: r"HashMap *< *[a-z0-9A-Z]+ *, *(.*) *>"
+consistent!(arthas_derive_2, r"HashMap *< *[a-z0-9A-Z]+ *, *(.*) *>");
+
+// arpabet-0.2.0: r"^([\w\-\(\)\.']+)\s+([^\s].*)\s*$"
+consistent!(arpabet_0, r"^([\w\-\(\)\.']+)\s+([^\s].*)\s*$");
+
+// arpabet-0.2.0: r"^;;;\s+"
+consistent!(arpabet_1, r"^;;;\s+");
+
+// glossy_codegen-0.2.0: r"/\*.*?\*/|//.*"
+consistent!(glossy_codegen_0, r"/\*.*?\*/|//.*");
+
+// glossy_codegen-0.2.0: "^\\s*#\\s*include\\s+<([:print:]+)>\\s*$"
+consistent!(glossy_codegen_1, "^\\s*#\\s*include\\s+<([:print:]+)>\\s*$");
+
+// glossy_codegen-0.2.0: "^\\s*#\\s*include\\s+\"([:print:]+)\"\\s*$"
+consistent!(glossy_codegen_2, "^\\s*#\\s*include\\s+\"([:print:]+)\"\\s*$");
+
+// glossy_codegen-0.2.0: r"^\s*#\s*version\s+(\d+)"
+consistent!(glossy_codegen_3, r"^\s*#\s*version\s+(\d+)");
+
+// glossy_codegen-0.2.0: r"^\s*$"
+consistent!(glossy_codegen_4, r"^\s*$");
+
+// gluster-1.0.1: r"(?P<addr>via \S+)"
+consistent!(gluster_0, r"(?P<addr>via \S+)");
+
+// gluster-1.0.1: r"(?P<src>src \S+)"
+consistent!(gluster_1, r"(?P<src>src \S+)");
+
+// gl_helpers-0.1.7: r"(.*)\[\d+\]"
+consistent!(gl_helpers_0, r"(.*)\[\d+\]");
+
+// gl_helpers-0.1.7: r"(\d+).(\d+)"
+consistent!(gl_helpers_1, r"(\d+).(\d+)");
+
+// glr-parser-0.0.1: r"(?P<c>[\\\.\+\*\?\(\)\|\[\]\{\}\^\$])"
+consistent!(glr_parser_0, r"(?P<c>[\\\.\+\*\?\(\)\|\[\]\{\}\^\$])");
+
+// glr-parser-0.0.1: r"^\w+$"
+consistent!(glr_parser_1, r"^\w+$");
+
+// glr-parser-0.0.1: "'[^']+'"
+consistent!(glr_parser_2, "'[^']+'");
+
+// hoodlum-0.5.0: r"(?m)//.*"
+consistent!(hoodlum_0, r"(?m)//.*");
+
+// form-checker-0.2.2: r"^1\d{10}$"
+consistent!(form_checker_0, r"^1\d{10}$");
+
+// form-checker-0.2.2: r"(?i)^[\w.%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$"
+consistent!(form_checker_1, r"(?i)^[\w.%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$");
+
+// wikibase-0.2.0: r"(?P<user_agent>[a-zA-Z0-9-_]+/[0-9\.]+)"
+consistent!(wikibase_0, r"(?P<user_agent>[a-zA-Z0-9-_]+/[0-9\.]+)");
+
+// wifiscanner-0.3.6: r"Cell [0-9]{2,} - Address:"
+consistent!(wifiscanner_0, r"Cell [0-9]{2,} - Address:");
+
+// wifiscanner-0.3.6: r"([0-9a-zA-Z]{1}[0-9a-zA-Z]{1}[:]{1}){5}[0-9a-zA-Z]{1}[0-9a-zA-Z]{1}"
+consistent!(
+ wifiscanner_1,
+ r"([0-9a-zA-Z]{1}[0-9a-zA-Z]{1}[:]{1}){5}[0-9a-zA-Z]{1}[0-9a-zA-Z]{1}"
+);
+
+// wifiscanner-0.3.6: r"Signal level=(\d+)/100"
+consistent!(wifiscanner_2, r"Signal level=(\d+)/100");
+
+// bbcode-1.0.2: r"(?s)\[b\](.*?)\[/b\]"
+consistent!(bbcode_0, r"(?s)\[b\](.*?)\[/b\]");
+
+// bbcode-1.0.2: r"(?s)\[i\](.*?)\[/i\]"
+consistent!(bbcode_1, r"(?s)\[i\](.*?)\[/i\]");
+
+// bbcode-1.0.2: r"(?s)\[u\](.*?)\[/u\]"
+consistent!(bbcode_2, r"(?s)\[u\](.*?)\[/u\]");
+
+// bbcode-1.0.2: r"(?s)\[s\](.*?)\[/s\]"
+consistent!(bbcode_3, r"(?s)\[s\](.*?)\[/s\]");
+
+// bbcode-1.0.2: r"(?s)\[size=(\d+)](.*?)\[/size\]"
+consistent!(bbcode_4, r"(?s)\[size=(\d+)](.*?)\[/size\]");
+
+// bbcode-1.0.2: r"(?s)\[color=(.+)](.*?)\[/color\]"
+consistent!(bbcode_5, r"(?s)\[color=(.+)](.*?)\[/color\]");
+
+// bbcode-1.0.2: r"(?s)\[center\](.*?)\[/center\]"
+consistent!(bbcode_6, r"(?s)\[center\](.*?)\[/center\]");
+
+// bbcode-1.0.2: r"(?s)\[left\](.*?)\[/left\]"
+consistent!(bbcode_7, r"(?s)\[left\](.*?)\[/left\]");
+
+// bbcode-1.0.2: r"(?s)\[right\](.*?)\[/right\]"
+consistent!(bbcode_8, r"(?s)\[right\](.*?)\[/right\]");
+
+// bbcode-1.0.2: r"(?s)\[table\](.*?)\[/table\]"
+consistent!(bbcode_9, r"(?s)\[table\](.*?)\[/table\]");
+
+// bbcode-1.0.2: r"(?s)\[td\](.*?)\[/td\]"
+consistent!(bbcode_10, r"(?s)\[td\](.*?)\[/td\]");
+
+// bbcode-1.0.2: r"(?s)\[tr\](.*?)\[/tr\]"
+consistent!(bbcode_11, r"(?s)\[tr\](.*?)\[/tr\]");
+
+// bbcode-1.0.2: r"(?s)\[th\](.*?)\[/th\]"
+consistent!(bbcode_12, r"(?s)\[th\](.*?)\[/th\]");
+
+// bbcode-1.0.2: r"(?s)\[url\](.*?)\[/url\]"
+consistent!(bbcode_13, r"(?s)\[url\](.*?)\[/url\]");
+
+// bbcode-1.0.2: r"(?s)\[url=(.+)\](.*?)\[/url\]"
+consistent!(bbcode_14, r"(?s)\[url=(.+)\](.*?)\[/url\]");
+
+// bbcode-1.0.2: r"(?s)\[quote\](.*?)\[/quote\]"
+consistent!(bbcode_15, r"(?s)\[quote\](.*?)\[/quote\]");
+
+// bbcode-1.0.2: r"(?s)\[quote=(.+)\](.*?)\[/quote\]"
+consistent!(bbcode_16, r"(?s)\[quote=(.+)\](.*?)\[/quote\]");
+
+// bbcode-1.0.2: r"(?s)\[img=(\d+)x(\d+)(\b.*)?\](.*?)\[/img\]"
+consistent!(bbcode_17, r"(?s)\[img=(\d+)x(\d+)(\b.*)?\](.*?)\[/img\]");
+
+// bbcode-1.0.2: r"(?s)\[img=(.+)(\b.*)?\](.*?)\[/img\]"
+consistent!(bbcode_18, r"(?s)\[img=(.+)(\b.*)?\](.*?)\[/img\]");
+
+// bbcode-1.0.2: r"(?s)\[img(\b.*)?\](.*?)\[/img\]"
+consistent!(bbcode_19, r"(?s)\[img(\b.*)?\](.*?)\[/img\]");
+
+// bbcode-1.0.2: r"(?s)\[ol\](.*?)\[/ol\]"
+consistent!(bbcode_20, r"(?s)\[ol\](.*?)\[/ol\]");
+
+// bbcode-1.0.2: r"(?s)\[ul\](.*?)\[/ul\]"
+consistent!(bbcode_21, r"(?s)\[ul\](.*?)\[/ul\]");
+
+// bbcode-1.0.2: r"(?s)\[list\](.*?)\[/list\]"
+consistent!(bbcode_22, r"(?s)\[list\](.*?)\[/list\]");
+
+// bbcode-1.0.2: r"(?s)\[youtube\](.*?)\[/youtube\]"
+consistent!(bbcode_23, r"(?s)\[youtube\](.*?)\[/youtube\]");
+
+// bbcode-1.0.2: r"(?s)\[youtube=(\d+)x(\d+)\](.*?)\[/youtube\]"
+consistent!(bbcode_24, r"(?s)\[youtube=(\d+)x(\d+)\](.*?)\[/youtube\]");
+
+// bbcode-1.0.2: r"(?s)\[li\](.*?)\[/li\]"
+consistent!(bbcode_25, r"(?s)\[li\](.*?)\[/li\]");
+
+// block-utils-0.5.0: r"loop\d+"
+consistent!(block_utils_0, r"loop\d+");
+
+// block-utils-0.5.0: r"ram\d+"
+consistent!(block_utils_1, r"ram\d+");
+
+// block-utils-0.5.0: r"md\d+"
+consistent!(block_utils_2, r"md\d+");
+
+// kvvliveapi-0.1.0: r"^([1-9]) min$"
+consistent!(kvvliveapi_0, r"^([1-9]) min$");
+
+// rfc822_sanitizer-0.3.3: r"(\d{2}):(\d{2}):(\d{2})"
+consistent!(rfc822_sanitizer_0, r"(\d{2}):(\d{2}):(\d{2})");
+
+// rfc822_sanitizer-0.3.3: r"(\d{1,2}):(\d{1,2}):(\d{1,2})"
+consistent!(rfc822_sanitizer_1, r"(\d{1,2}):(\d{1,2}):(\d{1,2})");
+
+// faker-0.0.4: r"[2-9]"
+consistent!(faker_0, r"[2-9]");
+
+// faker-0.0.4: r"[1-9]"
+consistent!(faker_1, r"[1-9]");
+
+// faker-0.0.4: r"[0-9]"
+consistent!(faker_2, r"[0-9]");
+
+// faker-0.0.4: r"\d{10}"
+consistent!(faker_3, r"\d{10}");
+
+// faker-0.0.4: r"\d{1}"
+consistent!(faker_4, r"\d{1}");
+
+// faker-0.0.4: r"^\w+"
+consistent!(faker_5, r"^\w+");
+
+// faker-0.0.4: r"^\w+"
+consistent!(faker_6, r"^\w+");
+
+// faker-0.0.4: r"^(\w+\.? ?){2,3}$"
+consistent!(faker_7, r"^(\w+\.? ?){2,3}$");
+
+// faker-0.0.4: r"^[A-Z][a-z]+\.?$"
+consistent!(faker_8, r"^[A-Z][a-z]+\.?$");
+
+// faker-0.0.4: r"^[A-Z][A-Za-z]*\.?$"
+consistent!(faker_9, r"^[A-Z][A-Za-z]*\.?$");
+
+// faker-0.0.4: r"http://lorempixel.com/100/100/\w+"
+consistent!(faker_10, r"http://lorempixel.com/100/100/\w+");
+
+// faker-0.0.4: r"http://lorempixel.com/100/100/cats"
+consistent!(faker_11, r"http://lorempixel.com/100/100/cats");
+
+// fancy-regex-0.1.0: "(?i:ß)"
+consistent!(fancy_regex_0, "(?i:ß)");
+
+// fancy-regex-0.1.0: "(?i:\\x{0587})"
+consistent!(fancy_regex_1, "(?i:\\x{0587})");
+
+// fancy-regex-0.1.0: "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})"
+consistent!(fancy_regex_2, "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})");
+
+// fancy-prompt-0.1.5: r"/([^/])[^/]+/"
+consistent!(fancy_prompt_0, r"/([^/])[^/]+/");
+
+// fancy-prompt-0.1.5: r"^([^:]+):.*?(?::([^:]+))?$"
+consistent!(fancy_prompt_1, r"^([^:]+):.*?(?::([^:]+))?$");
+
+// fanta-0.2.0: r"^(/?__\w+__)/(.*)"
+consistent!(fanta_0, r"^(/?__\w+__)/(.*)");
+
+// fanta-cli-0.1.1: r"(.)([A-Z])"
+consistent!(fanta_cli_0, r"(.)([A-Z])");
+
+// fanta-cli-0.1.1: "\\{:[^\\s]+\\}"
+consistent!(fanta_cli_1, "\\{:[^\\s]+\\}");
+
+// amethyst_tools-0.7.1: "(?P<last>[^\r])\n"
+consistent!(amethyst_tools_0, "(?P<last>[^\r])\n");
+
+// amigo-0.3.1: r"^-?\d+(\.\d)?"
+consistent!(amigo_0, r"^-?\d+(\.\d)?");
+
+// amigo-0.3.1: r"^[a-zA-Z_]+[\w-]*[!?_]?"
+consistent!(amigo_1, r"^[a-zA-Z_]+[\w-]*[!?_]?");
+
+// amigo-0.3.1: r"^\("
+consistent!(amigo_2, r"^\(");
+
+// amigo-0.3.1: r"^\)"
+consistent!(amigo_3, r"^\)");
+
+// amigo-0.3.1: r"^\s+"
+consistent!(amigo_4, r"^\s+");
+
+// ethcore-logger-1.12.0: "\x1b\\[[^m]+m"
+consistent!(ethcore_logger_0, "\x1b\\[[^m]+m");
+
+// dash2html-1.0.1: r"__.*?__"
+consistent!(dash2html_0, r"__.*?__");
+
+// dash2html-1.0.1: r"(?i)@(?:time|clipboard|cursor|date)"
+consistent!(dash2html_1, r"(?i)@(?:time|clipboard|cursor|date)");
+
+// os_type-2.0.0: r"^Microsoft Windows \[Version\s(\d+\.\d+\.\d+)\]$"
+consistent!(os_type_0, r"^Microsoft Windows \[Version\s(\d+\.\d+\.\d+)\]$");
+
+// os_type-2.0.0: r"ProductName:\s([\w\s]+)\n"
+consistent!(os_type_1, r"ProductName:\s([\w\s]+)\n");
+
+// os_type-2.0.0: r"ProductVersion:\s(\w+\.\w+\.\w+)"
+consistent!(os_type_2, r"ProductVersion:\s(\w+\.\w+\.\w+)");
+
+// os_type-2.0.0: r"BuildVersion:\s(\w+)"
+consistent!(os_type_3, r"BuildVersion:\s(\w+)");
+
+// os_type-2.0.0: r"(\w+) Linux release"
+consistent!(os_type_4, r"(\w+) Linux release");
+
+// os_type-2.0.0: r"release\s([\w\.]+)"
+consistent!(os_type_5, r"release\s([\w\.]+)");
+
+// os_type-2.0.0: r"Distributor ID:\s(\w+)"
+consistent!(os_type_6, r"Distributor ID:\s(\w+)");
+
+// os_type-2.0.0: r"Release:\s([\w\.]+)"
+consistent!(os_type_7, r"Release:\s([\w\.]+)");
+
+// bindgen-0.37.0: r"typename type\-parameter\-\d+\-\d+::.+"
+consistent!(bindgen_0, r"typename type\-parameter\-\d+\-\d+::.+");
+
+// imap-0.8.1: "^+(.*)\r\n"
+consistent!(imap_0, "^+(.*)\r\n");
+
+// image-base64-0.1.0: r"^ffd8ffe0"
+consistent!(image_base64_0, r"^ffd8ffe0");
+
+// image-base64-0.1.0: r"^89504e47"
+consistent!(image_base64_1, r"^89504e47");
+
+// image-base64-0.1.0: r"^47494638"
+consistent!(image_base64_2, r"^47494638");
+
+// json-pointer-0.3.2: "^(/([^/~]|~[01])*)*$"
+consistent!(json_pointer_0, "^(/([^/~]|~[01])*)*$");
+
+// json-pointer-0.3.2: "^#(/([^/~%]|~[01]|%[0-9a-fA-F]{2})*)*$"
+consistent!(json_pointer_1, "^#(/([^/~%]|~[01]|%[0-9a-fA-F]{2})*)*$");
+
+// mysql_common-0.7.0: r"^5.5.5-(\d{1,2})\.(\d{1,2})\.(\d{1,3})-MariaDB"
+consistent!(mysql_common_0, r"^5.5.5-(\d{1,2})\.(\d{1,2})\.(\d{1,3})-MariaDB");
+
+// mysql_common-0.7.0: r"^(\d{1,2})\.(\d{1,2})\.(\d{1,3})(.*)"
+consistent!(mysql_common_1, r"^(\d{1,2})\.(\d{1,2})\.(\d{1,3})(.*)");
+
+// government_id-0.1.0: r"^[0-9]{4}[0-9A-Z]{2}[0-9]{3}$"
+consistent!(government_id_0, r"^[0-9]{4}[0-9A-Z]{2}[0-9]{3}$");
+
+// ohmers-0.1.1: r"UniqueIndexViolation: (\w+)"
+consistent!(ohmers_0, r"UniqueIndexViolation: (\w+)");
+
+// eliza-1.0.0: r"(.*) you are (.*)"
+consistent!(eliza_0, r"(.*) you are (.*)");
+
+// eliza-1.0.0: r"(.*) you are (.*)"
+consistent!(eliza_1, r"(.*) you are (.*)");
+
+// eliza-1.0.0: r"(.*) you are (.*)"
+consistent!(eliza_2, r"(.*) you are (.*)");
+
+// chema-0.0.5: "^\\s*\\*"
+consistent!(chema_0, "^\\s*\\*");
+
+// chema-0.0.5: "^\\s*@(\\w+)\\s+(.*)"
+consistent!(chema_1, "^\\s*@(\\w+)\\s+(.*)");
+
+// chord3-0.3.0: r"^\s*#"
+consistent!(chord3_0, r"^\s*#");
+
+// chord3-0.3.0: r"\{(?P<cmd>\w+)(?::?\s*(?P<arg>.*))?\}"
+consistent!(chord3_1, r"\{(?P<cmd>\w+)(?::?\s*(?P<arg>.*))?\}");
+
+// chord3-0.3.0: r"\{(eot|end_of_tab):?\s*"
+consistent!(chord3_2, r"\{(eot|end_of_tab):?\s*");
+
+// chord3-0.3.0: r"([^\[]*)(?:\[([^\]]*)\])?"
+consistent!(chord3_3, r"([^\[]*)(?:\[([^\]]*)\])?");
+
+// checkmail-0.1.1: "^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$"
+consistent!(checkmail_0, "^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$");
+
+// cntk-0.2.1: r"\b\w\w+\b"
+consistent!(cntk_0, r"\b\w\w+\b");
+
+// cntk-0.2.1: r"\b\w\w+\b"
+consistent!(cntk_1, r"\b\w\w+\b");
+
+// cniguru-0.1.0: r"\(id: (\d+)\)"
+consistent!(cniguru_0, r"\(id: (\d+)\)");
+
+// upm_lib-0.3.0: r"^(\d+)\.(\d+)\.(\d+)(?:-([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?(?:\+([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?$"
+consistent!(upm_lib_0, r"^(\d+)\.(\d+)\.(\d+)(?:-([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?(?:\+([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?$");
+
+// avro-0.2.1: r"^\s*(\*+(\s+))?"
+consistent!(avro_0, r"^\s*(\*+(\s+))?");
+
+// avro-0.2.1: r"^\s*(\*+)?"
+consistent!(avro_1, r"^\s*(\*+)?");
+
+// nomi-0.0.2: "[0-9]+"
+consistent!(nomi_0, "[0-9]+");
+
+// nodes-0.1.0: "([0-9]+)@(?:nodes|n)?:([^@]+)?"
+consistent!(nodes_0, "([0-9]+)@(?:nodes|n)?:([^@]+)?");
+
+// not-stakkr-1.0.0: r"(?i)in (\d+) (second|minute|hour|day|week)s?"
+consistent!(not_stakkr_0, r"(?i)in (\d+) (second|minute|hour|day|week)s?");
+
+// notetxt-0.0.1: "^([A-Za-z0-9 -_:]+)\n-+\n"
+consistent!(notetxt_0, "^([A-Za-z0-9 -_:]+)\n-+\n");
+
+// nail-0.1.0-pre.0: r"^-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?$"
+consistent!(nail_0, r"^-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?$");
+
+// nail-0.1.0-pre.0: r"^-?[0-9]+$"
+consistent!(nail_1, r"^-?[0-9]+$");
+
+// askalono-0.2.0: r"[^\w\s\pP]+"
+consistent!(askalono_0, r"[^\w\s\pP]+");
+
+// askalono-0.2.0: r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+"
+consistent!(askalono_1, r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+");
+
+// askalono-0.2.0: r"\p{Pd}+"
+consistent!(askalono_2, r"\p{Pd}+");
+
+// askalono-0.2.0: r"\p{Ps}+"
+consistent!(askalono_3, r"\p{Ps}+");
+
+// askalono-0.2.0: r"\p{Pe}+"
+consistent!(askalono_4, r"\p{Pe}+");
+
+// askalono-0.2.0: r"\p{Pc}+"
+consistent!(askalono_5, r"\p{Pc}+");
+
+// askalono-0.2.0: r"[©Ⓒⓒ]"
+consistent!(askalono_6, r"[©Ⓒⓒ]");
+
+// askalono-0.2.0: r"[\r\n\v\f]"
+consistent!(askalono_7, r"[\r\n\v\f]");
+
+// askalono-0.2.0: r"\n{3,}"
+consistent!(askalono_8, r"\n{3,}");
+
+// askalono-0.2.0: r"[^\w\s]+"
+consistent!(askalono_9, r"[^\w\s]+");
+
+// askalono-0.2.0: r"\s+"
+consistent!(askalono_10, r"\s+");
+
+// assembunny_plus-0.0.3: r"[^0-9a-zA-Z_]"
+consistent!(assembunny_plus_0, r"[^0-9a-zA-Z_]");
+
+// assembunny_plus-0.0.3: r"[0-9]"
+consistent!(assembunny_plus_1, r"[0-9]");
+
+// salt-compressor-0.4.0: r"(?m)^Minion (\S*) did not respond\. No job will be sent\.$"
+consistent!(
+ salt_compressor_0,
+ r"(?m)^Minion (\S*) did not respond\. No job will be sent\.$"
+);
+
+// sabisabi-0.4.1: r"</?[^>]+?>"
+consistent!(sabisabi_0, r"</?[^>]+?>");
+
+// sabisabi-0.4.1: r"\([^)]*\)"
+consistent!(sabisabi_1, r"\([^)]*\)");
+
+// sassers-0.13.5-h28: "@import \"([^\"]*)\";"
+consistent!(sassers_0, "@import \"([^\"]*)\";");
+
+// shadowsocks-0.6.2: r"[A-Za-z\d-]{1,63}$"
+consistent!(shadowsocks_0, r"[A-Za-z\d-]{1,63}$");
+
+// shkeleton-0.1.5: "[abc]+"
+consistent!(shkeleton_0, "[abc]+");
+
+// shellwords-0.1.0: r"([^A-Za-z0-9_\-.,:/@\n])"
+consistent!(shellwords_0, r"([^A-Za-z0-9_\-.,:/@\n])");
+
+// shellwords-0.1.0: r"\n"
+consistent!(shellwords_1, r"\n");
+
+// shush-0.1.5: "(?P<num>[0-9]+)(?P<units>[dhms])"
+consistent!(shush_0, "(?P<num>[0-9]+)(?P<units>[dhms])");
+
+// woothee-0.8.0: r"(?:Chrome|CrMo|CriOS)/([.0-9]+)"
+consistent!(woothee_0, r"(?:Chrome|CrMo|CriOS)/([.0-9]+)");
+
+// woothee-0.8.0: r"Vivaldi/([.0-9]+)"
+consistent!(woothee_1, r"Vivaldi/([.0-9]+)");
+
+// woothee-0.8.0: r"Firefox/([.0-9]+)"
+consistent!(woothee_2, r"Firefox/([.0-9]+)");
+
+// woothee-0.8.0: r"^Mozilla/[.0-9]+ \((?:Mobile|Tablet);(?:.*;)? rv:([.0-9]+)\) Gecko/[.0-9]+ Firefox/[.0-9]+$"
+consistent!(woothee_3, r"^Mozilla/[.0-9]+ \((?:Mobile|Tablet);(?:.*;)? rv:([.0-9]+)\) Gecko/[.0-9]+ Firefox/[.0-9]+$");
+
+// woothee-0.8.0: r"FxiOS/([.0-9]+)"
+consistent!(woothee_4, r"FxiOS/([.0-9]+)");
+
+// woothee-0.8.0: r"\(([^;)]+);FOMA;"
+consistent!(woothee_5, r"\(([^;)]+);FOMA;");
+
+// woothee-0.8.0: r"jig browser[^;]+; ([^);]+)"
+consistent!(woothee_6, r"jig browser[^;]+; ([^);]+)");
+
+// woothee-0.8.0: r"(?i)rss(?:reader|bar|[-_ /;()]|[ +]*/)"
+consistent!(woothee_7, r"(?i)rss(?:reader|bar|[-_ /;()]|[ +]*/)");
+
+// woothee-0.8.0: r"(?i)(?:bot|crawler|spider)(?:[-_ ./;@()]|$)"
+consistent!(woothee_8, r"(?i)(?:bot|crawler|spider)(?:[-_ ./;@()]|$)");
+
+// woothee-0.8.0: r"(?i)(?:feed|web) ?parser"
+consistent!(woothee_9, r"(?i)(?:feed|web) ?parser");
+
+// woothee-0.8.0: r"(?i)watch ?dog"
+consistent!(woothee_10, r"(?i)watch ?dog");
+
+// woothee-0.8.0: r"Edge/([.0-9]+)"
+consistent!(woothee_11, r"Edge/([.0-9]+)");
+
+// woothee-0.8.0: r"MSIE ([.0-9]+);"
+consistent!(woothee_12, r"MSIE ([.0-9]+);");
+
+// woothee-0.8.0: r"Version/([.0-9]+)"
+consistent!(woothee_13, r"Version/([.0-9]+)");
+
+// woothee-0.8.0: r"Opera[/ ]([.0-9]+)"
+consistent!(woothee_14, r"Opera[/ ]([.0-9]+)");
+
+// woothee-0.8.0: r"OPR/([.0-9]+)"
+consistent!(woothee_15, r"OPR/([.0-9]+)");
+
+// woothee-0.8.0: r"Version/([.0-9]+)"
+consistent!(woothee_16, r"Version/([.0-9]+)");
+
+// woothee-0.8.0: r"(?:SoftBank|Vodafone|J-PHONE)/[.0-9]+/([^ /;()]+)"
+consistent!(woothee_17, r"(?:SoftBank|Vodafone|J-PHONE)/[.0-9]+/([^ /;()]+)");
+
+// woothee-0.8.0: r"Trident/([.0-9]+);"
+consistent!(woothee_18, r"Trident/([.0-9]+);");
+
+// woothee-0.8.0: r" rv:([.0-9]+)"
+consistent!(woothee_19, r" rv:([.0-9]+)");
+
+// woothee-0.8.0: r"IEMobile/([.0-9]+);"
+consistent!(woothee_20, r"IEMobile/([.0-9]+);");
+
+// woothee-0.8.0: r"(?:WILLCOM|DDIPOCKET);[^/]+/([^ /;()]+)"
+consistent!(woothee_21, r"(?:WILLCOM|DDIPOCKET);[^/]+/([^ /;()]+)");
+
+// woothee-0.8.0: r"Windows ([ .a-zA-Z0-9]+)[;\\)]"
+consistent!(woothee_22, r"Windows ([ .a-zA-Z0-9]+)[;\\)]");
+
+// woothee-0.8.0: r"^Phone(?: OS)? ([.0-9]+)"
+consistent!(woothee_23, r"^Phone(?: OS)? ([.0-9]+)");
+
+// woothee-0.8.0: r"iP(hone;|ad;|od) .*like Mac OS X"
+consistent!(woothee_24, r"iP(hone;|ad;|od) .*like Mac OS X");
+
+// woothee-0.8.0: r"Version/([.0-9]+)"
+consistent!(woothee_25, r"Version/([.0-9]+)");
+
+// woothee-0.8.0: r"rv:(\d+\.\d+\.\d+)"
+consistent!(woothee_26, r"rv:(\d+\.\d+\.\d+)");
+
+// woothee-0.8.0: r"FreeBSD ([^;\)]+);"
+consistent!(woothee_27, r"FreeBSD ([^;\)]+);");
+
+// woothee-0.8.0: r"CrOS ([^\)]+)\)"
+consistent!(woothee_28, r"CrOS ([^\)]+)\)");
+
+// woothee-0.8.0: r"Android[- ](\d+\.\d+(?:\.\d+)?)"
+consistent!(woothee_29, r"Android[- ](\d+\.\d+(?:\.\d+)?)");
+
+// woothee-0.8.0: r"PSP \(PlayStation Portable\); ([.0-9]+)\)"
+consistent!(woothee_30, r"PSP \(PlayStation Portable\); ([.0-9]+)\)");
+
+// woothee-0.8.0: r"PLAYSTATION 3;? ([.0-9]+)\)"
+consistent!(woothee_31, r"PLAYSTATION 3;? ([.0-9]+)\)");
+
+// woothee-0.8.0: r"PlayStation Vita ([.0-9]+)\)"
+consistent!(woothee_32, r"PlayStation Vita ([.0-9]+)\)");
+
+// woothee-0.8.0: r"PlayStation 4 ([.0-9]+)\)"
+consistent!(woothee_33, r"PlayStation 4 ([.0-9]+)\)");
+
+// woothee-0.8.0: r"BB10(?:.+)Version/([.0-9]+) "
+consistent!(woothee_34, r"BB10(?:.+)Version/([.0-9]+) ");
+
+// woothee-0.8.0: r"BlackBerry(?:\d+)/([.0-9]+) "
+consistent!(woothee_35, r"BlackBerry(?:\d+)/([.0-9]+) ");
+
+// woothee-0.8.0: r"; CPU(?: iPhone)? OS (\d+_\d+(?:_\d+)?) like Mac OS X"
+consistent!(
+ woothee_36,
+ r"; CPU(?: iPhone)? OS (\d+_\d+(?:_\d+)?) like Mac OS X"
+);
+
+// woothee-0.8.0: r"Mac OS X (10[._]\d+(?:[._]\d+)?)(?:\)|;)"
+consistent!(woothee_37, r"Mac OS X (10[._]\d+(?:[._]\d+)?)(?:\)|;)");
+
+// woothee-0.8.0: r"^(?:Apache-HttpClient/|Jakarta Commons-HttpClient/|Java/)"
+consistent!(
+ woothee_38,
+ r"^(?:Apache-HttpClient/|Jakarta Commons-HttpClient/|Java/)"
+);
+
+// woothee-0.8.0: r"[- ]HttpClient(/|$)"
+consistent!(woothee_39, r"[- ]HttpClient(/|$)");
+
+// woothee-0.8.0: r"^(?:PHP|WordPress|CakePHP|PukiWiki|PECL::HTTP)(?:/| |$)"
+consistent!(
+ woothee_40,
+ r"^(?:PHP|WordPress|CakePHP|PukiWiki|PECL::HTTP)(?:/| |$)"
+);
+
+// woothee-0.8.0: r"(?:PEAR HTTP_Request|HTTP_Request)(?: class|2)"
+consistent!(woothee_41, r"(?:PEAR HTTP_Request|HTTP_Request)(?: class|2)");
+
+// woothee-0.8.0: r"(?:Rome Client |UnwindFetchor/|ia_archiver |Summify |PostRank/)"
+consistent!(
+ woothee_42,
+ r"(?:Rome Client |UnwindFetchor/|ia_archiver |Summify |PostRank/)"
+);
+
+// woothee-0.8.0: r"Sleipnir/([.0-9]+)"
+consistent!(woothee_43, r"Sleipnir/([.0-9]+)");
+
+// word_replace-0.0.3: r"@@[a-z|A-Z|\d]+@@"
+consistent!(word_replace_0, r"@@[a-z|A-Z|\d]+@@");
+
+// wordcount-0.1.0: r"\w+"
+consistent!(wordcount_0, r"\w+");
+
+// just-0.3.12: "^([^=]+)=(.*)$"
+consistent!(just_0, "^([^=]+)=(.*)$");
+
+// emote-0.1.0: r":[a-zA-Z_]+?:"
+consistent!(emote_0, r":[a-zA-Z_]+?:");
+
+// emojicons-1.0.1: r":([a-zA-Z0-9_+-]+):"
+consistent!(emojicons_0, r":([a-zA-Z0-9_+-]+):");
+
+// git2_codecommit-0.1.2: r"git-codecommit\.([a-z0-9-]+)\.amazonaws\.com"
+consistent!(
+ git2_codecommit_0,
+ r"git-codecommit\.([a-z0-9-]+)\.amazonaws\.com"
+);
+
+// git-workarea-3.1.2: r"^submodule\.(?P<name>.*)\.(?P<key>[^=]*)=(?P<value>.*)$"
+consistent!(
+ git_workarea_0,
+ r"^submodule\.(?P<name>.*)\.(?P<key>[^=]*)=(?P<value>.*)$"
+);
+
+// git-shell-enforce-directory-1.0.0: r"^(?P<command>git-(?:receive|upload)-pack) '(?P<path>.+)'$"
+consistent!(
+ git_shell_enforce_directory_0,
+ r"^(?P<command>git-(?:receive|upload)-pack) '(?P<path>.+)'$"
+);
+
+// git-journal-1.6.3: r"[ \n]:(.*?):"
+consistent!(git_journal_0, r"[ \n]:(.*?):");
+
+// git-find-0.3.2: r"^git@(?P<host>[[:alnum:]\._-]+):(?P<path>[[:alnum:]\._\-/]+).git$"
+consistent!(
+ git_find_0,
+ r"^git@(?P<host>[[:alnum:]\._-]+):(?P<path>[[:alnum:]\._\-/]+).git$"
+);
+
+// gitlab-api-0.6.0: r"private_token=\w{20}"
+consistent!(gitlab_api_0, r"private_token=\w{20}");
+
+// td-client-0.7.0: "^(http://|https://)"
+consistent!(td_client_0, "^(http://|https://)");
+
+// karaconv-0.3.0: r"--(?P<type>[a-zA-Z]+)-- (?P<contents>.*)"
+consistent!(karaconv_0, r"--(?P<type>[a-zA-Z]+)-- (?P<contents>.*)");
+
+// katana-1.0.2: r"(?P<comp>et al\.)(?:\.)"
+consistent!(katana_0, r"(?P<comp>et al\.)(?:\.)");
+
+// katana-1.0.2: r"\.{3}"
+consistent!(katana_1, r"\.{3}");
+
+// katana-1.0.2: r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)"
+consistent!(katana_2, r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)");
+
+// katana-1.0.2: r"\s\.(?P<nums>[0-9]+)"
+consistent!(katana_3, r"\s\.(?P<nums>[0-9]+)");
+
+// katana-1.0.2: r"(?:[A-Za-z]\.){2,}"
+consistent!(katana_4, r"(?:[A-Za-z]\.){2,}");
+
+// katana-1.0.2: r"(?P<init>[A-Z])(?P<point>\.)"
+consistent!(katana_5, r"(?P<init>[A-Z])(?P<point>\.)");
+
+// katana-1.0.2: r"(?P<title>[A-Z][a-z]{1,3})(\.)"
+consistent!(katana_6, r"(?P<title>[A-Z][a-z]{1,3})(\.)");
+
+// katana-1.0.2: r"&==&(?P<p>[.!?])"
+consistent!(katana_7, r"&==&(?P<p>[.!?])");
+
+// katana-1.0.2: r"&\^&(?P<p>[.!?])"
+consistent!(katana_8, r"&\^&(?P<p>[.!?])");
+
+// katana-1.0.2: r"&\*\*&(?P<p>[.!?])"
+consistent!(katana_9, r"&\*\*&(?P<p>[.!?])");
+
+// katana-1.0.2: r"&=&(?P<p>[.!?])"
+consistent!(katana_10, r"&=&(?P<p>[.!?])");
+
+// katana-1.0.2: r"&##&(?P<p>[.!?])"
+consistent!(katana_11, r"&##&(?P<p>[.!?])");
+
+// katana-1.0.2: r"&\$&(?P<p>[.!?])"
+consistent!(katana_12, r"&\$&(?P<p>[.!?])");
+
+// kailua_syntax-1.1.0: r"@(?:_|\d+(?:/\d+(?:-\d+)?)?)"
+consistent!(kailua_syntax_0, r"@(?:_|\d+(?:/\d+(?:-\d+)?)?)");
+
+// kailua_syntax-1.1.0: r"<(\d+)>"
+consistent!(kailua_syntax_1, r"<(\d+)>");
+
+// ftp-3.0.1: r"\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\)"
+consistent!(ftp_0, r"\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\)");
+
+// ftp-3.0.1: r"\b(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})\b"
+consistent!(ftp_1, r"\b(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})\b");
+
+// ftp-3.0.1: r"\s+(\d+)\s*$"
+consistent!(ftp_2, r"\s+(\d+)\s*$");
+
+// vat-0.1.0: r"<countryCode>(.*?)</countryCode>"
+consistent!(vat_0, r"<countryCode>(.*?)</countryCode>");
+
+// vat-0.1.0: r"<vatNumber>(.*?)</vatNumber>"
+consistent!(vat_1, r"<vatNumber>(.*?)</vatNumber>");
+
+// vat-0.1.0: r"<name>(.*?)</name>"
+consistent!(vat_2, r"<name>(.*?)</name>");
+
+// vat-0.1.0: r"<address>(?s)(.*?)(?-s)</address>"
+consistent!(vat_3, r"<address>(?s)(.*?)(?-s)</address>");
+
+// vat-0.1.0: r"<valid>(true|false)</valid>"
+consistent!(vat_4, r"<valid>(true|false)</valid>");
+
+// vat-0.1.0: r"^ATU\d{8}$"
+consistent!(vat_5, r"^ATU\d{8}$");
+
+// vat-0.1.0: r"^BE0?\d{9, 10}$"
+consistent!(vat_6, r"^BE0?\d{9, 10}$");
+
+// vat-0.1.0: r"^BG\d{9,10}$"
+consistent!(vat_7, r"^BG\d{9,10}$");
+
+// vat-0.1.0: r"^HR\d{11}$"
+consistent!(vat_8, r"^HR\d{11}$");
+
+// vat-0.1.0: r"^CY\d{8}[A-Z]$"
+consistent!(vat_9, r"^CY\d{8}[A-Z]$");
+
+// vat-0.1.0: r"^CZ\d{8,10}$"
+consistent!(vat_10, r"^CZ\d{8,10}$");
+
+// vat-0.1.0: r"^DK\d{8}$"
+consistent!(vat_11, r"^DK\d{8}$");
+
+// vat-0.1.0: r"^EE\d{9}$"
+consistent!(vat_12, r"^EE\d{9}$");
+
+// vat-0.1.0: r"^FI\d{8}$"
+consistent!(vat_13, r"^FI\d{8}$");
+
+// vat-0.1.0: r"^FR[A-HJ-NP-Z0-9][A-HJ-NP-Z0-9]\d{9}$"
+consistent!(vat_14, r"^FR[A-HJ-NP-Z0-9][A-HJ-NP-Z0-9]\d{9}$");
+
+// vat-0.1.0: r"^DE\d{9}$"
+consistent!(vat_15, r"^DE\d{9}$");
+
+// vat-0.1.0: r"^EL\d{9}$"
+consistent!(vat_16, r"^EL\d{9}$");
+
+// vat-0.1.0: r"^HU\d{8}$"
+consistent!(vat_17, r"^HU\d{8}$");
+
+// vat-0.1.0: r"^IE\d[A-Z0-9\+\*]\d{5}[A-Z]{1,2}$"
+consistent!(vat_18, r"^IE\d[A-Z0-9\+\*]\d{5}[A-Z]{1,2}$");
+
+// vat-0.1.0: r"^IT\d{11}$"
+consistent!(vat_19, r"^IT\d{11}$");
+
+// vat-0.1.0: r"^LV\d{11}$"
+consistent!(vat_20, r"^LV\d{11}$");
+
+// vat-0.1.0: r"^LT(\d{9}|\d{12})$"
+consistent!(vat_21, r"^LT(\d{9}|\d{12})$");
+
+// vat-0.1.0: r"^LU\d{8}$"
+consistent!(vat_22, r"^LU\d{8}$");
+
+// vat-0.1.0: r"^MT\d{8}$"
+consistent!(vat_23, r"^MT\d{8}$");
+
+// vat-0.1.0: r"^NL\d{9}B\d{2}$"
+consistent!(vat_24, r"^NL\d{9}B\d{2}$");
+
+// vat-0.1.0: r"^PL\d{10}$"
+consistent!(vat_25, r"^PL\d{10}$");
+
+// vat-0.1.0: r"^PT\d{9}$"
+consistent!(vat_26, r"^PT\d{9}$");
+
+// vat-0.1.0: r"^RO\d{2,10}$"
+consistent!(vat_27, r"^RO\d{2,10}$");
+
+// vat-0.1.0: r"^SK\d{10}$"
+consistent!(vat_28, r"^SK\d{10}$");
+
+// vat-0.1.0: r"^SI\d{8}$"
+consistent!(vat_29, r"^SI\d{8}$");
+
+// vat-0.1.0: r"^ES[A-Z0-9]\d{7}[A-Z0-9]$"
+consistent!(vat_30, r"^ES[A-Z0-9]\d{7}[A-Z0-9]$");
+
+// vat-0.1.0: r"^SE\d{10}01$"
+consistent!(vat_31, r"^SE\d{10}01$");
+
+// vat-0.1.0: r"^(GB(GD|HA)\d{3}|GB\d{9}|GB\d{12})$"
+consistent!(vat_32, r"^(GB(GD|HA)\d{3}|GB\d{9}|GB\d{12})$");
+
+// eve-0.1.1: r"\{\{(.*)\}\}"
+consistent!(eve_0, r"\{\{(.*)\}\}");
+
+// egc-0.1.2: "^mio"
+consistent!(egc_0, "^mio");
+
+// pew-0.2.3: ""
+consistent!(pew_0, "");
+
+// pew-0.2.3: ""
+consistent!(pew_1, "");
+
+// mob-0.4.3: "y"
+consistent!(mob_0, "y");
+
+// lit-0.2.8: "@([a-z]+)"
+consistent!(lit_0, "@([a-z]+)");
+
+// lit-0.2.8: "([A-Z-]+):(.*)"
+consistent!(lit_1, "([A-Z-]+):(.*)");
+
+// lit-0.2.8: "^[a-zA-Z_][a-zA-Z0-9_]*$"
+consistent!(lit_2, "^[a-zA-Z_][a-zA-Z0-9_]*$");
+
+// avm-1.0.1: r"\d+\.\d+\.\d+"
+consistent!(avm_0, r"\d+\.\d+\.\d+");
+
+// avm-1.0.1: r"\d+\.\d+\.\d+"
+consistent!(avm_1, r"\d+\.\d+\.\d+");
+
+// orm-0.2.0: r"^Vec<(.+)>$"
+consistent!(orm_0, r"^Vec<(.+)>$");
+
+// sgf-0.1.5: r"\\(\r\n|\n\r|\n|\r)"
+consistent!(sgf_0, r"\\(\r\n|\n\r|\n|\r)");
+
+// sgf-0.1.5: r"\\(.)"
+consistent!(sgf_1, r"\\(.)");
+
+// sgf-0.1.5: r"\r\n|\n\r|\n|\r"
+consistent!(sgf_2, r"\r\n|\n\r|\n|\r");
+
+// sgf-0.1.5: r"([\]\\:])"
+consistent!(sgf_3, r"([\]\\:])");
+
+// dok-0.2.0: "^Bearer realm=\"(.+?)\",service=\"(.+?)\",scope=\"(.+?)\"$"
+consistent!(
+ dok_0,
+ "^Bearer realm=\"(.+?)\",service=\"(.+?)\",scope=\"(.+?)\"$"
+);
+
+// d20-0.1.0: r"([+-]?\s*\d+[dD]\d+|[+-]?\s*\d+)"
+consistent!(d20_0, r"([+-]?\s*\d+[dD]\d+|[+-]?\s*\d+)");
+
+// dvb-0.3.0: "E"
+consistent!(dvb_0, "E");
+
+// dvb-0.3.0: "^F"
+consistent!(dvb_1, "^F");
+
+// dvb-0.3.0: "^S"
+consistent!(dvb_2, "^S");
+
+// ger-0.2.0: r"Change-Id: (I[a-f0-9]{40})$"
+consistent!(ger_0, r"Change-Id: (I[a-f0-9]{40})$");
+
+// ger-0.2.0: r"(refs|ref|fix|fixes|close|closes)\s+([A-Z]{2,5}-[0-9]{1,5})$"
+consistent!(
+ ger_1,
+ r"(refs|ref|fix|fixes|close|closes)\s+([A-Z]{2,5}-[0-9]{1,5})$"
+);
+
+// n5-0.2.1: r"(\d+)(\.(\d+))?(\.(\d+))?(.*)"
+consistent!(n5_0, r"(\d+)(\.(\d+))?(\.(\d+))?(.*)");
+
+// po-0.1.4: r"[A-Za-z0-9]"
+consistent!(po_0, r"[A-Za-z0-9]");
+
+// carnix-0.8.5: "path is (‘|')?([^’'\n]*)(’|')?"
+consistent!(carnix_0, "path is (‘|')?([^’'\n]*)(’|')?");
+
+// carnix-0.8.5: r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?(.*)?"
+consistent!(carnix_1, r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?(.*)?");
+
+// carnix-0.8.5: r"(\d*)\.(\d*)\.(\d*)(-(\S*))?"
+consistent!(carnix_2, r"(\d*)\.(\d*)\.(\d*)(-(\S*))?");
+
+// carnix-0.8.5: r"(\S*)-(\d*)\.(\d*)\.(\d*)(-(\S*))?"
+consistent!(carnix_3, r"(\S*)-(\d*)\.(\d*)\.(\d*)(-(\S*))?");
+
+// caseless-0.2.1: r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$"
+consistent!(caseless_0, r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$");
+
+// caseless-0.2.1: r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"
+consistent!(caseless_1, r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);");
+
+// cabot-0.2.0: "\r?\n\r?\n"
+consistent!(cabot_0, "\r?\n\r?\n");
+
+// cabot-0.2.0: "\r?\n"
+consistent!(cabot_1, "\r?\n");
+
+// card-validate-2.2.1: r"^600"
+consistent!(card_validate_0, r"^600");
+
+// card-validate-2.2.1: r"^5019"
+consistent!(card_validate_1, r"^5019");
+
+// card-validate-2.2.1: r"^4"
+consistent!(card_validate_2, r"^4");
+
+// card-validate-2.2.1: r"^(5[1-5]|2[2-7])"
+consistent!(card_validate_3, r"^(5[1-5]|2[2-7])");
+
+// card-validate-2.2.1: r"^3[47]"
+consistent!(card_validate_4, r"^3[47]");
+
+// card-validate-2.2.1: r"^3[0689]"
+consistent!(card_validate_5, r"^3[0689]");
+
+// card-validate-2.2.1: r"^6([045]|22)"
+consistent!(card_validate_6, r"^6([045]|22)");
+
+// card-validate-2.2.1: r"^(62|88)"
+consistent!(card_validate_7, r"^(62|88)");
+
+// card-validate-2.2.1: r"^35"
+consistent!(card_validate_8, r"^35");
+
+// card-validate-2.2.1: r"^[0-9]+$"
+consistent!(card_validate_9, r"^[0-9]+$");
+
+// cargo-testify-0.3.0: r"\d{1,} passed.*filtered out"
+consistent!(cargo_testify_0, r"\d{1,} passed.*filtered out");
+
+// cargo-testify-0.3.0: r"error(:|\[).*"
+consistent!(cargo_testify_1, r"error(:|\[).*");
+
+// cargo-wix-0.0.5: r"<(.*?)>"
+consistent!(cargo_wix_0, r"<(.*?)>");
+
+// cargo-wix-0.0.5: r"<(.*?)>"
+consistent!(cargo_wix_1, r"<(.*?)>");
+
+// cargo-wix-0.0.5: r"<(.*?)>"
+consistent!(cargo_wix_2, r"<(.*?)>");
+
+// cargo-wix-0.0.5: r"<(.*?)>"
+consistent!(cargo_wix_3, r"<(.*?)>");
+
+// cargo-incremental-0.1.23: r"(?m)^incremental: re-using (\d+) out of (\d+) modules$"
+consistent!(
+ cargo_incremental_0,
+ r"(?m)^incremental: re-using (\d+) out of (\d+) modules$"
+);
+
+// cargo-incremental-0.1.23: "(?m)(warning|error): (.*)\n --> ([^:]:\\d+:\\d+)$"
+consistent!(
+ cargo_incremental_1,
+ "(?m)(warning|error): (.*)\n --> ([^:]:\\d+:\\d+)$"
+);
+
+// cargo-incremental-0.1.23: r"(?m)^test (.*) \.\.\. (\w+)"
+consistent!(cargo_incremental_2, r"(?m)^test (.*) \.\.\. (\w+)");
+
+// cargo-incremental-0.1.23: r"(?m)(\d+) passed; (\d+) failed; (\d+) ignored; \d+ measured"
+consistent!(
+ cargo_incremental_3,
+ r"(?m)(\d+) passed; (\d+) failed; (\d+) ignored; \d+ measured"
+);
+
+// cargo-testjs-0.1.2: r"^[^-]+-[0-9a-f]+\.js$"
+consistent!(cargo_testjs_0, r"^[^-]+-[0-9a-f]+\.js$");
+
+// cargo-tarpaulin-0.6.2: r"\s*//"
+consistent!(cargo_tarpaulin_0, r"\s*//");
+
+// cargo-tarpaulin-0.6.2: r"/\*"
+consistent!(cargo_tarpaulin_1, r"/\*");
+
+// cargo-tarpaulin-0.6.2: r"\*/"
+consistent!(cargo_tarpaulin_2, r"\*/");
+
+// cargo-culture-kit-0.1.0: r"^fo"
+consistent!(cargo_culture_kit_0, r"^fo");
+
+// cargo-screeps-0.1.3: "\\s+"
+consistent!(cargo_screeps_0, "\\s+");
+
+// cargo-brew-0.1.4: r"`(\S+) v([0-9.]+)"
+consistent!(cargo_brew_0, r"`(\S+) v([0-9.]+)");
+
+// cargo-release-0.10.2: "^\\[.+\\]"
+consistent!(cargo_release_0, "^\\[.+\\]");
+
+// cargo-release-0.10.2: "^\\[\\[.+\\]\\]"
+consistent!(cargo_release_1, "^\\[\\[.+\\]\\]");
+
+// cargo-edit-0.3.0-beta.1: r"^https://github.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$"
+consistent!(
+ cargo_edit_0,
+ r"^https://github.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$"
+);
+
+// cargo-edit-0.3.0-beta.1: r"^https://gitlab.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$"
+consistent!(
+ cargo_edit_1,
+ r"^https://gitlab.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$"
+);
+
+// cargo-disassemble-0.1.1: ".*"
+consistent!(cargo_disassemble_0, ".*");
+
+// cargo-demangle-0.1.2: r"(?m)(?P<symbol>_ZN[0-9]+.*E)"
+consistent!(cargo_demangle_0, r"(?m)(?P<symbol>_ZN[0-9]+.*E)");
+
+// cargo-coverage-annotations-0.1.5: r"^\s*\}(?:\)*;?|\s*else\s*\{)$"
+consistent!(cargo_coverage_annotations_0, r"^\s*\}(?:\)*;?|\s*else\s*\{)$");
+
+// cargo-urlcrate-1.0.1: "[\u{001b}\u{009b}][\\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><]"
+consistent!(cargo_urlcrate_0, "[\u{001b}\u{009b}][\\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><]");
+
+// cargo-script-0.2.8: r"^\s*\*( |$)"
+consistent!(cargo_script_0, r"^\s*\*( |$)");
+
+// cargo-script-0.2.8: r"^(\s+)"
+consistent!(cargo_script_1, r"^(\s+)");
+
+// cargo-script-0.2.8: r"/\*|\*/"
+consistent!(cargo_script_2, r"/\*|\*/");
+
+// cargo-script-0.2.8: r"^\s*//!"
+consistent!(cargo_script_3, r"^\s*//!");
+
+// cargo-script-0.2.8: r"^#![^\[].*?(\r\n|\n)"
+consistent!(cargo_script_4, r"^#![^\[].*?(\r\n|\n)");
+
+// cargo-update-1.5.2: r"cargo-install-update\.exe-v.+"
+consistent!(cargo_update_0, r"cargo-install-update\.exe-v.+");
+
+// canteen-0.4.1: r"^<(?:(int|uint|str|float|path):)?([\w_][a-zA-Z0-9_]*)>$"
+consistent!(
+ canteen_0,
+ r"^<(?:(int|uint|str|float|path):)?([\w_][a-zA-Z0-9_]*)>$"
+);
+
+// thruster-cli-0.1.3: r"(.)([A-Z])"
+consistent!(thruster_cli_0, r"(.)([A-Z])");
+
+// thieves-cant-0.1.0: "([Z]+)$"
+consistent!(thieves_cant_0, "([Z]+)$");
+
+// codeowners-0.1.3: r"^@\S+/\S+"
+consistent!(codeowners_0, r"^@\S+/\S+");
+
+// codeowners-0.1.3: r"^@\S+"
+consistent!(codeowners_1, r"^@\S+");
+
+// codeowners-0.1.3: r"^\S+@\S+"
+consistent!(codeowners_2, r"^\S+@\S+");
+
+// conserve-0.4.2: r"^b0000 {21} complete 20[-0-9T:+]+\s +\d+s\n$"
+consistent!(conserve_0, r"^b0000 {21} complete 20[-0-9T:+]+\s +\d+s\n$");
+
+// commodore-0.3.0: r"(?P<greeting>\S+?) (?P<name>\S+?)$"
+consistent!(commodore_0, r"(?P<greeting>\S+?) (?P<name>\S+?)$");
+
+// corollary-0.3.0: r"([ \t]*)```haskell([\s\S]*?)```"
+consistent!(corollary_0, r"([ \t]*)```haskell([\s\S]*?)```");
+
+// corollary-0.3.0: r"\b((?:a|b|t)\d*)\b"
+consistent!(corollary_1, r"\b((?:a|b|t)\d*)\b");
+
+// colorizex-0.1.3: "NB"
+consistent!(colorizex_0, "NB");
+
+// colorstring-0.0.1: r"(?i)\[[a-z0-9_-]+\]"
+consistent!(colorstring_0, r"(?i)\[[a-z0-9_-]+\]");
+
+// colorstring-0.0.1: r"^(?i)(\[[a-z0-9_-]+\])+"
+consistent!(colorstring_1, r"^(?i)(\[[a-z0-9_-]+\])+");
+
+// cosmogony-0.3.0: "name:(.+)"
+consistent!(cosmogony_0, "name:(.+)");
+
+// cobalt-bin-0.12.1: r"(?m:^ {0,3}\[[^\]]+\]:.+$)"
+consistent!(cobalt_bin_0, r"(?m:^ {0,3}\[[^\]]+\]:.+$)");
+
+// comrak-0.2.12: r"[^\p{L}\p{M}\p{N}\p{Pc} -]"
+consistent!(comrak_0, r"[^\p{L}\p{M}\p{N}\p{Pc} -]");
+
+// content-blocker-0.2.3: ""
+consistent!(content_blocker_0, "");
+
+// content-blocker-0.2.3: "(?i)hi"
+consistent!(content_blocker_1, "(?i)hi");
+
+// content-blocker-0.2.3: "http[s]?://domain.org"
+consistent!(content_blocker_2, "http[s]?://domain.org");
+
+// content-blocker-0.2.3: "(?i)http[s]?://domain.org"
+consistent!(content_blocker_3, "(?i)http[s]?://domain.org");
+
+// content-blocker-0.2.3: "http://domain.org"
+consistent!(content_blocker_4, "http://domain.org");
+
+// content-blocker-0.2.3: "http://domain.org"
+consistent!(content_blocker_5, "http://domain.org");
+
+// content-blocker-0.2.3: "ad.html"
+consistent!(content_blocker_6, "ad.html");
+
+// content-blocker-0.2.3: "ad.html"
+consistent!(content_blocker_7, "ad.html");
+
+// content-blocker-0.2.3: "http://domain.org"
+consistent!(content_blocker_8, "http://domain.org");
+
+// content-blocker-0.2.3: "http://domain.org/nocookies.sjs"
+consistent!(content_blocker_9, "http://domain.org/nocookies.sjs");
+
+// content-blocker-0.2.3: "http://domain.org/nocookies.sjs"
+consistent!(content_blocker_10, "http://domain.org/nocookies.sjs");
+
+// content-blocker-0.2.3: "http://domain.org/hideme.jpg"
+consistent!(content_blocker_11, "http://domain.org/hideme.jpg");
+
+// content-blocker-0.2.3: "http://domain.org/ok.html"
+consistent!(content_blocker_12, "http://domain.org/ok.html");
+
+// content-blocker-0.2.3: "http://domain.org/ok.html\\?except_this=1"
+consistent!(content_blocker_13, "http://domain.org/ok.html\\?except_this=1");
+
+// victoria-dom-0.1.2: "[A-Za-z0-9=]"
+consistent!(victoria_dom_0, "[A-Za-z0-9=]");
+
+// numbat-1.0.0: r"^nsq://"
+consistent!(numbat_0, r"^nsq://");
+
+// airkorea-0.1.2: r"[\s\t\r\n]"
+consistent!(airkorea_0, r"[\s\t\r\n]");
+
+// airkorea-0.1.2: r"([\{\[,])|([\}\]])"
+consistent!(airkorea_1, r"([\{\[,])|([\}\]])");
+
+// airkorea-0.1.2: r"[^.\d]+$"
+consistent!(airkorea_2, r"[^.\d]+$");
+
+// rofl-0.0.1: r"\b"
+// consistent!(rofl_0, r"\b");
+
+// rogcat-0.2.15: r"--------- beginning of.*"
+consistent!(rogcat_0, r"--------- beginning of.*");
+
+// rogcat-0.2.15: r"a|e|i|o|u"
+consistent!(rogcat_1, r"a|e|i|o|u");
+
+// rogcat-0.2.15: r"^(\d+)([kMG])$"
+consistent!(rogcat_2, r"^(\d+)([kMG])$");
+
+// media_filename-0.1.4: "\\.([A-Za-z0-9]{2,4})$"
+consistent!(media_filename_0, "\\.([A-Za-z0-9]{2,4})$");
+
+// media_filename-0.1.4: "([0-9]{3,4}p|[0-9]{3,4}x[0-9]{3,4})"
+consistent!(media_filename_1, "([0-9]{3,4}p|[0-9]{3,4}x[0-9]{3,4})");
+
+// media_filename-0.1.4: "(?:^\\[([^]]+)\\]|- ?([^-]+)$)"
+consistent!(media_filename_2, "(?:^\\[([^]]+)\\]|- ?([^-]+)$)");
+
+// media_filename-0.1.4: "(?:[eE]([0-9]{2,3})|[^0-9A-Za-z]([0-9]{2,3})(?:v[0-9])?[^0-9A-Za-z])"
+consistent!(
+ media_filename_3,
+ "(?:[eE]([0-9]{2,3})|[^0-9A-Za-z]([0-9]{2,3})(?:v[0-9])?[^0-9A-Za-z])"
+);
+
+// media_filename-0.1.4: "[sS]([0-9]{1,2})"
+consistent!(media_filename_4, "[sS]([0-9]{1,2})");
+
+// media_filename-0.1.4: "((?i)(?:PPV.)?[HP]DTV|(?:HD)?CAM|BRRIP|[^a-z]TS[^a-z]|(?:PPV )?WEB.?DL(?: DVDRip)?|HDRip|DVDRip|CamRip|W[EB]BRip|BluRay|BD|DVD|DvDScr|hdtv)"
+consistent!(media_filename_5, "((?i)(?:PPV.)?[HP]DTV|(?:HD)?CAM|BRRIP|[^a-z]TS[^a-z]|(?:PPV )?WEB.?DL(?: DVDRip)?|HDRip|DVDRip|CamRip|W[EB]BRip|BluRay|BD|DVD|DvDScr|hdtv)");
+
+// media_filename-0.1.4: "((19[0-9]|20[01])[0-9])"
+consistent!(media_filename_6, "((19[0-9]|20[01])[0-9])");
+
+// media_filename-0.1.4: "((?i)xvid|x264|h\\.?264)"
+consistent!(media_filename_7, "((?i)xvid|x264|h\\.?264)");
+
+// media_filename-0.1.4: "((?i)MP3|DD5\\.?1|Dual[- ]Audio|LiNE|DTS|AAC(?:\\.?2\\.0)?|AC3(?:\\.5\\.1)?)"
+consistent!(media_filename_8, "((?i)MP3|DD5\\.?1|Dual[- ]Audio|LiNE|DTS|AAC(?:\\.?2\\.0)?|AC3(?:\\.5\\.1)?)");
+
+// media_filename-0.1.4: "\\[([0-9A-F]{8})\\]"
+consistent!(media_filename_9, "\\[([0-9A-F]{8})\\]");
+
+// termimage-0.3.2: r"(\d+)[xX](\d+)"
+consistent!(termimage_0, r"(\d+)[xX](\d+)");
+
+// teensy-0.1.0: r".*(\d{4}-\d{2}-\d{2}).*"
+consistent!(teensy_0, r".*(\d{4}-\d{2}-\d{2}).*");
+
+// telescreen-0.1.3: r"<@(.+)>"
+consistent!(telescreen_0, r"<@(.+)>");
+
+// tempus_fugit-0.4.4: r"^(\d+)"
+consistent!(tempus_fugit_0, r"^(\d+)");
+
+// fselect-0.4.1: "(\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)"
+consistent!(fselect_0, "(\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)");
+
+// fselect-0.4.1: "(%|_|\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)"
+consistent!(fselect_1, "(%|_|\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)");
+
+// fs_eventbridge-0.1.0: r"^([A-Z]+)(?:\s(.+))?\s*"
+consistent!(fs_eventbridge_0, r"^([A-Z]+)(?:\s(.+))?\s*");
+
+// joseki-0.0.1: r"(\w{1,2})\[(.+?)\]"
+consistent!(joseki_0, r"(\w{1,2})\[(.+?)\]");
+
+// tweetr-0.2.1: r"(?i)in (\d+) (second|minute|hour|day|week)s?"
+consistent!(tweetr_0, r"(?i)in (\d+) (second|minute|hour|day|week)s?");
+
+// bullet_core-0.1.1: "^(?u:[0-9])+"
+consistent!(bullet_core_0, "^(?u:[0-9])+");
+
+// bullet_core-0.1.1: "^(?u:[0-9])+(?u:\\.)(?u:[0-9])+"
+consistent!(bullet_core_1, "^(?u:[0-9])+(?u:\\.)(?u:[0-9])+");
+
+// bullet_core-0.1.1: "^(?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+"
+consistent!(bullet_core_2, "^(?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+");
+
+// bullet_core-0.1.1: "^(?u:d/d)((?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+)"
+consistent!(bullet_core_3, "^(?u:d/d)((?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+)");
+
+// bullet_core-0.1.1: "^(?u:\\()"
+consistent!(bullet_core_4, "^(?u:\\()");
+
+// bullet_core-0.1.1: "^(?u:\\))"
+consistent!(bullet_core_5, "^(?u:\\))");
+
+// bullet_core-0.1.1: "^(?u:\\*)"
+consistent!(bullet_core_6, "^(?u:\\*)");
+
+// bullet_core-0.1.1: "^(?u:\\+)"
+consistent!(bullet_core_7, "^(?u:\\+)");
+
+// bullet_core-0.1.1: "^(?u:,)"
+consistent!(bullet_core_8, "^(?u:,)");
+
+// bullet_core-0.1.1: "^(?u:\\-)"
+consistent!(bullet_core_9, "^(?u:\\-)");
+
+// bullet_core-0.1.1: "^(?u:/)"
+consistent!(bullet_core_10, "^(?u:/)");
+
+// bullet_core-0.1.1: "^(?u:\\[)"
+consistent!(bullet_core_11, "^(?u:\\[)");
+
+// bullet_core-0.1.1: "^(?u:\\])"
+consistent!(bullet_core_12, "^(?u:\\])");
+
+// bullet_core-0.1.1: "^(?u:\\^)"
+consistent!(bullet_core_13, "^(?u:\\^)");
+
+// bullet_core-0.1.1: "^(?u:·)"
+consistent!(bullet_core_14, "^(?u:·)");
+
+// actix-web-0.6.13: "//+"
+consistent!(actix_web_0, "//+");
+
+// actix-web-0.6.13: "//+"
+consistent!(actix_web_1, "//+");
+
+// althea_kernel_interface-0.1.0: r"(\S*) .* (\S*) (REACHABLE|STALE|DELAY)"
+consistent!(
+ althea_kernel_interface_0,
+ r"(\S*) .* (\S*) (REACHABLE|STALE|DELAY)"
+);
+
+// althea_kernel_interface-0.1.0: r"-s (.*) --ip6-dst (.*)/.* bcnt = (.*)"
+consistent!(
+ althea_kernel_interface_1,
+ r"-s (.*) --ip6-dst (.*)/.* bcnt = (.*)"
+);
+
+// alcibiades-0.3.0: r"\buci(?:\s|$)"
+consistent!(alcibiades_0, r"\buci(?:\s|$)");
+
+// ruma-identifiers-0.11.0: r"\A[a-z0-9._=-]+\z"
+consistent!(ruma_identifiers_0, r"\A[a-z0-9._=-]+\z");
+
+// rusqbin-0.2.3: r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})$"
+consistent!(rusqbin_0, r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})$");
+
+// rusqbin-0.2.3: r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})/requests/?$"
+consistent!(rusqbin_1, r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})/requests/?$");
+
+// rust-install-0.0.4: r"^(nightly|beta|stable)(?:-(\d{4}-\d{2}-\d{2}))?$"
+consistent!(
+ rust_install_0,
+ r"^(nightly|beta|stable)(?:-(\d{4}-\d{2}-\d{2}))?$"
+);
+
+// rust_inbox-0.0.5: "^+(.*)\r\n"
+consistent!(rust_inbox_0, "^+(.*)\r\n");
+
+// rust_inbox-0.0.5: r"^\* CAPABILITY (.*)\r\n"
+consistent!(rust_inbox_1, r"^\* CAPABILITY (.*)\r\n");
+
+// rust_inbox-0.0.5: r"^([a-zA-Z0-9]+) (OK|NO|BAD)(.*)"
+consistent!(rust_inbox_2, r"^([a-zA-Z0-9]+) (OK|NO|BAD)(.*)");
+
+// rust_inbox-0.0.5: r"^\* (\d+) EXISTS\r\n"
+consistent!(rust_inbox_3, r"^\* (\d+) EXISTS\r\n");
+
+// rust_inbox-0.0.5: r"^\* (\d+) RECENT\r\n"
+consistent!(rust_inbox_4, r"^\* (\d+) RECENT\r\n");
+
+// rust_inbox-0.0.5: r"^\* FLAGS (.+)\r\n"
+consistent!(rust_inbox_5, r"^\* FLAGS (.+)\r\n");
+
+// rust_inbox-0.0.5: r"^\* OK \[UNSEEN (\d+)\](.*)\r\n"
+consistent!(rust_inbox_6, r"^\* OK \[UNSEEN (\d+)\](.*)\r\n");
+
+// rust_inbox-0.0.5: r"^\* OK \[UIDVALIDITY (\d+)\](.*)\r\n"
+consistent!(rust_inbox_7, r"^\* OK \[UIDVALIDITY (\d+)\](.*)\r\n");
+
+// rust_inbox-0.0.5: r"^\* OK \[UIDNEXT (\d+)\](.*)\r\n"
+consistent!(rust_inbox_8, r"^\* OK \[UIDNEXT (\d+)\](.*)\r\n");
+
+// rust_inbox-0.0.5: r"^\* OK \[PERMANENTFLAGS (.+)\](.*)\r\n"
+consistent!(rust_inbox_9, r"^\* OK \[PERMANENTFLAGS (.+)\](.*)\r\n");
+
+// rustml-0.0.7: r"^[a-z]+ (\d+)$"
+consistent!(rustml_0, r"^[a-z]+ (\d+)$");
+
+// rustml-0.0.7: r"^[a-z]+ (\d+)$"
+consistent!(rustml_1, r"^[a-z]+ (\d+)$");
+
+// rustml-0.0.7: r"^[a-z]+ (\d+)$"
+consistent!(rustml_2, r"^[a-z]+ (\d+)$");
+
+// rustfmt-0.10.0: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"
+consistent!(rustfmt_0, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*");
+
+// rustfmt-core-0.4.0: r"(^\s*$)|(^\s*//\s*rustfmt-[^:]+:\s*\S+)"
+consistent!(rustfmt_core_0, r"(^\s*$)|(^\s*//\s*rustfmt-[^:]+:\s*\S+)");
+
+// rustfmt-core-0.4.0: r"^## `([^`]+)`"
+consistent!(rustfmt_core_1, r"^## `([^`]+)`");
+
+// rustfmt-core-0.4.0: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"
+consistent!(rustfmt_core_2, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*");
+
+// rustfmt-core-0.4.0: r"\s;"
+consistent!(rustfmt_core_3, r"\s;");
+
+// rust-enum-derive-0.4.0: r"^(0x)?([:digit:]+)$"
+consistent!(rust_enum_derive_0, r"^(0x)?([:digit:]+)$");
+
+// rust-enum-derive-0.4.0: r"^([:digit:]+)[:space:]*<<[:space:]*([:digit:]+)$"
+consistent!(
+ rust_enum_derive_1,
+ r"^([:digit:]+)[:space:]*<<[:space:]*([:digit:]+)$"
+);
+
+// rust-enum-derive-0.4.0: r"^[:space:]*([[:alnum:]_]+)([:space:]*=[:space:]*([:graph:]+))?[:space:]*,"
+consistent!(rust_enum_derive_2, r"^[:space:]*([[:alnum:]_]+)([:space:]*=[:space:]*([:graph:]+))?[:space:]*,");
+
+// rust-enum-derive-0.4.0: r"^#define[:space:]+([:graph:]+)[:space:]+([:graph:]+)"
+consistent!(
+ rust_enum_derive_3,
+ r"^#define[:space:]+([:graph:]+)[:space:]+([:graph:]+)"
+);
+
+// rustsourcebundler-0.2.0: r"^\s*pub mod (.+);$"
+consistent!(rustsourcebundler_0, r"^\s*pub mod (.+);$");
+
+// rustsourcebundler-0.2.0: r"^\s*pub mod (.+);$"
+consistent!(rustsourcebundler_1, r"^\s*pub mod (.+);$");
+
+// rustfmt-nightly-0.8.2: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"
+consistent!(rustfmt_nightly_0, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*");
+
+// rustfmt-nightly-0.8.2: r"\s;"
+consistent!(rustfmt_nightly_1, r"\s;");
+
+// rustache-0.1.0: r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)"
+consistent!(rustache_0, r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)");
+
+// rustfilt-0.2.0: r"_ZN[\$\._[:alnum:]]*"
+consistent!(rustfilt_0, r"_ZN[\$\._[:alnum:]]*");
+
+// rustache-lists-0.1.2: r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)"
+consistent!(rustache_lists_0, r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)");
+
+// rural-0.7.3: "(.+)=(.+)"
+consistent!(rural_0, "(.+)=(.+)");
+
+// rural-0.7.3: "(.*):(.+)"
+consistent!(rural_1, "(.*):(.+)");
+
+// rural-0.7.3: "(.+):=(.+)"
+consistent!(rural_2, "(.+):=(.+)");
+
+// rural-0.7.3: "(.*)==(.+)"
+consistent!(rural_3, "(.*)==(.+)");
+
+// rusoto_credential-0.11.0: r"^\[([^\]]+)\]$"
+consistent!(rusoto_credential_0, r"^\[([^\]]+)\]$");
+
+// rumblebars-0.3.0: "([:blank:]*)$"
+consistent!(rumblebars_0, "([:blank:]*)$");
+
+// rumblebars-0.3.0: "(\r?\n)[:blank:]*(\\{\\{~?[#!/](?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z"
+consistent!(rumblebars_1, "(\r?\n)[:blank:]*(\\{\\{~?[#!/](?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z");
+
+// rumblebars-0.3.0: "(\r?\n[:blank:]*)(\\{\\{~?>(?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z"
+consistent!(
+ rumblebars_2,
+ "(\r?\n[:blank:]*)(\\{\\{~?>(?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z"
+);
+
+// rumblebars-0.3.0: "((?:[:blank:]|\r?\n)*)(\r?\n)[:blank:]*$"
+consistent!(rumblebars_3, "((?:[:blank:]|\r?\n)*)(\r?\n)[:blank:]*$");
+
+// rumblebars-0.3.0: "^([:blank:]*\r?\n)(.*)"
+consistent!(rumblebars_4, "^([:blank:]*\r?\n)(.*)");
+
+// diesel_cli-1.3.1: r"(?P<stamp>[\d-]*)_hello"
+consistent!(diesel_cli_0, r"(?P<stamp>[\d-]*)_hello");
+
+// dishub-0.1.1: r"(\d+)s"
+consistent!(dishub_0, r"(\d+)s");
+
+// spreadsheet_textconv-0.1.0: r"\n"
+consistent!(spreadsheet_textconv_0, r"\n");
+
+// spreadsheet_textconv-0.1.0: r"\r"
+consistent!(spreadsheet_textconv_1, r"\r");
+
+// spreadsheet_textconv-0.1.0: r"\t"
+consistent!(spreadsheet_textconv_2, r"\t");
+
+// split_aud-0.1.0: r"DELAY (-?\d+)ms"
+consistent!(split_aud_0, r"DELAY (-?\d+)ms");
+
+// split_aud-0.1.0: r"Trim\((\d+), ?(\d+)\)"
+consistent!(split_aud_1, r"Trim\((\d+), ?(\d+)\)");
+
+// spotrust-0.0.5: r"spotify:[a-z]+:[a-zA-Z0-9]+"
+consistent!(spotrust_0, r"spotify:[a-z]+:[a-zA-Z0-9]+");
+
+// spaceslugs-0.1.0: r"[^\x00-\x7F]"
+consistent!(spaceslugs_0, r"[^\x00-\x7F]");
+
+// spaceslugs-0.1.0: r"[']+"
+consistent!(spaceslugs_1, r"[']+");
+
+// spaceslugs-0.1.0: r"\W+"
+consistent!(spaceslugs_2, r"\W+");
+
+// spaceslugs-0.1.0: r"[ ]+"
+consistent!(spaceslugs_3, r"[ ]+");
+
+// space_email_api-0.1.1: "PHPSESSID=([0-9a-f]+)"
+consistent!(space_email_api_0, "PHPSESSID=([0-9a-f]+)");
+
+// lorikeet-0.7.0: "[^0-9.,]"
+consistent!(lorikeet_0, "[^0-9.,]");
+
+// claude-0.3.0: r"^(?:\b|(-)?)(\p{Currency_Symbol})?((?:(?:\d{1,3}[\.,])+\d{3})|\d+)(?:[\.,](\d{2}))?\b$"
+consistent!(claude_0, r"^(?:\b|(-)?)(\p{Currency_Symbol})?((?:(?:\d{1,3}[\.,])+\d{3})|\d+)(?:[\.,](\d{2}))?\b$");
+
+// clam-0.1.6: r"<%=\s*(.+?)\s*%>"
+consistent!(clam_0, r"<%=\s*(.+?)\s*%>");
+
+// classifier-0.0.3: r"(\s)"
+consistent!(classifier_0, r"(\s)");
+
+// click-0.3.2: r"(-----BEGIN .*-----\n)((?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)*\n)+)(-----END .*-----)"
+consistent!(click_0, r"(-----BEGIN .*-----\n)((?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)*\n)+)(-----END .*-----)");
+
+// click-0.3.2: r"-----BEGIN PRIVATE KEY-----"
+consistent!(click_1, r"-----BEGIN PRIVATE KEY-----");
+
+// ultrastar-txt-0.1.2: r"#([A-Z3a-z]*):(.*)"
+consistent!(ultrastar_txt_0, r"#([A-Z3a-z]*):(.*)");
+
+// ultrastar-txt-0.1.2: "^-\\s?(-?[0-9]+)\\s*$"
+consistent!(ultrastar_txt_1, "^-\\s?(-?[0-9]+)\\s*$");
+
+// ultrastar-txt-0.1.2: "^-\\s?(-?[0-9]+)\\s+(-?[0-9]+)"
+consistent!(ultrastar_txt_2, "^-\\s?(-?[0-9]+)\\s+(-?[0-9]+)");
+
+// ultrastar-txt-0.1.2: "^(.)\\s*(-?[0-9]+)\\s+(-?[0-9]+)\\s+(-?[0-9]+)\\s?(.*)"
+consistent!(
+ ultrastar_txt_3,
+ "^(.)\\s*(-?[0-9]+)\\s+(-?[0-9]+)\\s+(-?[0-9]+)\\s?(.*)"
+);
+
+// ultrastar-txt-0.1.2: "^P\\s?(-?[0-9]+)"
+consistent!(ultrastar_txt_4, "^P\\s?(-?[0-9]+)");
+
+// db-accelerate-2.0.0: r"^template\.add($|\..+$)"
+consistent!(db_accelerate_0, r"^template\.add($|\..+$)");
+
+// db-accelerate-2.0.0: r"^template\.sub($|\..+$)"
+consistent!(db_accelerate_1, r"^template\.sub($|\..+$)");
+
+// sterling-0.3.0: r"(\d+)([cegps])"
+consistent!(sterling_0, r"(\d+)([cegps])");
+
+// stache-0.2.0: r"[^\w]"
+consistent!(stache_0, r"[^\w]");
+
+// strukt-0.1.0: "\"([<>]?)([xcbB\\?hHiIlLqQfdspP]*)\""
+consistent!(strukt_0, "\"([<>]?)([xcbB\\?hHiIlLqQfdspP]*)\"");
+
+// steamid-ng-0.3.1: r"^STEAM_([0-4]):([0-1]):([0-9]{1,10})$"
+consistent!(steamid_ng_0, r"^STEAM_([0-4]):([0-1]):([0-9]{1,10})$");
+
+// steamid-ng-0.3.1: r"^\[([AGMPCgcLTIUai]):([0-4]):([0-9]{1,10})(:([0-9]+))?\]$"
+consistent!(
+ steamid_ng_1,
+ r"^\[([AGMPCgcLTIUai]):([0-4]):([0-9]{1,10})(:([0-9]+))?\]$"
+);
+
+// strscan-0.1.1: r"^\w+"
+consistent!(strscan_0, r"^\w+");
+
+// strscan-0.1.1: r"^\s+"
+consistent!(strscan_1, r"^\s+");
+
+// strscan-0.1.1: r"^\w+"
+consistent!(strscan_2, r"^\w+");
+
+// strscan-0.1.1: r"^\s+"
+consistent!(strscan_3, r"^\s+");
+
+// strscan-0.1.1: r"^(\w+)\s+"
+consistent!(strscan_4, r"^(\w+)\s+");
+
+// tk-carbon-0.2.0: r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$"
+consistent!(tk_carbon_0, r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$");
+
+// tk-carbon-0.2.0: r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$"
+consistent!(tk_carbon_1, r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$");
+
+// evalrs-0.0.10: r"extern\s+crate\s+([a-z0-9_]+)\s*;(\s*//(.+))?"
+consistent!(evalrs_0, r"extern\s+crate\s+([a-z0-9_]+)\s*;(\s*//(.+))?");
+
+// evalrs-0.0.10: r"(?m)^# "
+consistent!(evalrs_1, r"(?m)^# ");
+
+// evalrs-0.0.10: r"(?m)^\s*fn +main *\( *\)"
+consistent!(evalrs_2, r"(?m)^\s*fn +main *\( *\)");
+
+// evalrs-0.0.10: r"(extern\s+crate\s+[a-z0-9_]+\s*;)"
+consistent!(evalrs_3, r"(extern\s+crate\s+[a-z0-9_]+\s*;)");
+
+// gate_build-0.5.0: "(.*)_t([0-9]+)"
+consistent!(gate_build_0, "(.*)_t([0-9]+)");
+
+// rake-0.1.1: r"[^\P{P}-]|\s+-\s+"
+consistent!(rake_0, r"[^\P{P}-]|\s+-\s+");
+
+// rafy-0.2.1: r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*"
+consistent!(rafy_0, r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*");
+
+// raven-0.2.1: r"^(?P<protocol>.*?)://(?P<public_key>.*?):(?P<secret_key>.*?)@(?P<host>.*?)/(?P<path>.*/)?(?P<project_id>.*)$"
+consistent!(raven_0, r"^(?P<protocol>.*?)://(?P<public_key>.*?):(?P<secret_key>.*?)@(?P<host>.*?)/(?P<path>.*/)?(?P<project_id>.*)$");
+
+// rargs-0.2.0: r"\{[[:space:]]*[^{}]*[[:space:]]*\}"
+consistent!(rargs_0, r"\{[[:space:]]*[^{}]*[[:space:]]*\}");
+
+// rargs-0.2.0: r"^\{[[:space:]]*(?P<name>[[:word:]]*)[[:space:]]*\}$"
+consistent!(rargs_1, r"^\{[[:space:]]*(?P<name>[[:word:]]*)[[:space:]]*\}$");
+
+// rargs-0.2.0: r"^\{[[:space:]]*(?P<num>-?\d+)[[:space:]]*\}$"
+consistent!(rargs_2, r"^\{[[:space:]]*(?P<num>-?\d+)[[:space:]]*\}$");
+
+// rargs-0.2.0: r"^\{(?P<left>-?\d*)?\.\.(?P<right>-?\d*)?(?::(?P<sep>.*))?\}$"
+consistent!(
+ rargs_3,
+ r"^\{(?P<left>-?\d*)?\.\.(?P<right>-?\d*)?(?::(?P<sep>.*))?\}$"
+);
+
+// rargs-0.2.0: r"(.*?)[[:space:]]+|(.*?)$"
+consistent!(rargs_4, r"(.*?)[[:space:]]+|(.*?)$");
+
+// indradb-lib-0.15.0: r"[a-zA-Z0-9]{8}"
+consistent!(indradb_lib_0, r"[a-zA-Z0-9]{8}");
+
+// fungi-lang-0.1.50: r"::"
+consistent!(fungi_lang_0, r"::");
+
+// nickel-0.10.1: "/hello/(?P<name>[a-zA-Z]+)"
+consistent!(nickel_0, "/hello/(?P<name>[a-zA-Z]+)");
+
+// nickel-0.10.1: "/hello/(?P<name>[a-zA-Z]+)"
+consistent!(nickel_1, "/hello/(?P<name>[a-zA-Z]+)");
+
+// pact_verifier-0.4.0: r"\{(\w+)\}"
+consistent!(pact_verifier_0, r"\{(\w+)\}");
+
+// pact_matching-0.4.1: "application/.*json"
+consistent!(pact_matching_0, "application/.*json");
+
+// pact_matching-0.4.1: "application/json.*"
+consistent!(pact_matching_1, "application/json.*");
+
+// pact_matching-0.4.1: "application/.*xml"
+consistent!(pact_matching_2, "application/.*xml");
+
+// pangu-0.2.0: "([\"'\\(\\[\\{{<\u{201c}])(\\s*)(.+?)(\\s*)([\"'\\)\\]\\}}>\u{201d}])"
+consistent!(
+ pangu_0,
+ "([\"'\\(\\[\\{{<\u{201c}])(\\s*)(.+?)(\\s*)([\"'\\)\\]\\}}>\u{201d}])"
+);
+
+// pangu-0.2.0: "([\\(\\[\\{{<\u{201c}]+)(\\s*)(.+?)(\\s*)([\\)\\]\\}}>\u{201d}]+)"
+consistent!(
+ pangu_1,
+ "([\\(\\[\\{{<\u{201c}]+)(\\s*)(.+?)(\\s*)([\\)\\]\\}}>\u{201d}]+)"
+);
+
+// parser-haskell-0.2.0: r"\{-[\s\S]*?-\}"
+consistent!(parser_haskell_0, r"\{-[\s\S]*?-\}");
+
+// parser-haskell-0.2.0: r"(?m);+\s*$"
+consistent!(parser_haskell_1, r"(?m);+\s*$");
+
+// parser-haskell-0.2.0: r"(?m)^#(if|ifn?def|endif|else|include|elif).*"
+consistent!(parser_haskell_2, r"(?m)^#(if|ifn?def|endif|else|include|elif).*");
+
+// parser-haskell-0.2.0: r"'([^'\\]|\\[A-Z]{1,3}|\\.)'"
+consistent!(parser_haskell_3, r"'([^'\\]|\\[A-Z]{1,3}|\\.)'");
+
+// parser-haskell-0.2.0: r"forall\s+(.*?)\."
+consistent!(parser_haskell_4, r"forall\s+(.*?)\.");
+
+// html2md-0.2.1: "\\s{2,}"
+consistent!(html2md_0, "\\s{2,}");
+
+// html2md-0.2.1: "\\n{2,}"
+consistent!(html2md_1, "\\n{2,}");
+
+// html2md-0.2.1: "(?m)(\\S) $"
+consistent!(html2md_2, "(?m)(\\S) $");
+
+// html2md-0.2.1: "(?m)^[-*] "
+consistent!(html2md_3, "(?m)^[-*] ");
+
+// ovpnfile-0.1.2: r"#.*$"
+consistent!(ovpnfile_0, r"#.*$");
+
+// ovpnfile-0.1.2: r"^<(\S+)>"
+consistent!(ovpnfile_1, r"^<(\S+)>");
+
+// ovpnfile-0.1.2: r"^</(\S+)>"
+consistent!(ovpnfile_2, r"^</(\S+)>");
+
+// screenruster-saver-fractal-0.1.1: r"#([:xdigit:]{2})([:xdigit:]{2})([:xdigit:]{2})"
+consistent!(
+ screenruster_saver_fractal_0,
+ r"#([:xdigit:]{2})([:xdigit:]{2})([:xdigit:]{2})"
+);
+
+// scarlet-0.2.2: r"rgb\((?: *(\d{1,3}),)(?: *(\d{1,3}),)(?: *(\d{1,3}))\)"
+consistent!(
+ scarlet_0,
+ r"rgb\((?: *(\d{1,3}),)(?: *(\d{1,3}),)(?: *(\d{1,3}))\)"
+);
+
+// cpp_to_rust_generator-0.2.0: r"^([\w:]+)<(.+)>$"
+consistent!(cpp_to_rust_generator_0, r"^([\w:]+)<(.+)>$");
+
+// cpp_to_rust_generator-0.2.0: r"^type-parameter-(\d+)-(\d+)$"
+consistent!(cpp_to_rust_generator_1, r"^type-parameter-(\d+)-(\d+)$");
+
+// cpp_to_rust_generator-0.2.0: r"^([\w~]+)<[^<>]+>$"
+consistent!(cpp_to_rust_generator_2, r"^([\w~]+)<[^<>]+>$");
+
+// cpp_to_rust_generator-0.2.0: r"(signals|Q_SIGNALS)\s*:"
+consistent!(cpp_to_rust_generator_3, r"(signals|Q_SIGNALS)\s*:");
+
+// cpp_to_rust_generator-0.2.0: r"(slots|Q_SLOTS)\s*:"
+consistent!(cpp_to_rust_generator_4, r"(slots|Q_SLOTS)\s*:");
+
+// cpp_to_rust_generator-0.2.0: r"(public|protected|private)\s*:"
+consistent!(cpp_to_rust_generator_5, r"(public|protected|private)\s*:");
+
+// cpp_to_rust-0.5.3: r"^([\w:]+)<(.+)>$"
+consistent!(cpp_to_rust_0, r"^([\w:]+)<(.+)>$");
+
+// cpp_to_rust-0.5.3: r"^type-parameter-(\d+)-(\d+)$"
+consistent!(cpp_to_rust_1, r"^type-parameter-(\d+)-(\d+)$");
+
+// cpp_to_rust-0.5.3: r"^([\w~]+)<[^<>]+>$"
+consistent!(cpp_to_rust_2, r"^([\w~]+)<[^<>]+>$");
+
+// cpp_to_rust-0.5.3: r"(signals|Q_SIGNALS)\s*:"
+consistent!(cpp_to_rust_3, r"(signals|Q_SIGNALS)\s*:");
+
+// cpp_to_rust-0.5.3: r"(slots|Q_SLOTS)\s*:"
+consistent!(cpp_to_rust_4, r"(slots|Q_SLOTS)\s*:");
+
+// cpp_to_rust-0.5.3: r"(public|protected|private)\s*:"
+consistent!(cpp_to_rust_5, r"(public|protected|private)\s*:");
+
+// fritzbox_logs-0.2.0: "(\\d{2}\\.\\d{2}\\.\\d{2}) (\\d{2}:\\d{2}:\\d{2}) (.*)"
+consistent!(
+ fritzbox_logs_0,
+ "(\\d{2}\\.\\d{2}\\.\\d{2}) (\\d{2}:\\d{2}:\\d{2}) (.*)"
+);
+
+// fractal-matrix-api-3.29.0: r"mxc://(?P<server>[^/]+)/(?P<media>.+)"
+consistent!(fractal_matrix_api_0, r"mxc://(?P<server>[^/]+)/(?P<media>.+)");
+
+// smtp2go-0.1.4: r"^api-[a-zA-Z0-9]{32}$"
+consistent!(smtp2go_0, r"^api-[a-zA-Z0-9]{32}$");
+
+// pusher-0.3.1: r"^[-a-zA-Z0-9_=@,.;]+$"
+consistent!(pusher_0, r"^[-a-zA-Z0-9_=@,.;]+$");
+
+// pusher-0.3.1: r"\A\d+\.\d+\z"
+consistent!(pusher_1, r"\A\d+\.\d+\z");
+
+// bakervm-0.9.0: r"^\.(.+?) +?(.+)$"
+consistent!(bakervm_0, r"^\.(.+?) +?(.+)$");
+
+// bakervm-0.9.0: r"^\.([^\s]+)$"
+consistent!(bakervm_1, r"^\.([^\s]+)$");
+
+// bakervm-0.9.0: r"^include! +([^\s]+)$"
+consistent!(bakervm_2, r"^include! +([^\s]+)$");
+
+// bakervm-0.9.0: r"^@(\d+)$"
+consistent!(bakervm_3, r"^@(\d+)$");
+
+// bakervm-0.9.0: r"^true|false$"
+consistent!(bakervm_4, r"^true|false$");
+
+// bakervm-0.9.0: r"^(-?\d+)?\.[0-9]+$"
+consistent!(bakervm_5, r"^(-?\d+)?\.[0-9]+$");
+
+// bakervm-0.9.0: r"^(-?\d+)?$"
+consistent!(bakervm_6, r"^(-?\d+)?$");
+
+// bakervm-0.9.0: r"^#([0-9abcdefABCDEF]{6})$"
+consistent!(bakervm_7, r"^#([0-9abcdefABCDEF]{6})$");
+
+// bakervm-0.9.0: r"^'(.)'$"
+consistent!(bakervm_8, r"^'(.)'$");
+
+// bakervm-0.9.0: r"^\$vi\((\d+)\)$"
+consistent!(bakervm_9, r"^\$vi\((\d+)\)$");
+
+// bakervm-0.9.0: r"^\$key\((\d+)\)$"
+consistent!(bakervm_10, r"^\$key\((\d+)\)$");
+
+// banana-0.0.2: "(?P<type>[A-Z^']+) (?P<route>[^']+) HTTP/(?P<http>[^']+)"
+consistent!(
+ banana_0,
+ "(?P<type>[A-Z^']+) (?P<route>[^']+) HTTP/(?P<http>[^']+)"
+);
+
+// serial-key-2.0.0: r"[A-F0-9]{8}"
+consistent!(serial_key_0, r"[A-F0-9]{8}");
+
+// serde-hjson-0.8.1: "[\\\\\"\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"
+consistent!(serde_hjson_0, "[\\\\\"\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]");
+
+// serde-hjson-0.8.1: "[\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"
+consistent!(serde_hjson_1, "[\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]");
+
+// serde-hjson-0.8.1: "'''|[\x00-\x09\x0b\x0c\x0e-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"
+consistent!(serde_hjson_2, "'''|[\x00-\x09\x0b\x0c\x0e-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]");
+
+// serde-odbc-0.1.0: r"/todos/(?P<id>\d+)"
+consistent!(serde_odbc_0, r"/todos/(?P<id>\d+)");
+
+// sentry-0.6.0: r"^(?:_<)?([a-zA-Z0-9_]+?)(?:\.\.|::)"
+consistent!(sentry_0, r"^(?:_<)?([a-zA-Z0-9_]+?)(?:\.\.|::)");
+
+// sentiment-0.1.1: r"[^a-zA-Z0 -]+"
+consistent!(sentiment_0, r"[^a-zA-Z0 -]+");
+
+// sentiment-0.1.1: r" {2,}"
+consistent!(sentiment_1, r" {2,}");
+
+// verilog-0.0.1: r"(?m)//.*"
+consistent!(verilog_0, r"(?m)//.*");
+
+// verex-0.2.2: "(?P<robot>C3PO)"
+consistent!(verex_0, "(?P<robot>C3PO)");
+
+// handlebars-0.32.4: ">|<|\"|&"
+consistent!(handlebars_0, ">|<|\"|&");
+
+// haikunator-0.1.2: r"^\w+-\w+-[0123456789]{4}$"
+consistent!(haikunator_0, r"^\w+-\w+-[0123456789]{4}$");
+
+// haikunator-0.1.2: r"^\w+@\w+@[0123456789]{4}$"
+consistent!(haikunator_1, r"^\w+@\w+@[0123456789]{4}$");
+
+// haikunator-0.1.2: r"^\w+-\w+-[0123456789abcdef]{4}$"
+consistent!(haikunator_2, r"^\w+-\w+-[0123456789abcdef]{4}$");
+
+// haikunator-0.1.2: r"^\w+-\w+-[0123456789忠犬ハチ公]{10}$"
+consistent!(haikunator_3, r"^\w+-\w+-[0123456789忠犬ハチ公]{10}$");
+
+// haikunator-0.1.2: r"^\w+-\w+$"
+consistent!(haikunator_4, r"^\w+-\w+$");
+
+// haikunator-0.1.2: r"^\w+-\w+-[foo]{4}$"
+consistent!(haikunator_5, r"^\w+-\w+-[foo]{4}$");
+
+// haikunator-0.1.2: r"^\w+-\w+-[0123456789忠犬ハチ公]{5}$"
+consistent!(haikunator_6, r"^\w+-\w+-[0123456789忠犬ハチ公]{5}$");
+
+// bobbin-cli-0.8.3: r"(.*)"
+consistent!(bobbin_cli_0, r"(.*)");
+
+// bobbin-cli-0.8.3: r"rustc (.*)"
+consistent!(bobbin_cli_1, r"rustc (.*)");
+
+// bobbin-cli-0.8.3: r"cargo (.*)"
+consistent!(bobbin_cli_2, r"cargo (.*)");
+
+// bobbin-cli-0.8.3: r"xargo (.*)\n"
+consistent!(bobbin_cli_3, r"xargo (.*)\n");
+
+// bobbin-cli-0.8.3: r"Open On-Chip Debugger (.*)"
+consistent!(bobbin_cli_4, r"Open On-Chip Debugger (.*)");
+
+// bobbin-cli-0.8.3: r"arm-none-eabi-gcc \(GNU Tools for ARM Embedded Processors[^\)]*\) (.*)"
+consistent!(
+ bobbin_cli_5,
+ r"arm-none-eabi-gcc \(GNU Tools for ARM Embedded Processors[^\)]*\) (.*)"
+);
+
+// bobbin-cli-0.8.3: r"(?m).*\nBasic Open Source SAM-BA Application \(BOSSA\) Version (.*)\n"
+consistent!(
+ bobbin_cli_6,
+ r"(?m).*\nBasic Open Source SAM-BA Application \(BOSSA\) Version (.*)\n"
+);
+
+// bobbin-cli-0.8.3: r"(?m)SEGGER J-Link Commander (.*)\n"
+consistent!(bobbin_cli_7, r"(?m)SEGGER J-Link Commander (.*)\n");
+
+// bobbin-cli-0.8.3: r"(?m)Teensy Loader, Command Line, Version (.*)\n"
+consistent!(bobbin_cli_8, r"(?m)Teensy Loader, Command Line, Version (.*)\n");
+
+// bobbin-cli-0.8.3: r"dfu-util (.*)\n"
+consistent!(bobbin_cli_9, r"dfu-util (.*)\n");
+
+// borsholder-0.9.1: r"^/static/[\w.]+$"
+consistent!(borsholder_0, r"^/static/[\w.]+$");
+
+// borsholder-0.9.1: r"^/timeline/([0-9]+)$"
+consistent!(borsholder_1, r"^/timeline/([0-9]+)$");
+
+// fblog-1.0.1: "\u{001B}\\[[\\d;]*[^\\d;]"
+consistent!(fblog_0, "\u{001B}\\[[\\d;]*[^\\d;]");
+
+// fblog-1.0.1: "\u{001B}\\[[\\d;]*[^\\d;]"
+consistent!(fblog_1, "\u{001B}\\[[\\d;]*[^\\d;]");
+
+// toml-query-0.6.0: r"^\[\d+\]$"
+consistent!(toml_query_0, r"^\[\d+\]$");
+
+// todo-txt-1.1.0: r" (?P<key>[^\s]+):(?P<value>[^\s^/]+)"
+consistent!(todo_txt_0, r" (?P<key>[^\s]+):(?P<value>[^\s^/]+)");
+
+// findr-0.1.5: r"\band\b"
+consistent!(findr_0, r"\band\b");
+
+// findr-0.1.5: r"\bor\b"
+consistent!(findr_1, r"\bor\b");
+
+// findr-0.1.5: r"\bnot\b"
+consistent!(findr_2, r"\bnot\b");
+
+// file-sniffer-3.0.1: r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"
+consistent!(file_sniffer_0, r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$");
+
+// file-sniffer-3.0.1: r".*?\.(stats|conf|h|cache.*|dat|pc|info)$"
+consistent!(file_sniffer_1, r".*?\.(stats|conf|h|cache.*|dat|pc|info)$");
+
+// file-sniffer-3.0.1: r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"
+consistent!(file_sniffer_2, r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$");
+
+// file-sniffer-3.0.1: r".*?\.(stats|conf|h|cache.*)$"
+consistent!(file_sniffer_3, r".*?\.(stats|conf|h|cache.*)$");
+
+// file-sniffer-3.0.1: r"(\.git|\.pijul|_darcs|\.hg)$"
+consistent!(file_sniffer_4, r"(\.git|\.pijul|_darcs|\.hg)$");
+
+// file_logger-0.1.0: "test"
+consistent!(file_logger_0, "test");
+
+// file_scanner-0.2.0: r"foo"
+consistent!(file_scanner_0, r"foo");
+
+// file_scanner-0.2.0: r"a+b"
+consistent!(file_scanner_1, r"a+b");
+
+// file_scanner-0.2.0: r"a[ab]*b"
+consistent!(file_scanner_2, r"a[ab]*b");
+
+// file_scanner-0.2.0: r"\s+"
+consistent!(file_scanner_3, r"\s+");
+
+// file_scanner-0.2.0: r"\s+"
+consistent!(file_scanner_4, r"\s+");
+
+// cellsplit-0.2.1: r"^\s*([^\s]+) %cellsplit<\d+>$"
+consistent!(cellsplit_0, r"^\s*([^\s]+) %cellsplit<\d+>$");
+
+// cellsplit-0.2.1: r"^\s*([^\s]+) %cellsplit<\d+>$"
+consistent!(cellsplit_1, r"^\s*([^\s]+) %cellsplit<\d+>$");
+
+// aterm-0.20.0: r"^[+\-]?[0-9]+"
+consistent!(aterm_0, r"^[+\-]?[0-9]+");
+
+// aterm-0.20.0: r"^[+\-]?[0-9]+\.[0-9]*([eE][+\-]?[0-9]+)?"
+consistent!(aterm_1, r"^[+\-]?[0-9]+\.[0-9]*([eE][+\-]?[0-9]+)?");
+
+// atarashii_imap-0.3.0: r"^[*] OK"
+consistent!(atarashii_imap_0, r"^[*] OK");
+
+// atarashii_imap-0.3.0: r"FLAGS\s\((.+)\)"
+consistent!(atarashii_imap_1, r"FLAGS\s\((.+)\)");
+
+// atarashii_imap-0.3.0: r"\[PERMANENTFLAGS\s\((.+)\)\]"
+consistent!(atarashii_imap_2, r"\[PERMANENTFLAGS\s\((.+)\)\]");
+
+// atarashii_imap-0.3.0: r"\[UIDVALIDITY\s(\d+)\]"
+consistent!(atarashii_imap_3, r"\[UIDVALIDITY\s(\d+)\]");
+
+// atarashii_imap-0.3.0: r"(\d+)\sEXISTS"
+consistent!(atarashii_imap_4, r"(\d+)\sEXISTS");
+
+// atarashii_imap-0.3.0: r"(\d+)\sRECENT"
+consistent!(atarashii_imap_5, r"(\d+)\sRECENT");
+
+// atarashii_imap-0.3.0: r"\[UNSEEN\s(\d+)\]"
+consistent!(atarashii_imap_6, r"\[UNSEEN\s(\d+)\]");
+
+// atarashii_imap-0.3.0: r"\[UIDNEXT\s(\d+)\]"
+consistent!(atarashii_imap_7, r"\[UIDNEXT\s(\d+)\]");
+
+// editorconfig-1.0.0: r"\\(\{|\})"
+consistent!(editorconfig_0, r"\\(\{|\})");
+
+// editorconfig-1.0.0: r"(^|[^\\])\\\|"
+consistent!(editorconfig_1, r"(^|[^\\])\\\|");
+
+// editorconfig-1.0.0: r"\[([^\]]*)$"
+consistent!(editorconfig_2, r"\[([^\]]*)$");
+
+// editorconfig-1.0.0: r"\[(.*/.*)\]"
+consistent!(editorconfig_3, r"\[(.*/.*)\]");
+
+// editorconfig-1.0.0: r"\{(-?\d+\\\.\\\.-?\d+)\}"
+consistent!(editorconfig_4, r"\{(-?\d+\\\.\\\.-?\d+)\}");
+
+// editorconfig-1.0.0: r"\{([^,]+)\}"
+consistent!(editorconfig_5, r"\{([^,]+)\}");
+
+// editorconfig-1.0.0: r"\{(([^\}].*)?(,|\|)(.*[^\\])?)\}"
+consistent!(editorconfig_6, r"\{(([^\}].*)?(,|\|)(.*[^\\])?)\}");
+
+// editorconfig-1.0.0: r"^/"
+consistent!(editorconfig_7, r"^/");
+
+// editorconfig-1.0.0: r"(^|[^\\])(\{|\})"
+consistent!(editorconfig_8, r"(^|[^\\])(\{|\})");
+
+// edmunge-1.0.0: "^#!.*\n"
+consistent!(edmunge_0, "^#!.*\n");
+
+// unicode_names2_macros-0.2.0: r"\\N\{(.*?)(?:\}|$)"
+consistent!(unicode_names2_macros_0, r"\\N\{(.*?)(?:\}|$)");
+
+// unidiff-0.2.1: r"^--- (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?"
+consistent!(
+ unidiff_0,
+ r"^--- (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?"
+);
+
+// unidiff-0.2.1: r"^\+\+\+ (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?"
+consistent!(
+ unidiff_1,
+ r"^\+\+\+ (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?"
+);
+
+// unidiff-0.2.1: r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)"
+consistent!(unidiff_2, r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)");
+
+// unidiff-0.2.1: r"^(?P<line_type>[- \n\+\\]?)(?P<value>.*)"
+consistent!(unidiff_3, r"^(?P<line_type>[- \n\+\\]?)(?P<value>.*)");
+
+// slippy-map-tiles-0.13.1: "/?(?P<zoom>[0-9]?[0-9])/(?P<x>[0-9]{1,10})/(?P<y>[0-9]{1,10})(\\.[a-zA-Z]{3,4})?$"
+consistent!(slippy_map_tiles_0, "/?(?P<zoom>[0-9]?[0-9])/(?P<x>[0-9]{1,10})/(?P<y>[0-9]{1,10})(\\.[a-zA-Z]{3,4})?$");
+
+// slippy-map-tiles-0.13.1: r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$"
+consistent!(slippy_map_tiles_1, r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$");
+
+// slippy-map-tiles-0.13.1: r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$"
+consistent!(slippy_map_tiles_2, r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$");
+
+// sonos-0.1.2: r"^https?://(.+?):1400/xml"
+consistent!(sonos_0, r"^https?://(.+?):1400/xml");
+
+// validator_derive-0.7.0: r"^[a-z]{2}$"
+consistent!(validator_derive_0, r"^[a-z]{2}$");
+
+// validator_derive-0.7.0: r"[a-z]{2}"
+consistent!(validator_derive_1, r"[a-z]{2}");
+
+// validator_derive-0.7.0: r"[a-z]{2}"
+consistent!(validator_derive_2, r"[a-z]{2}");
+
+// nginx-config-0.8.0: r"one of \d+ options"
+consistent!(nginx_config_0, r"one of \d+ options");
+
+// waltz-0.4.0: r"[\s,]"
+consistent!(waltz_0, r"[\s,]");
+
+// warheadhateus-0.2.1: r"^aws_access_key_id = (.*)"
+consistent!(warheadhateus_0, r"^aws_access_key_id = (.*)");
+
+// warheadhateus-0.2.1: r"^aws_secret_access_key = (.*)"
+consistent!(warheadhateus_1, r"^aws_secret_access_key = (.*)");
+
+// warheadhateus-0.2.1: r"^aws_access_key_id = (.*)"
+consistent!(warheadhateus_2, r"^aws_access_key_id = (.*)");
+
+// warheadhateus-0.2.1: r"^aws_secret_access_key = (.*)"
+consistent!(warheadhateus_3, r"^aws_secret_access_key = (.*)");
+
+// jieba-rs-0.2.2: r"([\u{4E00}-\u{9FD5}a-zA-Z0-9+#&\._%]+)"
+consistent!(jieba_rs_0, r"([\u{4E00}-\u{9FD5}a-zA-Z0-9+#&\._%]+)");
+
+// jieba-rs-0.2.2: r"(\r\n|\s)"
+consistent!(jieba_rs_1, r"(\r\n|\s)");
+
+// jieba-rs-0.2.2: "([\u{4E00}-\u{9FD5}]+)"
+consistent!(jieba_rs_2, "([\u{4E00}-\u{9FD5}]+)");
+
+// jieba-rs-0.2.2: r"[^a-zA-Z0-9+#\n]"
+consistent!(jieba_rs_3, r"[^a-zA-Z0-9+#\n]");
+
+// jieba-rs-0.2.2: r"([\u{4E00}-\u{9FD5}]+)"
+consistent!(jieba_rs_4, r"([\u{4E00}-\u{9FD5}]+)");
+
+// jieba-rs-0.2.2: r"([a-zA-Z0-9]+(?:.\d+)?%?)"
+consistent!(jieba_rs_5, r"([a-zA-Z0-9]+(?:.\d+)?%?)");
+
+// lalrpop-0.15.2: r"Span\([0-9 ,]*\)"
+consistent!(lalrpop_0, r"Span\([0-9 ,]*\)");
+
+// lalrpop-snap-0.15.2: r"Span\([0-9 ,]*\)"
+consistent!(lalrpop_snap_0, r"Span\([0-9 ,]*\)");
+
+// nlp-tokenize-0.1.0: r"[\S]+"
+consistent!(nlp_tokenize_0, r"[\S]+");
+
+// kbgpg-0.1.2: "[[:xdigit:]][70]"
+consistent!(kbgpg_0, "[[:xdigit:]][70]");
+
+// cdbd-0.1.1: r"^((?P<address>.*):)?(?P<port>\d+)$"
+consistent!(cdbd_0, r"^((?P<address>.*):)?(?P<port>\d+)$");
+
+// mbutiles-0.1.1: r"[\w\s=+-/]+\((\{(.|\n)*\})\);?"
+consistent!(mbutiles_0, r"[\w\s=+-/]+\((\{(.|\n)*\})\);?");
+
+// extrahop-0.2.5: r"^-\d+(?:ms|s|m|h|d|w|y)?$"
+consistent!(extrahop_0, r"^-\d+(?:ms|s|m|h|d|w|y)?$");
+
+// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$"
+consistent!(pippin_0, "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$");
+
+// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$"
+consistent!(
+ pippin_1,
+ "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$"
+);
+
+// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$"
+consistent!(pippin_2, "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$");
+
+// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$"
+consistent!(
+ pippin_3,
+ "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$"
+);
+
+// pippin-0.1.0: "^.*pn(0|[1-9][0-9]*)(-ss(0|[1-9][0-9]*)(\\.pip|-cl(0|[1-9][0-9]*)\\.piplog))?$"
+consistent!(pippin_4, "^.*pn(0|[1-9][0-9]*)(-ss(0|[1-9][0-9]*)(\\.pip|-cl(0|[1-9][0-9]*)\\.piplog))?$");
+
+// pippin-0.1.0: "^(.*)-ss(?:0|[1-9][0-9]*)(?:\\.pip|-cl(?:0|[1-9][0-9]*)\\.piplog)$"
+consistent!(
+ pippin_5,
+ "^(.*)-ss(?:0|[1-9][0-9]*)(?:\\.pip|-cl(?:0|[1-9][0-9]*)\\.piplog)$"
+);
+
+// pinyin-0.3.0: r"(?i)[āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň]"
+consistent!(
+ pinyin_0,
+ r"(?i)[āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň]"
+);
+
+// pinyin-0.3.0: r"([aeoiuvnm])([0-4])$"
+consistent!(pinyin_1, r"([aeoiuvnm])([0-4])$");
+
+// duration-parser-0.2.0: r"(?P<value>\d+)(?P<units>[a-z])"
+consistent!(duration_parser_0, r"(?P<value>\d+)(?P<units>[a-z])");
+
+// dutree-0.2.7: r"^\d+\D?$"
+consistent!(dutree_0, r"^\d+\D?$");
+
+// djangohashers-0.3.0: r"^[A-Za-z0-9]*$"
+consistent!(djangohashers_0, r"^[A-Za-z0-9]*$");
+
+// rtag-0.3.5: r"^[A-Z][A-Z0-9]{2,}$"
+consistent!(rtag_0, r"^[A-Z][A-Z0-9]{2,}$");
+
+// rtag-0.3.5: r"^http://www\.emusic\.com"
+consistent!(rtag_1, r"^http://www\.emusic\.com");
+
+// rtag-0.3.5: r"^[A-Z][A-Z0-9]{2,}"
+consistent!(rtag_2, r"^[A-Z][A-Z0-9]{2,}");
+
+// rtag-0.3.5: r"(^[\x{0}|\x{feff}|\x{fffe}]*|[\x{0}|\x{feff}|\x{fffe}]*$)"
+consistent!(
+ rtag_3,
+ r"(^[\x{0}|\x{feff}|\x{fffe}]*|[\x{0}|\x{feff}|\x{fffe}]*$)"
+);
+
+// rtow-0.1.0: r"(\d+)[xX](\d+)"
+consistent!(rtow_0, r"(\d+)[xX](\d+)");
+
+// pleingres-sql-plugin-0.1.0: r"\$([a-zA-Z0-9_]+)"
+consistent!(pleingres_sql_plugin_0, r"\$([a-zA-Z0-9_]+)");
+
+// dono-2.0.0: "[\\n]+"
+consistent!(dono_0, "[\\n]+");
+
+// dono-2.0.0: "(?m)^\\n"
+consistent!(dono_1, "(?m)^\\n");
+
+// dono-2.0.0: "(?m)^\\n"
+consistent!(dono_2, "(?m)^\\n");
+
+// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{43}=\.ed25519$"
+consistent!(ssb_common_0, r"^[0-9A-Za-z\+/]{43}=\.ed25519$");
+
+// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{86}==\.ed25519$"
+consistent!(ssb_common_1, r"^[0-9A-Za-z\+/]{86}==\.ed25519$");
+
+// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{43}=\.sha256$"
+consistent!(ssb_common_2, r"^[0-9A-Za-z\+/]{43}=\.sha256$");
+
+// mozversion-0.1.3: r"^(?P<major>\d+)\.(?P<minor>\d+)(?:\.(?P<patch>\d+))?(?:(?P<pre0>[a-z]+)(?P<pre1>\d*))?$"
+consistent!(mozversion_0, r"^(?P<major>\d+)\.(?P<minor>\d+)(?:\.(?P<patch>\d+))?(?:(?P<pre0>[a-z]+)(?P<pre1>\d*))?$");
+
+// monger-0.5.6: r"^(\d+)\.(\d+)$"
+consistent!(monger_0, r"^(\d+)\.(\d+)$");
+
+// mongo_rub-0.0.2: r"^[rv]2\.6"
+consistent!(mongo_rub_0, r"^[rv]2\.6");
+
+// flow-0.3.5: "body value"
+consistent!(flow_0, "body value");
+
+// flow-0.3.5: "start marker"
+consistent!(flow_1, "start marker");
+
+// flow-0.3.5: "end marker"
+consistent!(flow_2, "end marker");
+
+// flow-0.3.5: "body value"
+consistent!(flow_3, "body value");
+
+// vobsub-0.2.3: "^([A-Za-z/ ]+): (.*)"
+consistent!(vobsub_0, "^([A-Za-z/ ]+): (.*)");
+
+// voidmap-1.1.2: r"#([^\s=]+)*"
+consistent!(voidmap_0, r"#([^\s=]+)*");
+
+// voidmap-1.1.2: r"#(\S+)*"
+consistent!(voidmap_1, r"#(\S+)*");
+
+// voidmap-1.1.2: r"#prio=(\d+)"
+consistent!(voidmap_2, r"#prio=(\d+)");
+
+// voidmap-1.1.2: r"\[(\S+)\]"
+consistent!(voidmap_3, r"\[(\S+)\]");
+
+// voidmap-1.1.2: r"#limit=(\d+)"
+consistent!(voidmap_4, r"#limit=(\d+)");
+
+// voidmap-1.1.2: r"#tagged=(\S+)"
+consistent!(voidmap_5, r"#tagged=(\S+)");
+
+// voidmap-1.1.2: r"#rev\b"
+consistent!(voidmap_6, r"#rev\b");
+
+// voidmap-1.1.2: r"#done\b"
+consistent!(voidmap_7, r"#done\b");
+
+// voidmap-1.1.2: r"#open\b"
+consistent!(voidmap_8, r"#open\b");
+
+// voidmap-1.1.2: r"#since=(\S+)"
+consistent!(voidmap_9, r"#since=(\S+)");
+
+// voidmap-1.1.2: r"#until=(\S+)"
+consistent!(voidmap_10, r"#until=(\S+)");
+
+// voidmap-1.1.2: r"#plot=(\S+)"
+consistent!(voidmap_11, r"#plot=(\S+)");
+
+// voidmap-1.1.2: r"#n=(\d+)"
+consistent!(voidmap_12, r"#n=(\d+)");
+
+// voidmap-1.1.2: r"(\S+)"
+consistent!(voidmap_13, r"(\S+)");
+
+// voidmap-1.1.2: r"(?P<y>\d+)y"
+consistent!(voidmap_14, r"(?P<y>\d+)y");
+
+// voidmap-1.1.2: r"(?P<m>\d+)m"
+consistent!(voidmap_15, r"(?P<m>\d+)m");
+
+// voidmap-1.1.2: r"(?P<w>\d+)w"
+consistent!(voidmap_16, r"(?P<w>\d+)w");
+
+// voidmap-1.1.2: r"(?P<d>\d+)d"
+consistent!(voidmap_17, r"(?P<d>\d+)d");
+
+// voidmap-1.1.2: r"(?P<h>\d+)h"
+consistent!(voidmap_18, r"(?P<h>\d+)h");
+
+// voidmap-1.1.2: r"C-(.)"
+consistent!(voidmap_19, r"C-(.)");
+
+// qt_generator-0.2.0: r"^\.\./qt[^/]+/"
+consistent!(qt_generator_0, r"^\.\./qt[^/]+/");
+
+// qt_generator-0.2.0: "(href|src)=\"([^\"]*)\""
+consistent!(qt_generator_1, "(href|src)=\"([^\"]*)\"");
+
+// kryptos-0.6.1: r"[01]{5}"
+consistent!(kryptos_0, r"[01]{5}");
+
+// cifar_10_loader-0.2.0: "data_batch_[1-5].bin"
+consistent!(cifar_10_loader_0, "data_batch_[1-5].bin");
+
+// cifar_10_loader-0.2.0: "test_batch.bin"
+consistent!(cifar_10_loader_1, "test_batch.bin");
+
+// circadian-0.6.0: r"^\d+.\d+s$"
+consistent!(circadian_0, r"^\d+.\d+s$");
+
+// circadian-0.6.0: r"^\d+:\d+$"
+consistent!(circadian_1, r"^\d+:\d+$");
+
+// circadian-0.6.0: r"^\d+:\d+m$"
+consistent!(circadian_2, r"^\d+:\d+m$");
+
+// cicada-0.8.1: r"!!"
+consistent!(cicada_0, r"!!");
+
+// cicada-0.8.1: r"^([^`]*)`([^`]+)`(.*)$"
+consistent!(cicada_1, r"^([^`]*)`([^`]+)`(.*)$");
+
+// cicada-0.8.1: r"\*+"
+consistent!(cicada_2, r"\*+");
+
+// cicada-0.8.1: r"([^\$]*)\$\{?([A-Za-z0-9\?\$_]+)\}?(.*)"
+consistent!(cicada_3, r"([^\$]*)\$\{?([A-Za-z0-9\?\$_]+)\}?(.*)");
+
+// cicada-0.8.1: r"^ *alias +([a-zA-Z0-9_\.-]+)=(.*)$"
+consistent!(cicada_4, r"^ *alias +([a-zA-Z0-9_\.-]+)=(.*)$");
+
+// vterm-sys-0.1.0: r"hi"
+consistent!(vterm_sys_0, r"hi");
+
+// skim-0.5.0: r".*?\t"
+consistent!(skim_0, r".*?\t");
+
+// skim-0.5.0: r".*?[\t ]"
+consistent!(skim_1, r".*?[\t ]");
+
+// skim-0.5.0: r"(\{-?[0-9.,q]*?})"
+consistent!(skim_2, r"(\{-?[0-9.,q]*?})");
+
+// skim-0.5.0: r"[ \t\n]+"
+consistent!(skim_3, r"[ \t\n]+");
+
+// skim-0.5.0: r"[ \t\n]+"
+consistent!(skim_4, r"[ \t\n]+");
+
+// skim-0.5.0: r"([^ |]+( +\| +[^ |]*)+)|( +)"
+consistent!(skim_5, r"([^ |]+( +\| +[^ |]*)+)|( +)");
+
+// skim-0.5.0: r" +\| +"
+consistent!(skim_6, r" +\| +");
+
+// skim-0.5.0: r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$"
+consistent!(skim_7, r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$");
+
+// skim-0.5.0: ","
+consistent!(skim_8, ",");
+
+// skim-0.5.0: ".*?,"
+consistent!(skim_9, ".*?,");
+
+// skim-0.5.0: ".*?,"
+consistent!(skim_10, ".*?,");
+
+// skim-0.5.0: ","
+consistent!(skim_11, ",");
+
+// skim-0.5.0: r"\x1B\[(?:([0-9]+;[0-9]+[Hf])|([0-9]+[ABCD])|(s|u|2J|K)|([0-9;]*m)|(=[0-9]+[hI]))"
+consistent!(skim_12, r"\x1B\[(?:([0-9]+;[0-9]+[Hf])|([0-9]+[ABCD])|(s|u|2J|K)|([0-9;]*m)|(=[0-9]+[hI]))");
+
+// egg-mode-text-1.14.7: r"[-_./]\z"
+consistent!(egg_mode_text_0, r"[-_./]\z");
+
+// java-properties-1.1.1: "^[ \t\r\n\x0c]*[#!]"
+consistent!(java_properties_0, "^[ \t\r\n\x0c]*[#!]");
+
+// java-properties-1.1.1: r"^[ \t\x0c]*[#!][^\r\n]*$"
+consistent!(java_properties_1, r"^[ \t\x0c]*[#!][^\r\n]*$");
+
+// java-properties-1.1.1: r"^([ \t\x0c]*[:=][ \t\x0c]*|[ \t\x0c]+)$"
+consistent!(java_properties_2, r"^([ \t\x0c]*[:=][ \t\x0c]*|[ \t\x0c]+)$");
+
+// ipaddress-0.1.2: r":.+\."
+consistent!(ipaddress_0, r":.+\.");
+
+// ipaddress-0.1.2: r"\."
+consistent!(ipaddress_1, r"\.");
+
+// ipaddress-0.1.2: r":"
+consistent!(ipaddress_2, r":");
+
+// iptables-0.2.2: r"v(\d+)\.(\d+)\.(\d+)"
+consistent!(iptables_0, r"v(\d+)\.(\d+)\.(\d+)");
+
+// rsure-0.8.1: r"^([^-]+)-(.*)\.dat\.gz$"
+consistent!(rsure_0, r"^([^-]+)-(.*)\.dat\.gz$");
+
+// rs-jsonpath-0.1.0: "^(.*?)(<=|<|==|>=|>)(.*?)$"
+consistent!(rs_jsonpath_0, "^(.*?)(<=|<|==|>=|>)(.*?)$");
+
+// oatie-0.3.0: r"(\n|^)(\w+):([\n\w\W]+?)(\n(?:\w)|(\n\]))"
+consistent!(oatie_0, r"(\n|^)(\w+):([\n\w\W]+?)(\n(?:\w)|(\n\]))");
+
+// weld-0.2.0: "#.*$"
+consistent!(weld_0, "#.*$");
+
+// weld-0.2.0: r"^[A-Za-z$_][A-Za-z0-9$_]*$"
+consistent!(weld_1, r"^[A-Za-z$_][A-Za-z0-9$_]*$");
+
+// weld-0.2.0: r"^[0-9]+[cC]$"
+consistent!(weld_2, r"^[0-9]+[cC]$");
+
+// weld-0.2.0: r"^0b[0-1]+[cC]$"
+consistent!(weld_3, r"^0b[0-1]+[cC]$");
+
+// weld-0.2.0: r"^0x[0-9a-fA-F]+[cC]$"
+consistent!(weld_4, r"^0x[0-9a-fA-F]+[cC]$");
+
+// weld-0.2.0: r"^[0-9]+$"
+consistent!(weld_5, r"^[0-9]+$");
+
+// weld-0.2.0: r"^0b[0-1]+$"
+consistent!(weld_6, r"^0b[0-1]+$");
+
+// weld-0.2.0: r"^0x[0-9a-fA-F]+$"
+consistent!(weld_7, r"^0x[0-9a-fA-F]+$");
+
+// weld-0.2.0: r"^[0-9]+[lL]$"
+consistent!(weld_8, r"^[0-9]+[lL]$");
+
+// weld-0.2.0: r"^0b[0-1]+[lL]$"
+consistent!(weld_9, r"^0b[0-1]+[lL]$");
+
+// weld-0.2.0: r"^0x[0-9a-fA-F]+[lL]$"
+consistent!(weld_10, r"^0x[0-9a-fA-F]+[lL]$");
+
+// webgl_generator-0.1.0: "([(, ])enum\\b"
+consistent!(webgl_generator_0, "([(, ])enum\\b");
+
+// webgl_generator-0.1.0: "\\bAcquireResourcesCallback\\b"
+consistent!(webgl_generator_1, "\\bAcquireResourcesCallback\\b");
+
+// weave-0.2.0: r"^(\d+)(,(\d+))?([acd]).*$"
+consistent!(weave_0, r"^(\d+)(,(\d+))?([acd]).*$");
+
+// wemo-0.0.12: r"<BinaryState>(\d)(\|-?\d+)*</BinaryState>"
+consistent!(wemo_0, r"<BinaryState>(\d)(\|-?\d+)*</BinaryState>");
+
+// webscale-0.9.4: r"(http[s]?://[^\s]+)"
+consistent!(webscale_0, r"(http[s]?://[^\s]+)");
+
+// svgrep-1.1.0: r"^\d+.*$"
+consistent!(svgrep_0, r"^\d+.*$");
+
+// ignore-0.4.2: r"^[\pL\pN]+$"
+consistent!(ignore_0, r"^[\pL\pN]+$");
+
+// ommui_string_patterns-0.1.2: r"^([A-Za-z][0-9A-Za-z_]*)?$"
+consistent!(ommui_string_patterns_0, r"^([A-Za-z][0-9A-Za-z_]*)?$");
+
+// ommui_string_patterns-0.1.2: r"^(\S+(?:.*\S)?)?$"
+consistent!(ommui_string_patterns_1, r"^(\S+(?:.*\S)?)?$");
+
+// opcua-types-0.3.0: "^(?P<min>[0-9]{1,10})(:(?P<max>[0-9]{1,10}))?$"
+consistent!(opcua_types_0, "^(?P<min>[0-9]{1,10})(:(?P<max>[0-9]{1,10}))?$");
+
+// opcua-types-0.3.0: r"^(ns=(?P<ns>[0-9]+);)?(?P<t>[isgb])=(?P<v>.+)$"
+consistent!(opcua_types_1, r"^(ns=(?P<ns>[0-9]+);)?(?P<t>[isgb])=(?P<v>.+)$");
+
+// open_read_later-1.1.1: r"^(.+?)\s*:\s*(.+)$"
+consistent!(open_read_later_0, r"^(.+?)\s*:\s*(.+)$");
+
+// youtube-downloader-0.1.0: r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*"
+consistent!(youtube_downloader_0, r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*");
+
+// yobot-0.1.1: "."
+consistent!(yobot_0, ".");
+
+// yobot-0.1.1: r"."
+consistent!(yobot_1, r".");
+
+// yobot-0.1.1: r".+"
+consistent!(yobot_2, r".+");
+
+// yobot-0.1.1: r"."
+consistent!(yobot_3, r".");
+
+// ubiquity-0.1.5: r"foo"
+consistent!(ubiquity_0, r"foo");
+
+// ubiquity-0.1.5: r"/target/"
+consistent!(ubiquity_1, r"/target/");
+
+// ubiquity-0.1.5: r".DS_Store"
+consistent!(ubiquity_2, r".DS_Store");
+
+// qasm-1.0.0: r"//.*"
+consistent!(qasm_0, r"//.*");
+
+// drill-0.3.5: r"\{\{ *([a-z\._]+) *\}\}"
+consistent!(drill_0, r"\{\{ *([a-z\._]+) *\}\}");
+
+// queryst-2.0.0: r"^([^\]\[]+)"
+consistent!(queryst_0, r"^([^\]\[]+)");
+
+// queryst-2.0.0: r"(\[[^\]\[]*\])"
+consistent!(queryst_1, r"(\[[^\]\[]*\])");
+
+// qui-vive-0.1.0: r"^/(\w+)$"
+consistent!(qui_vive_0, r"^/(\w+)$");
+
+// qui-vive-0.1.0: r"^/key$"
+consistent!(qui_vive_1, r"^/key$");
+
+// qui-vive-0.1.0: r"^/key/(\w+)$"
+consistent!(qui_vive_2, r"^/key/(\w+)$");
+
+// qui-vive-0.1.0: r"^/url$"
+consistent!(qui_vive_3, r"^/url$");
+
+// qui-vive-0.1.0: r"^/url/(\w+)$"
+consistent!(qui_vive_4, r"^/url/(\w+)$");
+
+// qui-vive-0.1.0: r"^/inv$"
+consistent!(qui_vive_5, r"^/inv$");
+
+// qui-vive-0.1.0: r"^/inv/(\w+)$"
+consistent!(qui_vive_6, r"^/inv/(\w+)$");
+
+// subdiff-0.1.0: r"\b"
+// consistent!(subdiff_0, r"\b");
+
+// substudy-0.4.5: r"^(\d+)/(\d+)$"
+consistent!(substudy_0, r"^(\d+)/(\d+)$");
+
+// substudy-0.4.5: r"\s+"
+consistent!(substudy_1, r"\s+");
+
+// substudy-0.4.5: r"<[a-z/][^>]*>"
+consistent!(substudy_2, r"<[a-z/][^>]*>");
+
+// substudy-0.4.5: r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)"
+consistent!(substudy_3, r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)");
+
+// substudy-0.4.5: r"\s+"
+consistent!(substudy_4, r"\s+");
+
+// isbnid-0.1.3: r"^(\d(-| )?){9}(x|X|\d|(\d(-| )?){3}\d)$"
+consistent!(isbnid_0, r"^(\d(-| )?){9}(x|X|\d|(\d(-| )?){3}\d)$");
+
+// isbnid-0.1.3: r"[^0-9X]"
+consistent!(isbnid_1, r"[^0-9X]");
+
+// ispc-0.3.5: r"Intel\(r\) SPMD Program Compiler \(ispc\), (\d+\.\d+\.\d+)"
+consistent!(
+ ispc_0,
+ r"Intel\(r\) SPMD Program Compiler \(ispc\), (\d+\.\d+\.\d+)"
+);
diff --git a/third_party/rust/regex/tests/crazy.rs b/third_party/rust/regex/tests/crazy.rs
new file mode 100644
index 0000000000..293ac1ae72
--- /dev/null
+++ b/third_party/rust/regex/tests/crazy.rs
@@ -0,0 +1,459 @@
+mat!(ascii_literal, r"a", "a", Some((0, 1)));
+
+// Realistic patterns (numeric ranges, floats, e-mail addresses, dates) from regular-expressions.info.
+mat!(
+ match_ranges,
+ r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
+ "num: 255",
+ Some((5, 8))
+);
+mat!(
+ match_ranges_not,
+ r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
+ "num: 256", // out of range; the \b anchors stop a shorter piece such as "25" from matching
+ None
+);
+mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3)));
+mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3)));
+mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4)));
+mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None);
+mat!(
+ match_email,
+ r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
+ "mine is jam.slam@gmail.com ",
+ Some((8, 26))
+);
+mat!(
+ match_email_not,
+ r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
+ "mine is jam.slam@gmail ", // no ".xx" suffix after the host, so nothing matches
+ None
+);
+mat!(
+ match_email_big,
+ r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
+ "mine is jam.slam@gmail.com ",
+ Some((8, 26))
+);
+mat!(
+ match_date1,
+ r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
+ "1900-01-01",
+ Some((0, 10))
+);
+mat!(
+ match_date2,
+ r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
+ "1900-00-01", // month 00 is not accepted by (0[1-9]|1[012])
+ None
+);
+mat!(
+ match_date3,
+ r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
+ "1900-13-01", // month 13 is not accepted by (0[1-9]|1[012])
+ None
+);
+
+// Combine, repeat and reorder the ^ and $ anchors; every expected match is empty (start == end).
+matiter!(match_start_end_empty, r"^$", "", (0, 0));
+matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0));
+matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0));
+matiter!(match_start_end_empty_rev, r"$^", "", (0, 0));
+matiter!(
+ match_start_end_empty_rep,
+ r"(?:^$)*",
+ "a\nb\nc",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5)
+);
+matiter!(
+ match_start_end_empty_rep_rev,
+ r"(?:$^)*",
+ "a\nb\nc",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5)
+);
+
+// Test negated character classes, including ones that nest POSIX classes ([:space:], [:alpha:]).
+mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3)));
+mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3)));
+mat!(negclass_letter_space, r"[^a[:space:]]", "a x", Some((2, 3)));
+mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3)));
+mat!(negclass_space, r"[^[:space:]]", " a", Some((1, 2)));
+mat!(negclass_space_comma, r"[^,[:space:]]", ", a", Some((2, 3)));
+mat!(negclass_comma_space, r"[^[:space:],]", " ,a", Some((2, 3)));
+mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
+
+// Test that repeating a sub-expression that can match empty (lazily or greedily) doesn't loop forever.
+mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2)));
+mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2)));
+mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2)));
+mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2)));
+mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2)));
+mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2)));
+mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2)));
+mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2)));
+mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2)));
+mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2)));
+mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2)));
+mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2)));
+
+// Test that we handle various flavors of empty expressions (matiter! presumably checks all matches, in order).
+matiter!(match_empty1, r"", "", (0, 0));
+matiter!(match_empty2, r"", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty3, r"()", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty4, r"()*", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty5, r"()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty6, r"()?", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty7, r"()()", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty8, r"()+|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty9, r"z|()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty10, r"()+|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3));
+matiter!(match_empty12, r"|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty13, r"b|", "abc", (0, 0), (1, 2), (3, 3));
+matiter!(match_empty14, r"|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty15, r"z|", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty16, r"|", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty17, r"||", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty18, r"||z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty19, r"(?:)|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty20, r"b|(?:)", "abc", (0, 0), (1, 2), (3, 3));
+matiter!(match_empty21, r"(?:|)", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty22, r"(?:|)|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
+matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2));
+
+// Test that the DFA can handle pathological cases: ~100k random binary digits
+// searched with `[01]*1[01]{20}$`. (This should result in the DFA's cache being
+// flushed too frequently, which should cause it to quit and fall back to the NFA.)
+#[test]
+fn dfa_handles_pathological_case() {
+ fn ones_and_zeroes(count: usize) -> String { // `count` random '0'/'1' characters
+ use rand::rngs::SmallRng;
+ use rand::{Rng, SeedableRng};
+
+ let mut rng = SmallRng::from_entropy(); // seeded from entropy, not a fixed seed
+ let mut s = String::new();
+ for _ in 0..count {
+ if rng.gen() {
+ s.push('1');
+ } else {
+ s.push('0');
+ }
+ }
+ s
+ }
+
+ let re = regex!(r"[01]*1[01]{20}$");
+ let text = {
+ let mut pieces = ones_and_zeroes(100_000);
+ pieces.push('1'); // puts a '1' exactly 21 chars from the end, so a match always exists
+ pieces.push_str(&ones_and_zeroes(20));
+ pieces
+ };
+ assert!(re.is_match(text!(&*text)));
+}
+
+#[test]
+fn nest_limit_makes_it_parse() {
+ use regex::RegexBuilder;
+ // Only checks that this big, deeply nested pattern builds under the builder's defaults (presumably the nest limit, per the test name).
+ RegexBuilder::new(
+ r#"(?-u)
+ 2(?:
+ [45]\d{3}|
+ 7(?:
+ 1[0-267]|
+ 2[0-289]|
+ 3[0-29]|
+ 4[01]|
+ 5[1-3]|
+ 6[013]|
+ 7[0178]|
+ 91
+ )|
+ 8(?:
+ 0[125]|
+ [139][1-6]|
+ 2[0157-9]|
+ 41|
+ 6[1-35]|
+ 7[1-5]|
+ 8[1-8]|
+ 90
+ )|
+ 9(?:
+ 0[0-2]|
+ 1[0-4]|
+ 2[568]|
+ 3[3-6]|
+ 5[5-7]|
+ 6[0167]|
+ 7[15]|
+ 8[0146-9]
+ )
+ )\d{4}|
+ 3(?:
+ 12?[5-7]\d{2}|
+ 0(?:
+ 2(?:
+ [025-79]\d|
+ [348]\d{1,2}
+ )|
+ 3(?:
+ [2-4]\d|
+ [56]\d?
+ )
+ )|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ [12]\d|
+ [35]\d{1,2}|
+ 4\d?
+ )
+ )|
+ 3(?:
+ 1\d{2}|
+ 2(?:
+ [2356]\d|
+ 4\d{1,2}
+ )
+ )|
+ 4(?:
+ 1\d{2}|
+ 2(?:
+ 2\d{1,2}|
+ [47]|
+ 5\d{2}
+ )
+ )|
+ 5(?:
+ 1\d{2}|
+ 29
+ )|
+ [67]1\d{2}|
+ 8(?:
+ 1\d{2}|
+ 2(?:
+ 2\d{2}|
+ 3|
+ 4\d
+ )
+ )
+ )\d{3}|
+ 4(?:
+ 0(?:
+ 2(?:
+ [09]\d|
+ 7
+ )|
+ 33\d{2}
+ )|
+ 1\d{3}|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ [25]\d?|
+ [348]\d|
+ [67]\d{1,2}
+ )
+ )|
+ 3(?:
+ 1\d{2}(?:
+ \d{2}
+ )?|
+ 2(?:
+ [045]\d|
+ [236-9]\d{1,2}
+ )|
+ 32\d{2}
+ )|
+ 4(?:
+ [18]\d{2}|
+ 2(?:
+ [2-46]\d{2}|
+ 3
+ )|
+ 5[25]\d{2}
+ )|
+ 5(?:
+ 1\d{2}|
+ 2(?:
+ 3\d|
+ 5
+ )
+ )|
+ 6(?:
+ [18]\d{2}|
+ 2(?:
+ 3(?:
+ \d{2}
+ )?|
+ [46]\d{1,2}|
+ 5\d{2}|
+ 7\d
+ )|
+ 5(?:
+ 3\d?|
+ 4\d|
+ [57]\d{1,2}|
+ 6\d{2}|
+ 8
+ )
+ )|
+ 71\d{2}|
+ 8(?:
+ [18]\d{2}|
+ 23\d{2}|
+ 54\d{2}
+ )|
+ 9(?:
+ [18]\d{2}|
+ 2[2-5]\d{2}|
+ 53\d{1,2}
+ )
+ )\d{3}|
+ 5(?:
+ 02[03489]\d{2}|
+ 1\d{2}|
+ 2(?:
+ 1\d{2}|
+ 2(?:
+ 2(?:
+ \d{2}
+ )?|
+ [457]\d{2}
+ )
+ )|
+ 3(?:
+ 1\d{2}|
+ 2(?:
+ [37](?:
+ \d{2}
+ )?|
+ [569]\d{2}
+ )
+ )|
+ 4(?:
+ 1\d{2}|
+ 2[46]\d{2}
+ )|
+ 5(?:
+ 1\d{2}|
+ 26\d{1,2}
+ )|
+ 6(?:
+ [18]\d{2}|
+ 2|
+ 53\d{2}
+ )|
+ 7(?:
+ 1|
+ 24
+ )\d{2}|
+ 8(?:
+ 1|
+ 26
+ )\d{2}|
+ 91\d{2}
+ )\d{3}|
+ 6(?:
+ 0(?:
+ 1\d{2}|
+ 2(?:
+ 3\d{2}|
+ 4\d{1,2}
+ )
+ )|
+ 2(?:
+ 2[2-5]\d{2}|
+ 5(?:
+ [3-5]\d{2}|
+ 7
+ )|
+ 8\d{2}
+ )|
+ 3(?:
+ 1|
+ 2[3478]
+ )\d{2}|
+ 4(?:
+ 1|
+ 2[34]
+ )\d{2}|
+ 5(?:
+ 1|
+ 2[47]
+ )\d{2}|
+ 6(?:
+ [18]\d{2}|
+ 6(?:
+ 2(?:
+ 2\d|
+ [34]\d{2}
+ )|
+ 5(?:
+ [24]\d{2}|
+ 3\d|
+ 5\d{1,2}
+ )
+ )
+ )|
+ 72[2-5]\d{2}|
+ 8(?:
+ 1\d{2}|
+ 2[2-5]\d{2}
+ )|
+ 9(?:
+ 1\d{2}|
+ 2[2-6]\d{2}
+ )
+ )\d{3}|
+ 7(?:
+ (?:
+ 02|
+ [3-589]1|
+ 6[12]|
+ 72[24]
+ )\d{2}|
+ 21\d{3}|
+ 32
+ )\d{3}|
+ 8(?:
+ (?:
+ 4[12]|
+ [5-7]2|
+ 1\d?
+ )|
+ (?:
+ 0|
+ 3[12]|
+ [5-7]1|
+ 217
+ )\d
+ )\d{4}|
+ 9(?:
+ [35]1|
+ (?:
+ [024]2|
+ 81
+ )\d|
+ (?:
+ 1|
+ [24]1
+ )\d{2}
+ )\d{3}
+ "#,
+ )
+ .build()
+ .unwrap(); // panics (failing the test) if the pattern is rejected
+}
diff --git a/third_party/rust/regex/tests/flags.rs b/third_party/rust/regex/tests/flags.rs
new file mode 100644
index 0000000000..c33b82d434
--- /dev/null
+++ b/third_party/rust/regex/tests/flags.rs
@@ -0,0 +1,31 @@
+mat!(match_flag_case, "(?-u)(?i)abc", "ABC", Some((0, 3)));
+mat!(match_flag_weird_case, "(?-u)(?i)a(?-i)bc", "Abc", Some((0, 3)));
+mat!(match_flag_weird_case_not, "(?-u)(?i)a(?-i)bc", "ABC", None);
+mat!(match_flag_case_dotnl, "(?-u)(?is)a(?u:.)", "A\n", Some((0, 2)));
+mat!(
+ match_flag_case_dotnl_toggle,
+ "(?-u)(?is)a(?u:.)(?-is)a(?u:.)",
+ "A\nab",
+ Some((0, 4))
+);
+mat!(
+ match_flag_case_dotnl_toggle_not,
+ "(?-u)(?is)a(?u:.)(?-is)a(?u:.)",
+ "A\na\n",
+ None
+);
+mat!(
+ match_flag_case_dotnl_toggle_ok,
+ "(?-u)(?is)a(?u:.)(?-is:a(?u:.))?",
+ "A\na\n",
+ Some((0, 2))
+);
+mat!(
+ match_flag_multi,
+ r"(?-u)(?m)(?:^\d+$\n?)+",
+ "123\n456\n789",
+ Some((0, 11))
+);
+mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1)));
+mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2)));
+mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2)));
diff --git a/third_party/rust/regex/tests/fowler.rs b/third_party/rust/regex/tests/fowler.rs
new file mode 100644
index 0000000000..7f56a758d3
--- /dev/null
+++ b/third_party/rust/regex/tests/fowler.rs
@@ -0,0 +1,1588 @@
+// DO NOT EDIT. Automatically generated by 'scripts/regex-match-tests.py'
+// on 2019-09-02 11:07:37.849994.
+
+// Tests from basic.dat
+mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)));
+mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7)));
+mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8)));
+mat!(match_basic_6, r"\)", r"()", Some((1, 2)));
+mat!(match_basic_7, r"a]", r"a]a", Some((0, 2)));
+mat!(match_basic_9, r"\}", r"}", Some((0, 1)));
+mat!(match_basic_10, r"\]", r"]", Some((0, 1)));
+mat!(match_basic_12, r"]", r"]", Some((0, 1)));
+mat!(match_basic_15, r"^a", r"ax", Some((0, 1)));
+mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3)));
+mat!(match_basic_17, r"a\^", r"a^", Some((0, 2)));
+mat!(match_basic_18, r"a$", r"aa", Some((1, 2)));
+mat!(match_basic_19, r"a\$", r"a$", Some((0, 2)));
+mat!(match_basic_20, r"^$", r"", Some((0, 0)));
+mat!(match_basic_21, r"$^", r"", Some((0, 0)));
+mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2)));
+mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1)));
+mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0)));
+mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4)));
+mat!(
+ match_basic_26,
+ r"(ab|a)(bc|c)",
+ r"abc",
+ Some((0, 3)),
+ Some((0, 2)),
+ Some((2, 3))
+);
+mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2)));
+mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2)));
+mat!(
+ match_basic_29,
+ r"(a*)(b?)(b+)b{3}",
+ r"aaabbbbbbb",
+ Some((0, 10)),
+ Some((0, 3)),
+ Some((3, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_30,
+ r"(a*)(b{0,1})(b{1,})b{3}",
+ r"aaabbbbbbb",
+ Some((0, 10)),
+ Some((0, 3)),
+ Some((3, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_32,
+ r"((a|a)|a)",
+ r"a",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1))
+);
+mat!(
+ match_basic_33,
+ r"(a*)(a|aa)",
+ r"aaaa",
+ Some((0, 4)),
+ Some((0, 3)),
+ Some((3, 4))
+);
+mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4)));
+mat!(
+ match_basic_35,
+ r"a(b)|c(d)|a(e)f",
+ r"aef",
+ Some((0, 3)),
+ None,
+ None,
+ Some((1, 2))
+);
+mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1)));
+mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1)));
+mat!(
+ match_basic_38,
+ r"(a|b)c|a(b|c)",
+ r"ab",
+ Some((0, 2)),
+ None,
+ Some((1, 2))
+);
+mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2)));
+mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2)));
+mat!(
+ match_basic_41,
+ r"(.a|.b).*|.*(.a|.b)",
+ r"xa",
+ Some((0, 2)),
+ Some((0, 2))
+);
+mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2)));
+mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2)));
+mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2)));
+mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8)));
+mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9)));
+mat!(
+ match_basic_47,
+ r"(aa|aaa)*|(a|aaaaa)",
+ r"aa",
+ Some((0, 2)),
+ Some((0, 2))
+);
+mat!(
+ match_basic_48,
+ r"(a.|.a.)*|(a|.a...)",
+ r"aa",
+ Some((0, 2)),
+ Some((0, 2))
+);
+mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3)));
+mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4)));
+mat!(
+ match_basic_51,
+ r"(?i)(?-u)(Ab|cD)*",
+ r"aBcD",
+ Some((0, 4)),
+ Some((2, 4))
+);
+mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3)));
+mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3)));
+mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4)));
+mat!(
+ match_basic_55,
+ r":::1:::0:|:::1:1:0:",
+ r":::0:::1:::1:::0:",
+ Some((8, 17))
+);
+mat!(
+ match_basic_56,
+ r":::1:::0:|:::1:1:1:",
+ r":::0:::1:::1:::0:",
+ Some((8, 17))
+);
+mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1)));
+mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3)));
+mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3)));
+mat!(
+ match_basic_65,
+ r"
+",
+ r"
+",
+ Some((0, 1))
+);
+mat!(
+ match_basic_66,
+ r"
+",
+ r"
+",
+ Some((0, 1))
+);
+mat!(
+ match_basic_67,
+ r"[^a]",
+ r"
+",
+ Some((0, 1))
+);
+mat!(
+ match_basic_68,
+ r"
+a",
+ r"
+a",
+ Some((0, 2))
+);
+mat!(
+ match_basic_69,
+ r"(a)(b)(c)",
+ r"abc",
+ Some((0, 3)),
+ Some((0, 1)),
+ Some((1, 2)),
+ Some((2, 3))
+);
+mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3)));
+mat!(
+ match_basic_71,
+ r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)",
+ r"feb 6,",
+ Some((0, 6))
+);
+mat!(
+ match_basic_72,
+ r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)",
+ r"2/7",
+ Some((0, 3))
+);
+mat!(
+ match_basic_73,
+ r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)",
+ r"feb 1,Feb 6",
+ Some((5, 11))
+);
+mat!(
+ match_basic_74,
+ r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))",
+ r"x",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1))
+);
+mat!(
+ match_basic_75,
+ r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*",
+ r"xx",
+ Some((0, 2)),
+ Some((1, 2)),
+ Some((1, 2))
+);
+mat!(
+ match_basic_76,
+ r"a?(ab|ba)*",
+ r"ababababababababababababababababababababababababababababababababababababababababa",
+ Some((0, 81)),
+ Some((79, 81))
+);
+mat!(
+ match_basic_77,
+ r"abaa|abbaa|abbbaa|abbbbaa",
+ r"ababbabbbabbbabbbbabbbbaa",
+ Some((18, 25))
+);
+mat!(
+ match_basic_78,
+ r"abaa|abbaa|abbbaa|abbbbaa",
+ r"ababbabbbabbbabbbbabaa",
+ Some((18, 22))
+);
+mat!(
+ match_basic_79,
+ r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc",
+ r"baaabbbabac",
+ Some((7, 11))
+);
+mat!(match_basic_80, r".*", r"", Some((0, 2)));
+mat!(
+ match_basic_81,
+ r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll",
+ r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa",
+ Some((53, 57))
+);
+mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10)));
+mat!(match_basic_84, r"^", r"", Some((0, 0)));
+mat!(match_basic_85, r"$", r"", Some((0, 0)));
+mat!(match_basic_86, r"^$", r"", Some((0, 0)));
+mat!(match_basic_87, r"^a$", r"a", Some((0, 1)));
+mat!(match_basic_88, r"abc", r"abc", Some((0, 3)));
+mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4)));
+mat!(match_basic_90, r"abc", r"ababc", Some((2, 5)));
+mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3)));
+mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3)));
+mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4)));
+mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6)));
+mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4)));
+mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6)));
+mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4)));
+mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3)));
+mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3)));
+mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3)));
+mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3)));
+mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4)));
+mat!(match_basic_103, r"^", r"abc", Some((0, 0)));
+mat!(match_basic_104, r"$", r"abc", Some((3, 3)));
+mat!(match_basic_105, r"a.c", r"abc", Some((0, 3)));
+mat!(match_basic_106, r"a.c", r"axc", Some((0, 3)));
+mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5)));
+mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3)));
+mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3)));
+mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 3)));
+mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2)));
+mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2)));
+mat!(match_basic_113, r"a]", r"a]", Some((0, 2)));
+mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3)));
+mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3)));
+mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3)));
+mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3)));
+mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2)));
+mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2)));
+mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3)));
+mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2)));
+mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4)));
+mat!(
+ match_basic_123,
+ r"((a))",
+ r"abc",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1))
+);
+mat!(
+ match_basic_124,
+ r"(a)b(c)",
+ r"abc",
+ Some((0, 3)),
+ Some((0, 1)),
+ Some((2, 3))
+);
+mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7)));
+mat!(match_basic_126, r"a*", r"aaa", Some((0, 3)));
+mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None);
+mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0)));
+mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None);
+mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2)));
+mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2)));
+mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1)));
+mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3)));
+mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None);
+mat!(match_basic_138, r"a*", r"", Some((0, 0)));
+mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5)));
+mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1)));
+mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1)));
+mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1)));
+mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None);
+mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7)));
+mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3)));
+mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2)));
+mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4)));
+mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3)));
+mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2)));
+mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1)));
+mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3)));
+mat!(
+ match_basic_153,
+ r"a([bc]*)(c*d)",
+ r"abcd",
+ Some((0, 4)),
+ Some((1, 3)),
+ Some((3, 4))
+);
+mat!(
+ match_basic_154,
+ r"a([bc]+)(c*d)",
+ r"abcd",
+ Some((0, 4)),
+ Some((1, 3)),
+ Some((3, 4))
+);
+mat!(
+ match_basic_155,
+ r"a([bc]*)(c+d)",
+ r"abcd",
+ Some((0, 4)),
+ Some((1, 2)),
+ Some((2, 4))
+);
+mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7)));
+mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2)));
+mat!(
+ match_basic_158,
+ r"((a)(b)c)(d)",
+ r"abcd",
+ Some((0, 4)),
+ Some((0, 3)),
+ Some((0, 1)),
+ Some((1, 2)),
+ Some((3, 4))
+);
+mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5)));
+mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3)));
+mat!(
+ match_basic_161,
+ r"(bc+d$|ef*g.|h?i(j|k))",
+ r"effgz",
+ Some((0, 5)),
+ Some((0, 5))
+);
+mat!(
+ match_basic_162,
+ r"(bc+d$|ef*g.|h?i(j|k))",
+ r"ij",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((1, 2))
+);
+mat!(
+ match_basic_163,
+ r"(bc+d$|ef*g.|h?i(j|k))",
+ r"reffgz",
+ Some((1, 6)),
+ Some((1, 6))
+);
+mat!(
+ match_basic_164,
+ r"(((((((((a)))))))))",
+ r"a",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((0, 1))
+);
+mat!(
+ match_basic_165,
+ r"multiple words",
+ r"multiple words yeah",
+ Some((0, 14))
+);
+mat!(
+ match_basic_166,
+ r"(.*)c(.*)",
+ r"abcde",
+ Some((0, 5)),
+ Some((0, 2)),
+ Some((3, 5))
+);
+mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4)));
+mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3)));
+mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3)));
+mat!(
+ match_basic_170,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Qaddafi",
+ Some((0, 15)),
+ None,
+ Some((10, 12))
+);
+mat!(
+ match_basic_171,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Mo'ammar Gadhafi",
+ Some((0, 16)),
+ None,
+ Some((11, 13))
+);
+mat!(
+ match_basic_172,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Kaddafi",
+ Some((0, 15)),
+ None,
+ Some((10, 12))
+);
+mat!(
+ match_basic_173,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Qadhafi",
+ Some((0, 15)),
+ None,
+ Some((10, 12))
+);
+mat!(
+ match_basic_174,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Gadafi",
+ Some((0, 14)),
+ None,
+ Some((10, 11))
+);
+mat!(
+ match_basic_175,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Mu'ammar Qadafi",
+ Some((0, 15)),
+ None,
+ Some((11, 12))
+);
+mat!(
+ match_basic_176,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Moamar Gaddafi",
+ Some((0, 14)),
+ None,
+ Some((9, 11))
+);
+mat!(
+ match_basic_177,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Mu'ammar Qadhdhafi",
+ Some((0, 18)),
+ None,
+ Some((13, 15))
+);
+mat!(
+ match_basic_178,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Khaddafi",
+ Some((0, 16)),
+ None,
+ Some((11, 13))
+);
+mat!(
+ match_basic_179,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Ghaddafy",
+ Some((0, 16)),
+ None,
+ Some((11, 13))
+);
+mat!(
+ match_basic_180,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Ghadafi",
+ Some((0, 15)),
+ None,
+ Some((11, 12))
+);
+mat!(
+ match_basic_181,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Ghaddafi",
+ Some((0, 16)),
+ None,
+ Some((11, 13))
+);
+mat!(
+ match_basic_182,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muamar Kaddafi",
+ Some((0, 14)),
+ None,
+ Some((9, 11))
+);
+mat!(
+ match_basic_183,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Quathafi",
+ Some((0, 16)),
+ None,
+ Some((11, 13))
+);
+mat!(
+ match_basic_184,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Muammar Gheddafi",
+ Some((0, 16)),
+ None,
+ Some((11, 13))
+);
+mat!(
+ match_basic_185,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Moammar Khadafy",
+ Some((0, 15)),
+ None,
+ Some((11, 12))
+);
+mat!(
+ match_basic_186,
+ r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
+ r"Moammar Qudhafi",
+ Some((0, 15)),
+ None,
+ Some((10, 12))
+);
+mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4)));
+mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4)));
+mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4)));
+mat!(
+ match_basic_190,
+ r"^([^!.]+).att.com!(.+)$",
+ r"gryphon.att.com!eby",
+ Some((0, 19)),
+ Some((0, 7)),
+ Some((16, 19))
+);
+mat!(
+ match_basic_191,
+ r"^([^!]+!)?([^!]+)$",
+ r"bas",
+ Some((0, 3)),
+ None,
+ Some((0, 3))
+);
+mat!(
+ match_basic_192,
+ r"^([^!]+!)?([^!]+)$",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_193,
+ r"^([^!]+!)?([^!]+)$",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_194,
+ r"^.+!([^!]+!)([^!]+)$",
+ r"foo!bar!bas",
+ Some((0, 11)),
+ Some((4, 8)),
+ Some((8, 11))
+);
+mat!(
+ match_basic_195,
+ r"((foo)|(bar))!bas",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 3)),
+ None,
+ Some((0, 3))
+);
+mat!(
+ match_basic_196,
+ r"((foo)|(bar))!bas",
+ r"foo!bar!bas",
+ Some((4, 11)),
+ Some((4, 7)),
+ None,
+ Some((4, 7))
+);
+mat!(
+ match_basic_197,
+ r"((foo)|(bar))!bas",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 3)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_198,
+ r"((foo)|bar)!bas",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_199,
+ r"((foo)|bar)!bas",
+ r"foo!bar!bas",
+ Some((4, 11)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_200,
+ r"((foo)|bar)!bas",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 3)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_201,
+ r"(foo|(bar))!bas",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 3)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_202,
+ r"(foo|(bar))!bas",
+ r"foo!bar!bas",
+ Some((4, 11)),
+ Some((4, 7)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_203,
+ r"(foo|(bar))!bas",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_204,
+ r"(foo|bar)!bas",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_205,
+ r"(foo|bar)!bas",
+ r"foo!bar!bas",
+ Some((4, 11)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_206,
+ r"(foo|bar)!bas",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 3))
+);
+mat!(
+ match_basic_207,
+ r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$",
+ r"foo!bar!bas",
+ Some((0, 11)),
+ Some((0, 11)),
+ None,
+ None,
+ Some((4, 8)),
+ Some((8, 11))
+);
+mat!(
+ match_basic_208,
+ r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$",
+ r"bas",
+ Some((0, 3)),
+ None,
+ Some((0, 3))
+);
+mat!(
+ match_basic_209,
+ r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_210,
+ r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$",
+ r"foo!bar!bas",
+ Some((0, 11)),
+ None,
+ None,
+ Some((4, 8)),
+ Some((8, 11))
+);
+mat!(
+ match_basic_211,
+ r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_212,
+ r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$",
+ r"bas",
+ Some((0, 3)),
+ Some((0, 3)),
+ None,
+ Some((0, 3))
+);
+mat!(
+ match_basic_213,
+ r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$",
+ r"bar!bas",
+ Some((0, 7)),
+ Some((0, 7)),
+ Some((0, 4)),
+ Some((4, 7))
+);
+mat!(
+ match_basic_214,
+ r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$",
+ r"foo!bar!bas",
+ Some((0, 11)),
+ Some((0, 11)),
+ None,
+ None,
+ Some((4, 8)),
+ Some((8, 11))
+);
+mat!(
+ match_basic_215,
+ r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$",
+ r"foo!bas",
+ Some((0, 7)),
+ Some((0, 7)),
+ Some((0, 4)),
+ Some((4, 7))
+);
+mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4)));
+mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4)));
+mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4)));
+mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4)));
+mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4)));
+mat!(match_basic_221, r"\\000", r"\000", Some((0, 4)));
+
+// Tests from nullsubexpr.dat
+mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None);
+mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0)));
+mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0)));
+mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_17, r"(a+)+", r"x", None);
+mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None);
+mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0)));
+mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None);
+mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(
+ match_nullsubexpr_34,
+ r"([^b]*)*",
+ r"aaaaaab",
+ Some((0, 6)),
+ Some((0, 6))
+);
+mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6)));
+mat!(
+ match_nullsubexpr_41,
+ r"([ab]*)*",
+ r"aaaabcde",
+ Some((0, 5)),
+ Some((0, 5))
+);
+mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1)));
+mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6)));
+mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None);
+mat!(
+ match_nullsubexpr_46,
+ r"([^ab]*)*",
+ r"ccccxx",
+ Some((0, 6)),
+ Some((0, 6))
+);
+mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None);
+mat!(
+ match_nullsubexpr_50,
+ r"((z)+|a)*",
+ r"zabcde",
+ Some((0, 2)),
+ Some((1, 2))
+);
+mat!(
+ match_nullsubexpr_69,
+ r"(a*)*(x)",
+ r"x",
+ Some((0, 1)),
+ None,
+ Some((0, 1))
+);
+mat!(
+ match_nullsubexpr_70,
+ r"(a*)*(x)",
+ r"ax",
+ Some((0, 2)),
+ Some((0, 1)),
+ Some((1, 2))
+);
+mat!(
+ match_nullsubexpr_71,
+ r"(a*)*(x)",
+ r"axa",
+ Some((0, 2)),
+ Some((0, 1)),
+ Some((1, 2))
+);
+mat!(
+ match_nullsubexpr_73,
+ r"(a*)+(x)",
+ r"x",
+ Some((0, 1)),
+ Some((0, 0)),
+ Some((0, 1))
+);
+mat!(
+ match_nullsubexpr_74,
+ r"(a*)+(x)",
+ r"ax",
+ Some((0, 2)),
+ Some((0, 1)),
+ Some((1, 2))
+);
+mat!(
+ match_nullsubexpr_75,
+ r"(a*)+(x)",
+ r"axa",
+ Some((0, 2)),
+ Some((0, 1)),
+ Some((1, 2))
+);
+mat!(
+ match_nullsubexpr_77,
+ r"(a*){2}(x)",
+ r"x",
+ Some((0, 1)),
+ Some((0, 0)),
+ Some((0, 1))
+);
+mat!(
+ match_nullsubexpr_78,
+ r"(a*){2}(x)",
+ r"ax",
+ Some((0, 2)),
+ Some((1, 1)),
+ Some((1, 2))
+);
+mat!(
+ match_nullsubexpr_79,
+ r"(a*){2}(x)",
+ r"axa",
+ Some((0, 2)),
+ Some((1, 1)),
+ Some((1, 2))
+);
+
+// Tests from repetition.dat
+mat!(match_repetition_10, r"((..)|(.))", r"", None);
+mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None);
+mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None);
+mat!(match_repetition_14, r"((..)|(.)){1}", r"", None);
+mat!(match_repetition_15, r"((..)|(.)){2}", r"", None);
+mat!(match_repetition_16, r"((..)|(.)){3}", r"", None);
+mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0)));
+mat!(
+ match_repetition_20,
+ r"((..)|(.))",
+ r"a",
+ Some((0, 1)),
+ Some((0, 1)),
+ None,
+ Some((0, 1))
+);
+mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None);
+mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None);
+mat!(
+ match_repetition_24,
+ r"((..)|(.)){1}",
+ r"a",
+ Some((0, 1)),
+ Some((0, 1)),
+ None,
+ Some((0, 1))
+);
+mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None);
+mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None);
+mat!(
+ match_repetition_28,
+ r"((..)|(.))*",
+ r"a",
+ Some((0, 1)),
+ Some((0, 1)),
+ None,
+ Some((0, 1))
+);
+mat!(
+ match_repetition_30,
+ r"((..)|(.))",
+ r"aa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_31,
+ r"((..)|(.))((..)|(.))",
+ r"aa",
+ Some((0, 2)),
+ Some((0, 1)),
+ None,
+ Some((0, 1)),
+ Some((1, 2)),
+ None,
+ Some((1, 2))
+);
+mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None);
+mat!(
+ match_repetition_34,
+ r"((..)|(.)){1}",
+ r"aa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_35,
+ r"((..)|(.)){2}",
+ r"aa",
+ Some((0, 2)),
+ Some((1, 2)),
+ None,
+ Some((1, 2))
+);
+mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None);
+mat!(
+ match_repetition_38,
+ r"((..)|(.))*",
+ r"aa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_40,
+ r"((..)|(.))",
+ r"aaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_41,
+ r"((..)|(.))((..)|(.))",
+ r"aaa",
+ Some((0, 3)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 3)),
+ None,
+ Some((2, 3))
+);
+mat!(
+ match_repetition_42,
+ r"((..)|(.))((..)|(.))((..)|(.))",
+ r"aaa",
+ Some((0, 3)),
+ Some((0, 1)),
+ None,
+ Some((0, 1)),
+ Some((1, 2)),
+ None,
+ Some((1, 2)),
+ Some((2, 3)),
+ None,
+ Some((2, 3))
+);
+mat!(
+ match_repetition_44,
+ r"((..)|(.)){1}",
+ r"aaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_46,
+ r"((..)|(.)){2}",
+ r"aaa",
+ Some((0, 3)),
+ Some((2, 3)),
+ Some((0, 2)),
+ Some((2, 3))
+);
+mat!(
+ match_repetition_47,
+ r"((..)|(.)){3}",
+ r"aaa",
+ Some((0, 3)),
+ Some((2, 3)),
+ None,
+ Some((2, 3))
+);
+mat!(
+ match_repetition_50,
+ r"((..)|(.))*",
+ r"aaa",
+ Some((0, 3)),
+ Some((2, 3)),
+ Some((0, 2)),
+ Some((2, 3))
+);
+mat!(
+ match_repetition_52,
+ r"((..)|(.))",
+ r"aaaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_53,
+ r"((..)|(.))((..)|(.))",
+ r"aaaa",
+ Some((0, 4)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_54,
+ r"((..)|(.))((..)|(.))((..)|(.))",
+ r"aaaa",
+ Some((0, 4)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 3)),
+ None,
+ Some((2, 3)),
+ Some((3, 4)),
+ None,
+ Some((3, 4))
+);
+mat!(
+ match_repetition_56,
+ r"((..)|(.)){1}",
+ r"aaaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_57,
+ r"((..)|(.)){2}",
+ r"aaaa",
+ Some((0, 4)),
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_59,
+ r"((..)|(.)){3}",
+ r"aaaa",
+ Some((0, 4)),
+ Some((3, 4)),
+ Some((0, 2)),
+ Some((3, 4))
+);
+mat!(
+ match_repetition_61,
+ r"((..)|(.))*",
+ r"aaaa",
+ Some((0, 4)),
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_63,
+ r"((..)|(.))",
+ r"aaaaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_64,
+ r"((..)|(.))((..)|(.))",
+ r"aaaaa",
+ Some((0, 4)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_65,
+ r"((..)|(.))((..)|(.))((..)|(.))",
+ r"aaaaa",
+ Some((0, 5)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 4)),
+ Some((2, 4)),
+ None,
+ Some((4, 5)),
+ None,
+ Some((4, 5))
+);
+mat!(
+ match_repetition_67,
+ r"((..)|(.)){1}",
+ r"aaaaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_68,
+ r"((..)|(.)){2}",
+ r"aaaaa",
+ Some((0, 4)),
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_70,
+ r"((..)|(.)){3}",
+ r"aaaaa",
+ Some((0, 5)),
+ Some((4, 5)),
+ Some((2, 4)),
+ Some((4, 5))
+);
+mat!(
+ match_repetition_73,
+ r"((..)|(.))*",
+ r"aaaaa",
+ Some((0, 5)),
+ Some((4, 5)),
+ Some((2, 4)),
+ Some((4, 5))
+);
+mat!(
+ match_repetition_75,
+ r"((..)|(.))",
+ r"aaaaaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_76,
+ r"((..)|(.))((..)|(.))",
+ r"aaaaaa",
+ Some((0, 4)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_77,
+ r"((..)|(.))((..)|(.))((..)|(.))",
+ r"aaaaaa",
+ Some((0, 6)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None,
+ Some((2, 4)),
+ Some((2, 4)),
+ None,
+ Some((4, 6)),
+ Some((4, 6)),
+ None
+);
+mat!(
+ match_repetition_79,
+ r"((..)|(.)){1}",
+ r"aaaaaa",
+ Some((0, 2)),
+ Some((0, 2)),
+ Some((0, 2)),
+ None
+);
+mat!(
+ match_repetition_80,
+ r"((..)|(.)){2}",
+ r"aaaaaa",
+ Some((0, 4)),
+ Some((2, 4)),
+ Some((2, 4)),
+ None
+);
+mat!(
+ match_repetition_81,
+ r"((..)|(.)){3}",
+ r"aaaaaa",
+ Some((0, 6)),
+ Some((4, 6)),
+ Some((4, 6)),
+ None
+);
+mat!(
+ match_repetition_83,
+ r"((..)|(.))*",
+ r"aaaaaa",
+ Some((0, 6)),
+ Some((4, 6)),
+ Some((4, 6)),
+ None
+);
+mat!(
+ match_repetition_90,
+ r"X(.?){0,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_91,
+ r"X(.?){1,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_92,
+ r"X(.?){2,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_93,
+ r"X(.?){3,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_94,
+ r"X(.?){4,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_95,
+ r"X(.?){5,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_96,
+ r"X(.?){6,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_97,
+ r"X(.?){7,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((7, 8))
+);
+mat!(
+ match_repetition_98,
+ r"X(.?){8,}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_100,
+ r"X(.?){0,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_102,
+ r"X(.?){1,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_104,
+ r"X(.?){2,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_106,
+ r"X(.?){3,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_108,
+ r"X(.?){4,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_110,
+ r"X(.?){5,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_112,
+ r"X(.?){6,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_114,
+ r"X(.?){7,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_115,
+ r"X(.?){8,8}Y",
+ r"X1234567Y",
+ Some((0, 9)),
+ Some((8, 8))
+);
+mat!(
+ match_repetition_126,
+ r"(a|ab|c|bcd){0,}(d*)",
+ r"ababcd",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((1, 1))
+);
+mat!(
+ match_repetition_127,
+ r"(a|ab|c|bcd){1,}(d*)",
+ r"ababcd",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((1, 1))
+);
+mat!(
+ match_repetition_128,
+ r"(a|ab|c|bcd){2,}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((3, 6)),
+ Some((6, 6))
+);
+mat!(
+ match_repetition_129,
+ r"(a|ab|c|bcd){3,}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((3, 6)),
+ Some((6, 6))
+);
+mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None);
+mat!(
+ match_repetition_131,
+ r"(a|ab|c|bcd){0,10}(d*)",
+ r"ababcd",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((1, 1))
+);
+mat!(
+ match_repetition_132,
+ r"(a|ab|c|bcd){1,10}(d*)",
+ r"ababcd",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((1, 1))
+);
+mat!(
+ match_repetition_133,
+ r"(a|ab|c|bcd){2,10}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((3, 6)),
+ Some((6, 6))
+);
+mat!(
+ match_repetition_134,
+ r"(a|ab|c|bcd){3,10}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((3, 6)),
+ Some((6, 6))
+);
+mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None);
+mat!(
+ match_repetition_136,
+ r"(a|ab|c|bcd)*(d*)",
+ r"ababcd",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((1, 1))
+);
+mat!(
+ match_repetition_137,
+ r"(a|ab|c|bcd)+(d*)",
+ r"ababcd",
+ Some((0, 1)),
+ Some((0, 1)),
+ Some((1, 1))
+);
+mat!(
+ match_repetition_143,
+ r"(ab|a|c|bcd){0,}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_145,
+ r"(ab|a|c|bcd){1,}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_147,
+ r"(ab|a|c|bcd){2,}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_149,
+ r"(ab|a|c|bcd){3,}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None);
+mat!(
+ match_repetition_152,
+ r"(ab|a|c|bcd){0,10}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_154,
+ r"(ab|a|c|bcd){1,10}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_156,
+ r"(ab|a|c|bcd){2,10}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_158,
+ r"(ab|a|c|bcd){3,10}(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None);
+mat!(
+ match_repetition_161,
+ r"(ab|a|c|bcd)*(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
+mat!(
+ match_repetition_163,
+ r"(ab|a|c|bcd)+(d*)",
+ r"ababcd",
+ Some((0, 6)),
+ Some((4, 5)),
+ Some((5, 6))
+);
diff --git a/third_party/rust/regex/tests/macros.rs b/third_party/rust/regex/tests/macros.rs
new file mode 100644
index 0000000000..e70e9489fd
--- /dev/null
+++ b/third_party/rust/regex/tests/macros.rs
@@ -0,0 +1,160 @@
+// Convenience macros.
+
+macro_rules! findall {
+ ($re:expr, $text:expr) => {{
+ $re.find_iter(text!($text))
+ .map(|m| (m.start(), m.end())).collect::<Vec<_>>()
+ }}
+}
+
+// Macros for automatically producing tests.
+
+macro_rules! ismatch {
+ ($name:ident, $re:expr, $text:expr, $ismatch:expr) => {
+ #[test]
+ fn $name() {
+ let re = regex!($re);
+ assert_eq!($ismatch, re.is_match(text!($text)));
+ }
+ };
+}
+
+macro_rules! mat(
+ ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
+ #[test]
+ fn $name() {
+ let text = text!($text);
+ let expected: Vec<Option<_>> = vec![$($loc)+];
+ let r = regex!($re);
+ let got: Vec<Option<_>> = match r.captures(text) {
+ Some(c) => {
+ assert!(r.is_match(text));
+ assert!(r.shortest_match(text).is_some());
+ r.capture_names()
+ .enumerate()
+ .map(|(i, _)| c.get(i).map(|m| (m.start(), m.end())))
+ .collect()
+ }
+ None => vec![None],
+ };
+ // The test set sometimes leaves out capture groups, so truncate
+ // the actual capture groups to match the test set.
+ let mut sgot = &got[..];
+ if sgot.len() > expected.len() {
+ sgot = &sgot[0..expected.len()]
+ }
+ if expected != sgot {
+ panic!("For RE '{}' against '{:?}', \
+ expected '{:?}' but got '{:?}'",
+ $re, text, expected, sgot);
+ }
+ }
+ );
+);
+
+macro_rules! matiter(
+ ($name:ident, $re:expr, $text:expr) => (
+ #[test]
+ fn $name() {
+ let text = text!($text);
+ let expected: Vec<(usize, usize)> = vec![];
+ let r = regex!($re);
+ let got: Vec<_> =
+ r.find_iter(text).map(|m| (m.start(), m.end())).collect();
+ if expected != got {
+ panic!("For RE '{}' against '{:?}', \
+ expected '{:?}' but got '{:?}'",
+ $re, text, expected, got);
+ }
+ let captures_got: Vec<_> =
+ r.captures_iter(text)
+ .map(|c| c.get(0).unwrap())
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ if captures_got != got {
+ panic!("For RE '{}' against '{:?}', \
+ got '{:?}' using find_iter but got '{:?}' \
+ using captures_iter",
+ $re, text, got, captures_got);
+ }
+ }
+ );
+ ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
+ #[test]
+ fn $name() {
+ let text = text!($text);
+ let expected: Vec<_> = vec![$($loc)+];
+ let r = regex!($re);
+ let got: Vec<_> =
+ r.find_iter(text).map(|m| (m.start(), m.end())).collect();
+ if expected != got {
+ panic!("For RE '{}' against '{:?}', \
+ expected '{:?}' but got '{:?}'",
+ $re, text, expected, got);
+ }
+ let captures_got: Vec<_> =
+ r.captures_iter(text)
+ .map(|c| c.get(0).unwrap())
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ if captures_got != got {
+ panic!("For RE '{}' against '{:?}', \
+ got '{:?}' using find_iter but got '{:?}' \
+ using captures_iter",
+ $re, text, got, captures_got);
+ }
+ }
+ );
+);
+
+macro_rules! matset {
+ ($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => {
+ #[test]
+ fn $name() {
+ let text = text!($text);
+ let set = regex_set!($res);
+ assert!(set.is_match(text));
+ let expected = vec![$($match_index),*];
+ let matches = set.matches(text);
+ assert!(matches.matched_any());
+ let got: Vec<_> = matches.into_iter().collect();
+ assert_eq!(expected, got);
+ }
+ }
+}
+
+macro_rules! nomatset {
+ ($name:ident, $res:expr, $text:expr) => {
+ #[test]
+ fn $name() {
+ let text = text!($text);
+ let set = regex_set!($res);
+ assert!(!set.is_match(text));
+ let matches = set.matches(text);
+ assert!(!matches.matched_any());
+ assert_eq!(0, matches.into_iter().count());
+ }
+ }
+}
+
+macro_rules! split {
+ ($name:ident, $re:expr, $text:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let re = regex!($re);
+ let splitted: Vec<_> = re.split(t!($text)).collect();
+ assert_eq!($expected, &*splitted);
+ }
+ }
+}
+
+macro_rules! splitn {
+ ($name:ident, $re:expr, $text:expr, $limit:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let re = regex!($re);
+ let splitted: Vec<_> = re.splitn(t!($text), $limit).collect();
+ assert_eq!($expected, &*splitted);
+ }
+ }
+}
diff --git a/third_party/rust/regex/tests/macros_bytes.rs b/third_party/rust/regex/tests/macros_bytes.rs
new file mode 100644
index 0000000000..3d6c8c3ac8
--- /dev/null
+++ b/third_party/rust/regex/tests/macros_bytes.rs
@@ -0,0 +1,39 @@
+// Macros for use in writing tests generic over &str/&[u8].
+macro_rules! text { ($text:expr) => { $text.as_bytes() } }
+macro_rules! t { ($re:expr) => { text!($re) } }
+macro_rules! match_text { ($text:expr) => { $text.as_bytes() } }
+macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } }
+macro_rules! empty_vec { () => { <Vec<&[u8]>>::new() } }
+macro_rules! bytes { ($text:expr) => { $text } }
+
+macro_rules! no_expand {
+ ($text:expr) => {{
+ use regex::bytes::NoExpand;
+ NoExpand(text!($text))
+ }}
+}
+
+macro_rules! show {
+ ($text:expr) => {{
+ use std::ascii::escape_default;
+ let mut s = vec![];
+ for &b in bytes!($text) {
+ s.extend(escape_default(b));
+ }
+ String::from_utf8(s).unwrap()
+ }}
+}
+
+macro_rules! expand {
+ ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let re = regex!($re);
+ let cap = re.captures(t!($text)).unwrap();
+
+ let mut got = vec![];
+ cap.expand(t!($expand), &mut got);
+ assert_eq!(show!(t!($expected)), show!(&*got));
+ }
+ }
+}
diff --git a/third_party/rust/regex/tests/macros_str.rs b/third_party/rust/regex/tests/macros_str.rs
new file mode 100644
index 0000000000..7b7eb110c2
--- /dev/null
+++ b/third_party/rust/regex/tests/macros_str.rs
@@ -0,0 +1,38 @@
+// Macros for use in writing tests generic over &str/&[u8].
+macro_rules! text { ($text:expr) => { $text } }
+macro_rules! t { ($text:expr) => { text!($text) } }
+macro_rules! match_text { ($text:expr) => { $text.as_str() } }
+macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } }
+macro_rules! empty_vec { () => { <Vec<&str>>::new() } }
+macro_rules! bytes { ($text:expr) => { std::str::from_utf8($text.as_ref()).unwrap() } }
+
+macro_rules! no_expand {
+ ($text:expr) => {{
+ use regex::NoExpand;
+ NoExpand(text!($text))
+ }}
+}
+
+macro_rules! show { ($text:expr) => { $text } }
+
+// N.B. The expansion API for &str and &[u8] APIs differs slightly for now,
+// but they should be unified in 1.0. Then we can move this macro back into
+// tests/api.rs where it is used. ---AG
+macro_rules! expand {
+ ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => {
+ #[test]
+ fn $name() {
+ let re = regex!($re);
+ let cap = re.captures(t!($text)).unwrap();
+
+ let mut got = String::new();
+ cap.expand(t!($expand), &mut got);
+ assert_eq!(show!(t!($expected)), show!(&*got));
+ }
+ }
+}
+
+#[cfg(feature = "pattern")]
+macro_rules! searcher_expr { ($e:expr) => ($e) }
+#[cfg(not(feature = "pattern"))]
+macro_rules! searcher_expr { ($e:expr) => ({}) }
diff --git a/third_party/rust/regex/tests/misc.rs b/third_party/rust/regex/tests/misc.rs
new file mode 100644
index 0000000000..314811e252
--- /dev/null
+++ b/third_party/rust/regex/tests/misc.rs
@@ -0,0 +1,4 @@
+mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3)));
+mat!(prefix_literal_nomatch, r"^abc", r"zabc", None);
+mat!(one_literal_edge, r"abc", r"xxxxxab", None);
+matiter!(terminates, r"a$", r"a", (0, 1));
diff --git a/third_party/rust/regex/tests/multiline.rs b/third_party/rust/regex/tests/multiline.rs
new file mode 100644
index 0000000000..62ee47b62b
--- /dev/null
+++ b/third_party/rust/regex/tests/multiline.rs
@@ -0,0 +1,144 @@
+matiter!(
+ match_multi_1,
+ r"(?m)^[a-z]+$",
+ "abc\ndef\nxyz",
+ (0, 3),
+ (4, 7),
+ (8, 11)
+);
+matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz");
+matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", (0, 0), (4, 4), (8, 8));
+matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", (3, 3), (7, 7), (11, 11));
+matiter!(
+ match_multi_5,
+ r"(?m)^[a-z]",
+ "abc\ndef\nxyz",
+ (0, 1),
+ (4, 5),
+ (8, 9)
+);
+matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz");
+matiter!(
+ match_multi_7,
+ r"(?m)[a-z]$",
+ "abc\ndef\nxyz",
+ (2, 3),
+ (6, 7),
+ (10, 11)
+);
+matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz");
+matiter!(match_multi_9, r"(?m)^$", "", (0, 0));
+
+matiter!(
+ match_multi_rep_1,
+ r"(?m)(?:^$)*",
+ "a\nb\nc",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5)
+);
+matiter!(
+ match_multi_rep_2,
+ r"(?m)(?:^|a)+",
+ "a\naaa\n",
+ (0, 0),
+ (2, 2),
+ (3, 5),
+ (6, 6)
+);
+matiter!(
+ match_multi_rep_3,
+ r"(?m)(?:^|a)*",
+ "a\naaa\n",
+ (0, 1),
+ (2, 5),
+ (6, 6)
+);
+matiter!(
+ match_multi_rep_4,
+ r"(?m)(?:^[a-z])+",
+ "abc\ndef\nxyz",
+ (0, 1),
+ (4, 5),
+ (8, 9)
+);
+matiter!(
+ match_multi_rep_5,
+ r"(?m)(?:^[a-z]{3}\n?)+",
+ "abc\ndef\nxyz",
+ (0, 11)
+);
+matiter!(
+ match_multi_rep_6,
+ r"(?m)(?:^[a-z]{3}\n?)*",
+ "abc\ndef\nxyz",
+ (0, 11)
+);
+matiter!(
+ match_multi_rep_7,
+ r"(?m)(?:\n?[a-z]{3}$)+",
+ "abc\ndef\nxyz",
+ (0, 11)
+);
+matiter!(
+ match_multi_rep_8,
+ r"(?m)(?:\n?[a-z]{3}$)*",
+ "abc\ndef\nxyz",
+ (0, 11)
+);
+matiter!(
+ match_multi_rep_9,
+ r"(?m)^*",
+ "\naa\n",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4)
+);
+matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", (0, 0), (1, 1), (4, 4));
+matiter!(
+ match_multi_rep_11,
+ r"(?m)$*",
+ "\naa\n",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4)
+);
+matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", (0, 0), (3, 3), (4, 4));
+matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", (0, 2), (5, 7));
+matiter!(
+ match_multi_rep_14,
+ r"(?m)(?:$\n)*",
+ "\n\naaa\n\n",
+ (0, 2),
+ (3, 3),
+ (4, 4),
+ (5, 7)
+);
+matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", (0, 2), (5, 7));
+matiter!(
+ match_multi_rep_16,
+ r"(?m)(?:^|$)+",
+ "\n\naaa\n\n",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (5, 5),
+ (6, 6),
+ (7, 7)
+);
+matiter!(
+ match_multi_rep_17,
+ r"(?m)(?:$\n)*",
+ "\n\naaa\n\n",
+ (0, 2),
+ (3, 3),
+ (4, 4),
+ (5, 7)
+);
diff --git a/third_party/rust/regex/tests/noparse.rs b/third_party/rust/regex/tests/noparse.rs
new file mode 100644
index 0000000000..8ded1dce7b
--- /dev/null
+++ b/third_party/rust/regex/tests/noparse.rs
@@ -0,0 +1,45 @@
+macro_rules! noparse(
+ ($name:ident, $re:expr) => (
+ #[test]
+ fn $name() {
+ let re = $re;
+ match regex_new!(re) {
+ Err(_) => {},
+ Ok(_) => panic!("Regex '{}' should cause a parse error.", re),
+ }
+ }
+ );
+);
+
+noparse!(fail_no_repeat_arg, "*");
+noparse!(fail_incomplete_escape, "\\");
+noparse!(fail_class_incomplete, "[A-");
+noparse!(fail_class_not_closed, "[A");
+noparse!(fail_class_no_begin, r"[\A]");
+noparse!(fail_class_no_end, r"[\z]");
+noparse!(fail_class_no_boundary, r"[\b]");
+noparse!(fail_open_paren, "(");
+noparse!(fail_close_paren, ")");
+noparse!(fail_invalid_range, "[a-Z]");
+noparse!(fail_empty_capture_name, "(?P<>a)");
+noparse!(fail_bad_capture_name, "(?P<na-me>)");
+noparse!(fail_bad_flag, "(?a)a");
+noparse!(fail_too_big, "a{10000000}");
+noparse!(fail_counted_no_close, "a{1001");
+noparse!(fail_counted_decreasing, "a{2,1}");
+noparse!(fail_counted_nonnegative, "a{-1,1}");
+noparse!(fail_unfinished_cap, "(?");
+noparse!(fail_unfinished_escape, "\\");
+noparse!(fail_octal_digit, r"\8");
+noparse!(fail_hex_digit, r"\xG0");
+noparse!(fail_hex_short, r"\xF");
+noparse!(fail_hex_long_digits, r"\x{fffg}");
+noparse!(fail_flag_bad, "(?a)");
+noparse!(fail_flag_empty, "(?)");
+noparse!(fail_double_neg, "(?-i-i)");
+noparse!(fail_neg_empty, "(?i-)");
+noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)");
+noparse!(fail_range_end_no_class, "[a-[:lower:]]");
+noparse!(fail_range_end_no_begin, r"[a-\A]");
+noparse!(fail_range_end_no_end, r"[a-\z]");
+noparse!(fail_range_end_no_boundary, r"[a-\b]");
diff --git a/third_party/rust/regex/tests/regression.rs b/third_party/rust/regex/tests/regression.rs
new file mode 100644
index 0000000000..e8b2525385
--- /dev/null
+++ b/third_party/rust/regex/tests/regression.rs
@@ -0,0 +1,222 @@
+// See: https://github.com/rust-lang/regex/issues/48
+#[test]
+fn invalid_regexes_no_crash() {
+ assert!(regex_new!("(*)").is_err());
+ assert!(regex_new!("(?:?)").is_err());
+ assert!(regex_new!("(?)").is_err());
+ assert!(regex_new!("*").is_err());
+}
+
+// See: https://github.com/rust-lang/regex/issues/98
+#[test]
+fn regression_many_repeat_stack_overflow() {
+ let re = regex!("^.{1,2500}");
+ assert_eq!(vec![(0, 1)], findall!(re, "a"));
+}
+
+// See: https://github.com/rust-lang/regex/issues/555
+#[test]
+fn regression_invalid_repetition_expr() {
+ assert!(regex_new!("(?m){1,1}").is_err());
+}
+
+// See: https://github.com/rust-lang/regex/issues/527
+#[test]
+fn regression_invalid_flags_expression() {
+ assert!(regex_new!("(((?x)))").is_ok());
+}
+
+// See: https://github.com/rust-lang/regex/issues/75
+mat!(regression_unsorted_binary_search_1, r"(?i-u)[a_]+", "A_", Some((0, 2)));
+mat!(regression_unsorted_binary_search_2, r"(?i-u)[A_]+", "a_", Some((0, 2)));
+
+// See: https://github.com/rust-lang/regex/issues/99
+#[cfg(feature = "unicode-case")]
+mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None);
+#[cfg(feature = "unicode-case")]
+mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None);
+
+// See: https://github.com/rust-lang/regex/issues/101
+mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1)));
+
+// See: https://github.com/rust-lang/regex/issues/129
+#[test]
+fn regression_captures_rep() {
+ let re = regex!(r"([a-f]){2}(?P<foo>[x-z])");
+ let caps = re.captures(text!("abx")).unwrap();
+ assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x"));
+}
+
+// See: https://github.com/rust-lang/regex/issues/153
+mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1)));
+mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
+
+// See: https://github.com/rust-lang/regex/issues/169
+mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
+
+// See: https://github.com/rust-lang/regex/issues/76
+#[cfg(all(feature = "unicode-case", feature = "unicode-gencat"))]
+mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
+
+// See: https://github.com/rust-lang/regex/issues/191
+mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));
+
+// burntsushi was bad and didn't create an issue for this bug.
+mat!(anchored_prefix1, r"^a[[:^space:]]", "a ", None);
+mat!(anchored_prefix2, r"^a[[:^space:]]", "foo boo a ", None);
+mat!(anchored_prefix3, r"^-[a-z]", "r-f", None);
+
+// See: https://github.com/rust-lang/regex/issues/204
+#[cfg(feature = "unicode-perl")]
+split!(
+ split_on_word_boundary,
+ r"\b",
+ r"Should this (work?)",
+ &[
+ t!(""),
+ t!("Should"),
+ t!(" "),
+ t!("this"),
+ t!(" ("),
+ t!("work"),
+ t!("?)")
+ ]
+);
+#[cfg(feature = "unicode-perl")]
+matiter!(
+ word_boundary_dfa,
+ r"\b",
+ "a b c",
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5)
+);
+
+// See: https://github.com/rust-lang/regex/issues/268
+matiter!(partial_anchor, r"^a|b", "ba", (0, 1));
+
+// See: https://github.com/rust-lang/regex/issues/280
+ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false);
+ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
+
+// See: https://github.com/rust-lang/regex/issues/289
+mat!(lits_unambiguous1, r"(ABC|CDA|BC)X", "CDAX", Some((0, 4)));
+
+// See: https://github.com/rust-lang/regex/issues/291
+mat!(
+ lits_unambiguous2,
+ r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
+ "CIMG2341",
+ Some((0, 8)),
+ Some((0, 4)),
+ None,
+ Some((0, 4)),
+ Some((4, 8))
+);
+
+// See: https://github.com/rust-lang/regex/issues/271
+mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
+mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4)));
+mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4)));
+#[cfg(feature = "unicode-perl")]
+mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1)));
+
+// See: https://github.com/rust-lang/regex/issues/321
+ismatch!(strange_anchor_non_complete_prefix, r"a^{2}", "", false);
+ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false);
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1203
+ismatch!(reverse_suffix1, r"[0-4][0-4][0-4]000", "153.230000", true);
+ismatch!(reverse_suffix2, r"[0-9][0-9][0-9]000", "153.230000\n", true);
+matiter!(reverse_suffix3, r"[0-9][0-9][0-9]000", "153.230000\n", (4, 10));
+
+// See: https://github.com/rust-lang/regex/issues/334
+// See: https://github.com/rust-lang/regex/issues/557
+mat!(
+ captures_after_dfa_premature_end1,
+ r"a(b*(X|$))?",
+ "abcbX",
+ Some((0, 1)),
+ None,
+ None
+);
+mat!(
+ captures_after_dfa_premature_end2,
+ r"a(bc*(X|$))?",
+ "abcbX",
+ Some((0, 1)),
+ None,
+ None
+);
+mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz", Some((0, 0)));
+
+// See: https://github.com/rust-lang/regex/issues/437
+ismatch!(
+ literal_panic,
+ r"typename type\-parameter\-[0-9]+\-[0-9]+::.+",
+ "test",
+ false
+);
+
+// See: https://github.com/rust-lang/regex/issues/533
+ismatch!(
+ blank_matches_nothing_between_space_and_tab,
+ r"[[:blank:]]",
+ "\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\
+ \u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
+ \u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
+ false
+);
+
+ismatch!(
+ inverted_blank_matches_everything_between_space_and_tab,
+ r"^[[:^blank:]]+$",
+ "\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\
+ \u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
+ \u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
+ true
+);
+
+// Tests that our Aho-Corasick optimization works correctly. It only
+// kicks in when we have >32 literals. By "works correctly," we mean that
+// leftmost-first match semantics are properly respected. That is, samwise
+// should match, not sam.
+mat!(
+ ahocorasick1,
+ "samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|\
+ A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z",
+ "samwise",
+ Some((0, 7))
+);
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1247
+#[test]
+#[cfg(feature = "unicode-perl")]
+fn regression_nfa_stops1() {
+ let re = ::regex::bytes::Regex::new(r"\bs(?:[ab])").unwrap();
+ assert_eq!(0, re.find_iter(b"s\xE4").count());
+}
+
+// See: https://github.com/rust-lang/regex/issues/640
+#[cfg(feature = "unicode-case")]
+matiter!(
+ flags_are_unset,
+ r"((?i)foo)|Bar",
+ "foo Foo bar Bar",
+ (0, 3),
+ (4, 7),
+ (12, 15)
+);
+
+// See: https://github.com/rust-lang/regex/issues/659
+//
+// Note that 'Ј' is not 'j', but Cyrillic Je
+// https://en.wikipedia.org/wiki/Je_(Cyrillic)
+ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
+matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
+
+// See: https://github.com/rust-lang/regex/issues/862
+mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));
diff --git a/third_party/rust/regex/tests/regression_fuzz.rs b/third_party/rust/regex/tests/regression_fuzz.rs
new file mode 100644
index 0000000000..4e76704d2a
--- /dev/null
+++ b/third_party/rust/regex/tests/regression_fuzz.rs
@@ -0,0 +1,31 @@
+// These tests are only run for the "default" test target because some of them
+// can take quite a long time. Some of them take long enough that it's not
+// practical to run them in debug mode. :-/
+
+// See: https://oss-fuzz.com/testcase-detail/5673225499181056
+//
+// Ignored by default since it takes too long in debug mode (almost a minute).
+#[test]
+#[ignore]
+fn fuzz1() {
+ regex!(r"1}{55}{0}*{1}{55}{55}{5}*{1}{55}+{56}|;**");
+}
+
+// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26505
+// See: https://github.com/rust-lang/regex/issues/722
+#[test]
+fn empty_any_errors_no_panic() {
+ assert!(regex_new!(r"\P{any}").is_err());
+}
+
+// This tests that a very large regex errors during compilation instead of
+// using gratuitous amounts of memory. The specific problem is that the
+// compiler wasn't accounting for the memory used by Unicode character classes
+// correctly.
+//
+// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579
+#[test]
+fn big_regex_fails_to_compile() {
+ let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}";
+ assert!(regex_new!(pat).is_err());
+}
diff --git a/third_party/rust/regex/tests/replace.rs b/third_party/rust/regex/tests/replace.rs
new file mode 100644
index 0000000000..d65be072ff
--- /dev/null
+++ b/third_party/rust/regex/tests/replace.rs
@@ -0,0 +1,248 @@
+macro_rules! replace(
+ ($name:ident, $which:ident, $re:expr,
+ $search:expr, $replace:expr, $result:expr) => (
+ #[test]
+ fn $name() {
+ let re = regex!($re);
+ assert_eq!(re.$which(text!($search), $replace), text!($result));
+ }
+ );
+);
+
+replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6");
+replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z");
+replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
+replace!(
+ groups,
+ replace,
+ r"(?-u)(\S+)\s+(\S+)",
+ "w1 w2",
+ t!("$2 $1"),
+ "w2 w1"
+);
+replace!(
+ double_dollar,
+ replace,
+ r"(?-u)(\S+)\s+(\S+)",
+ "w1 w2",
+ t!("$2 $$1"),
+ "w2 $1"
+);
+// replace!(adjacent_index, replace,
+// r"([^aeiouy])ies$", "skies", t!("$1y"), "sky");
+replace!(
+ named,
+ replace_all,
+ r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
+ "w1 w2 w3 w4",
+ t!("$last $first$space"),
+ "w2 w1 w4 w3"
+);
+replace!(
+ trim,
+ replace_all,
+ "^[ \t]+|[ \t]+$",
+ " \t trim me\t \t",
+ t!(""),
+ "trim me"
+);
+replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
+// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b");
+replace!(
+ simple_expand,
+ replace_all,
+ r"(?-u)(\w) (\w)",
+ "a b",
+ t!("$2 $1"),
+ "b a"
+);
+replace!(
+ literal_dollar1,
+ replace_all,
+ r"(?-u)(\w+) (\w+)",
+ "a b",
+ t!("$$1"),
+ "$1"
+);
+replace!(
+ literal_dollar2,
+ replace_all,
+ r"(?-u)(\w+) (\w+)",
+ "a b",
+ t!("$2 $$c $1"),
+ "b $c a"
+);
+replace!(
+ no_expand1,
+ replace,
+ r"(?-u)(\S+)\s+(\S+)",
+ "w1 w2",
+ no_expand!("$2 $1"),
+ "$2 $1"
+);
+replace!(
+ no_expand2,
+ replace,
+ r"(?-u)(\S+)\s+(\S+)",
+ "w1 w2",
+ no_expand!("$$1"),
+ "$$1"
+);
+use_!(Captures);
+replace!(
+ closure_returning_reference,
+ replace,
+ r"([0-9]+)",
+ "age: 26",
+ |captures: &Captures<'_>| {
+ match_text!(captures.get(1).unwrap())[0..1].to_owned()
+ },
+ "age: 2"
+);
+replace!(
+ closure_returning_value,
+ replace,
+ r"[0-9]+",
+ "age: 26",
+ |_captures: &Captures<'_>| t!("Z").to_owned(),
+ "age: Z"
+);
+
+// See https://github.com/rust-lang/regex/issues/314
+replace!(
+ match_at_start_replace_with_empty,
+ replace_all,
+ r"foo",
+ "foobar",
+ t!(""),
+ "bar"
+);
+
+// See https://github.com/rust-lang/regex/issues/393
+replace!(single_empty_match, replace, r"^", "bar", t!("foo"), "foobar");
+
+// See https://github.com/rust-lang/regex/issues/399
+replace!(
+ capture_longest_possible_name,
+ replace_all,
+ r"(.)",
+ "b",
+ t!("${1}a $1a"),
+ "ba "
+);
+
+replace!(
+ impl_string,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!("Z".to_string()),
+ "age: Z6"
+);
+replace!(
+ impl_string_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(&"Z".to_string()),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_borrowed,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(std::borrow::Cow::<'_, str>::Borrowed("Z")),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_borrowed_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(&std::borrow::Cow::<'_, str>::Borrowed("Z")),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_owned,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
+ "age: Z6"
+);
+replace!(
+ impl_cow_str_owned_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ t!(&std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
+ "age: Z6"
+);
+
+replace!(
+ impl_vec_u8,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(vec![b'Z']),
+ "age: Z6"
+);
+replace!(
+ impl_vec_u8_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(&vec![b'Z']),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_borrowed,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_borrowed_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(&std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_owned,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
+ "age: Z6"
+);
+replace!(
+ impl_cow_slice_owned_ref,
+ replace,
+ r"[0-9]",
+ "age: 26",
+ bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
+ "age: Z6"
+);
+
+#[test]
+fn replacen_no_captures() {
+ let re = regex!(r"[0-9]");
+ assert_eq!(
+ re.replacen(text!("age: 1234"), 2, t!("Z")),
+ text!("age: ZZ34")
+ );
+}
+
+#[test]
+fn replacen_with_captures() {
+ let re = regex!(r"([0-9])");
+ assert_eq!(
+ re.replacen(text!("age: 1234"), 2, t!("${1}Z")),
+ text!("age: 1Z2Z34")
+ );
+}
diff --git a/third_party/rust/regex/tests/searcher.rs b/third_party/rust/regex/tests/searcher.rs
new file mode 100644
index 0000000000..3779f54c31
--- /dev/null
+++ b/third_party/rust/regex/tests/searcher.rs
@@ -0,0 +1,95 @@
+macro_rules! searcher {
+ ($name:ident, $re:expr, $haystack:expr) => (
+ searcher!($name, $re, $haystack, vec vec![]);
+ );
+ ($name:ident, $re:expr, $haystack:expr, $($steps:expr,)*) => (
+ searcher!($name, $re, $haystack, vec vec![$($steps),*]);
+ );
+ ($name:ident, $re:expr, $haystack:expr, $($steps:expr),*) => (
+ searcher!($name, $re, $haystack, vec vec![$($steps),*]);
+ );
+ ($name:ident, $re:expr, $haystack:expr, vec $expect_steps:expr) => (
+ #[test]
+ #[allow(unused_imports)]
+ fn $name() {
+ searcher_expr! {{
+ use std::str::pattern::{Pattern, Searcher};
+ use std::str::pattern::SearchStep::{Match, Reject, Done};
+ let re = regex!($re);
+ let mut se = re.into_searcher($haystack);
+ let mut got_steps = vec![];
+ loop {
+ match se.next() {
+ Done => break,
+ step => { got_steps.push(step); }
+ }
+ }
+ assert_eq!(got_steps, $expect_steps);
+ }}
+ }
+ );
+}
+
+searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0));
+searcher!(
+ searcher_empty_regex,
+ r"",
+ "ab",
+ Match(0, 0),
+ Reject(0, 1),
+ Match(1, 1),
+ Reject(1, 2),
+ Match(2, 2)
+);
+searcher!(searcher_empty_haystack, r"\d", "");
+searcher!(searcher_one_match, r"\d", "5", Match(0, 1));
+searcher!(searcher_no_match, r"\d", "a", Reject(0, 1));
+searcher!(
+ searcher_two_adjacent_matches,
+ r"\d",
+ "56",
+ Match(0, 1),
+ Match(1, 2)
+);
+searcher!(
+ searcher_two_non_adjacent_matches,
+ r"\d",
+ "5a6",
+ Match(0, 1),
+ Reject(1, 2),
+ Match(2, 3)
+);
+searcher!(searcher_reject_first, r"\d", "a6", Reject(0, 1), Match(1, 2));
+searcher!(
+ searcher_one_zero_length_matches,
+ r"\d*",
+ "a1b2",
+ Match(0, 0), // ^
+ Reject(0, 1), // a
+ Match(1, 2), // a1
+ Reject(2, 3), // a1b
+ Match(3, 4), // a1b2
+);
+searcher!(
+ searcher_many_zero_length_matches,
+ r"\d*",
+ "a1bbb2",
+ Match(0, 0), // ^
+ Reject(0, 1), // a
+ Match(1, 2), // a1
+ Reject(2, 3), // a1b
+ Match(3, 3), // a1bb
+ Reject(3, 4), // a1bb
+ Match(4, 4), // a1bbb
+ Reject(4, 5), // a1bbb
+ Match(5, 6), // a1bbb2
+);
+searcher!(
+ searcher_unicode,
+ r".+?",
+ "Ⅰ1Ⅱ2",
+ Match(0, 3),
+ Match(3, 4),
+ Match(4, 7),
+ Match(7, 8)
+);
diff --git a/third_party/rust/regex/tests/set.rs b/third_party/rust/regex/tests/set.rs
new file mode 100644
index 0000000000..37fcf8700c
--- /dev/null
+++ b/third_party/rust/regex/tests/set.rs
@@ -0,0 +1,67 @@
+matset!(set1, &["a", "a"], "a", 0, 1);
+matset!(set2, &["a", "a"], "ba", 0, 1);
+matset!(set3, &["a", "b"], "a", 0);
+matset!(set4, &["a", "b"], "b", 1);
+matset!(set5, &["a|b", "b|a"], "b", 0, 1);
+matset!(set6, &["foo", "oo"], "foo", 0, 1);
+matset!(set7, &["^foo", "bar$"], "foo", 0);
+matset!(set8, &["^foo", "bar$"], "foo bar", 0, 1);
+matset!(set9, &["^foo", "bar$"], "bar", 1);
+matset!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1);
+matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1);
+matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1);
+matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1);
+matset!(set14, &[r".*", "a"], "zzzzzz", 0);
+matset!(set15, &[r"(?-u)\ba\b"], "hello a bye", 0);
+matset!(set16, &["a"], "a", 0);
+matset!(set17, &[".*a"], "a", 0);
+matset!(set18, &["a", "β"], "β", 1);
+
+// regexes that match the empty string
+matset!(setempty1, &["", "a"], "abc", 0, 1);
+matset!(setempty2, &["", "b"], "abc", 0, 1);
+matset!(setempty3, &["", "z"], "abc", 0);
+matset!(setempty4, &["a", ""], "abc", 0, 1);
+matset!(setempty5, &["b", ""], "abc", 0, 1);
+matset!(setempty6, &["z", ""], "abc", 1);
+matset!(setempty7, &["b", "(?:)"], "abc", 0, 1);
+matset!(setempty8, &["(?:)", "b"], "abc", 0, 1);
+matset!(setempty9, &["c(?:)", "b"], "abc", 0, 1);
+
+nomatset!(nset1, &["a", "a"], "b");
+nomatset!(nset2, &["^foo", "bar$"], "bar foo");
+nomatset!(
+ nset3,
+ {
+ let xs: &[&str] = &[];
+ xs
+ },
+ "a"
+);
+nomatset!(nset4, &[r"^rooted$", r"\.log$"], "notrooted");
+
+// See: https://github.com/rust-lang/regex/issues/187
+#[test]
+fn regression_subsequent_matches() {
+ let set = regex_set!(&["ab", "b"]);
+ let text = text!("ba");
+ assert!(set.matches(text).matched(1));
+ assert!(set.matches(text).matched(1));
+}
+
+#[test]
+fn get_set_patterns() {
+ let set = regex_set!(&["a", "b"]);
+ assert_eq!(vec!["a", "b"], set.patterns());
+}
+
+#[test]
+fn len_and_empty() {
+ let empty = regex_set!(&[""; 0]);
+ assert_eq!(empty.len(), 0);
+ assert!(empty.is_empty());
+
+ let not_empty = regex_set!(&["ab", "b"]);
+ assert_eq!(not_empty.len(), 2);
+ assert!(!not_empty.is_empty());
+}
diff --git a/third_party/rust/regex/tests/shortest_match.rs b/third_party/rust/regex/tests/shortest_match.rs
new file mode 100644
index 0000000000..f8b4fed156
--- /dev/null
+++ b/third_party/rust/regex/tests/shortest_match.rs
@@ -0,0 +1,14 @@
+macro_rules! shortmat {
+ ($name:ident, $re:expr, $text:expr, $shortest_match:expr) => {
+ #[test]
+ fn $name() {
+ let text = text!($text);
+ let re = regex!($re);
+ assert_eq!($shortest_match, re.shortest_match(text));
+ }
+ };
+}
+
+shortmat!(t01, r"a+", r"aa", Some(1));
+// Test that the reverse suffix optimization gets it right.
+shortmat!(t02, r".*(?:abcd)+", r"abcdabcd", Some(4));
diff --git a/third_party/rust/regex/tests/suffix_reverse.rs b/third_party/rust/regex/tests/suffix_reverse.rs
new file mode 100644
index 0000000000..774c9e85f0
--- /dev/null
+++ b/third_party/rust/regex/tests/suffix_reverse.rs
@@ -0,0 +1,6 @@
+mat!(t01, r".*abcd", r"abcd", Some((0, 4)));
+mat!(t02, r".*(?:abcd)+", r"abcd", Some((0, 4)));
+mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8)));
+mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9)));
+mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9)));
+mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9)));
diff --git a/third_party/rust/regex/tests/test_backtrack.rs b/third_party/rust/regex/tests/test_backtrack.rs
new file mode 100644
index 0000000000..fb934e2d8f
--- /dev/null
+++ b/third_party/rust/regex/tests/test_backtrack.rs
@@ -0,0 +1,56 @@
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new($re)
+ .bounded_backtracking()
+ .build()
+ .map(|e| e.into_regex())
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new_many($re)
+ .bounded_backtracking()
+ .build()
+ .map(|e| e.into_regex_set())
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_str.rs");
+include!("macros.rs");
+
+mod api;
+mod api_str;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod searcher;
+mod set;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
diff --git a/third_party/rust/regex/tests/test_backtrack_bytes.rs b/third_party/rust/regex/tests/test_backtrack_bytes.rs
new file mode 100644
index 0000000000..a59426c949
--- /dev/null
+++ b/third_party/rust/regex/tests/test_backtrack_bytes.rs
@@ -0,0 +1,55 @@
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new($re)
+ .bounded_backtracking()
+ .only_utf8(false)
+ .build()
+ .map(|e| e.into_byte_regex())
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new_many($re)
+ .bounded_backtracking()
+ .only_utf8(false)
+ .build()
+ .map(|e| e.into_byte_regex_set())
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_bytes.rs");
+include!("macros.rs");
+
+mod api;
+mod bytes;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod set;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_ascii;
diff --git a/third_party/rust/regex/tests/test_backtrack_utf8bytes.rs b/third_party/rust/regex/tests/test_backtrack_utf8bytes.rs
new file mode 100644
index 0000000000..6d308e9e1c
--- /dev/null
+++ b/third_party/rust/regex/tests/test_backtrack_utf8bytes.rs
@@ -0,0 +1,58 @@
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new($re)
+ .bounded_backtracking()
+ .bytes(true)
+ .build()
+ .map(|e| e.into_regex())
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new_many($re)
+ .bounded_backtracking()
+ .bytes(true)
+ .build()
+ .map(|e| e.into_regex_set())
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_str.rs");
+include!("macros.rs");
+
+mod api;
+mod api_str;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod searcher;
+mod set;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
diff --git a/third_party/rust/regex/tests/test_crates_regex.rs b/third_party/rust/regex/tests/test_crates_regex.rs
new file mode 100644
index 0000000000..a681604727
--- /dev/null
+++ b/third_party/rust/regex/tests/test_crates_regex.rs
@@ -0,0 +1,54 @@
+/*
+ * This test is a minimal version of <rofl_0> and <subdiff_0>
+ *
+ * Once this bug gets fixed, uncomment rofl_0 and subdiff_0
+ * (in `tests/crates_regex.rs`).
+#[test]
+fn word_boundary_backtracking_default_mismatch() {
+ use regex::internal::ExecBuilder;
+
+ let backtrack_re = ExecBuilder::new(r"\b")
+ .bounded_backtracking()
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let default_re = ExecBuilder::new(r"\b")
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let input = "䅅\\u{a0}";
+
+ let fi1 = backtrack_re.find_iter(input);
+ let fi2 = default_re.find_iter(input);
+ for (m1, m2) in fi1.zip(fi2) {
+ assert_eq!(m1, m2);
+ }
+}
+*/
+
+mod consistent;
+
+mod crates_regex {
+
+ macro_rules! consistent {
+ ($test_name:ident, $regex_src:expr) => {
+ #[test]
+ fn $test_name() {
+ use super::consistent::backends_are_consistent;
+
+ if option_env!("RUST_REGEX_RANDOM_TEST").is_some() {
+ match backends_are_consistent($regex_src) {
+ Ok(_) => {}
+ Err(err) => panic!("{}", err),
+ }
+ }
+ }
+ };
+ }
+
+ include!("crates_regex.rs");
+}
diff --git a/third_party/rust/regex/tests/test_default.rs b/third_party/rust/regex/tests/test_default.rs
new file mode 100644
index 0000000000..be627f7a68
--- /dev/null
+++ b/third_party/rust/regex/tests/test_default.rs
@@ -0,0 +1,222 @@
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
+use regex;
+
+// Due to macro scoping rules, this definition only applies for the modules
+// defined below. Effectively, it allows us to use the same tests for both
+// native and dynamic regexes.
+//
+// This is also used to test the various matching engines. This one exercises
+// the normal code path which automatically chooses the engine based on the
+// regex and the input. Other dynamic tests explicitly set the engine to use.
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::Regex;
+ Regex::new($re)
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::RegexSet;
+ RegexSet::new($re)
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_str.rs");
+include!("macros.rs");
+
+mod api;
+mod api_str;
+mod crazy;
+mod flags;
+mod fowler;
+mod misc;
+mod multiline;
+mod noparse;
+mod regression;
+mod regression_fuzz;
+mod replace;
+mod searcher;
+mod set;
+mod shortest_match;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
+
+#[test]
+fn disallow_non_utf8() {
+ assert!(regex::Regex::new(r"(?-u)\xFF").is_err());
+ assert!(regex::Regex::new(r"(?-u).").is_err());
+ assert!(regex::Regex::new(r"(?-u)[\xFF]").is_err());
+ assert!(regex::Regex::new(r"(?-u)☃").is_err());
+}
+
+#[test]
+fn disallow_octal() {
+ assert!(regex::Regex::new(r"\0").is_err());
+}
+
+#[test]
+fn allow_octal() {
+ assert!(regex::RegexBuilder::new(r"\0").octal(true).build().is_ok());
+}
+
+#[test]
+fn oibits() {
+ use regex::bytes;
+ use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ fn assert_send<T: Send>() {}
+ fn assert_sync<T: Sync>() {}
+ fn assert_unwind_safe<T: UnwindSafe>() {}
+ fn assert_ref_unwind_safe<T: RefUnwindSafe>() {}
+
+ assert_send::<Regex>();
+ assert_sync::<Regex>();
+ assert_unwind_safe::<Regex>();
+ assert_ref_unwind_safe::<Regex>();
+ assert_send::<RegexBuilder>();
+ assert_sync::<RegexBuilder>();
+ assert_unwind_safe::<RegexBuilder>();
+ assert_ref_unwind_safe::<RegexBuilder>();
+
+ assert_send::<bytes::Regex>();
+ assert_sync::<bytes::Regex>();
+ assert_unwind_safe::<bytes::Regex>();
+ assert_ref_unwind_safe::<bytes::Regex>();
+ assert_send::<bytes::RegexBuilder>();
+ assert_sync::<bytes::RegexBuilder>();
+ assert_unwind_safe::<bytes::RegexBuilder>();
+ assert_ref_unwind_safe::<bytes::RegexBuilder>();
+
+ assert_send::<RegexSet>();
+ assert_sync::<RegexSet>();
+ assert_unwind_safe::<RegexSet>();
+ assert_ref_unwind_safe::<RegexSet>();
+ assert_send::<RegexSetBuilder>();
+ assert_sync::<RegexSetBuilder>();
+ assert_unwind_safe::<RegexSetBuilder>();
+ assert_ref_unwind_safe::<RegexSetBuilder>();
+
+ assert_send::<bytes::RegexSet>();
+ assert_sync::<bytes::RegexSet>();
+ assert_unwind_safe::<bytes::RegexSet>();
+ assert_ref_unwind_safe::<bytes::RegexSet>();
+ assert_send::<bytes::RegexSetBuilder>();
+ assert_sync::<bytes::RegexSetBuilder>();
+ assert_unwind_safe::<bytes::RegexSetBuilder>();
+ assert_ref_unwind_safe::<bytes::RegexSetBuilder>();
+}
+
+// See: https://github.com/rust-lang/regex/issues/568
+#[test]
+fn oibits_regression() {
+ use regex::Regex;
+ use std::panic;
+
+ let _ = panic::catch_unwind(|| Regex::new("a").unwrap());
+}
+
+// See: https://github.com/rust-lang/regex/issues/750
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn regex_is_reasonably_small() {
+ use std::mem::size_of;
+
+ use regex::bytes;
+ use regex::{Regex, RegexSet};
+
+ assert_eq!(16, size_of::<Regex>());
+ assert_eq!(16, size_of::<RegexSet>());
+ assert_eq!(16, size_of::<bytes::Regex>());
+ assert_eq!(16, size_of::<bytes::RegexSet>());
+}
+
+// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+// See: CVE-2022-24713
+//
+// We test that our regex compiler will correctly return a "too big" error when
+// we try to use a very large repetition on an *empty* sub-expression.
+//
+// At the time this test was written, the regex compiler does not represent
+// empty sub-expressions with any bytecode instructions. In effect, it's an
+// "optimization" to leave them out, since they would otherwise correspond
+// to an unconditional JUMP in the regex bytecode (i.e., an unconditional
+// epsilon transition in the NFA graph). Therefore, an empty sub-expression
+// represents an interesting case for the compiler's size limits. Since it
+// doesn't actually contribute any additional memory to the compiled regex
+// instructions, the size limit machinery never detects it. Instead, it just
+// dumbly tries to compile the empty sub-expression N times, where N is the
+// repetition size.
+//
+// When N is very large, this will cause the compiler to essentially spin and
+// do nothing for a decently large amount of time. It causes the regex to take
+// quite a bit of time to compile, despite the concrete syntax of the regex
+// being quite small.
+//
+// The degree to which this is actually a problem is somewhat of a judgment
+// call. Some regexes simply take a long time to compile. But in general, you
+// should be able to reasonably control this by setting lower or higher size
+// limits on the compiled object size. But this mitigation doesn't work at all
+// for this case.
+//
+// This particular test is somewhat narrow. It merely checks that regex
+// compilation will, at some point, return a "too big" error. Before the
+// fix landed, this test would eventually fail because the regex would be
+// successfully compiled (after enough time elapsed). So while this test
+// doesn't check that we exit in a reasonable amount of time, it does at least
+// check that we are properly returning an error at some point.
+#[test]
+fn big_empty_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new("(?:){4294967295}");
+ assert!(result.is_err());
+}
+
+// Below is a "billion laughs" variant of the previous test case.
+#[test]
+fn big_empty_reps_chain_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}");
+ assert!(result.is_err());
+}
+
+// Below is another situation where a zero-length sub-expression can be
+// introduced.
+#[test]
+fn big_zero_reps_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new(r"x{0}{4294967295}");
+ assert!(result.is_err());
+}
+
+// Testing another case for completeness.
+#[test]
+fn empty_alt_regex_fails() {
+ use regex::Regex;
+
+ let result = Regex::new(r"(?:|){4294967295}");
+ assert!(result.is_err());
+}
diff --git a/third_party/rust/regex/tests/test_default_bytes.rs b/third_party/rust/regex/tests/test_default_bytes.rs
new file mode 100644
index 0000000000..f200596ba1
--- /dev/null
+++ b/third_party/rust/regex/tests/test_default_bytes.rs
@@ -0,0 +1,75 @@
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::bytes::Regex;
+ Regex::new($re)
+ }};
+}
+
+macro_rules! regex_set_new {
+ ($res:expr) => {{
+ use regex::bytes::RegexSet;
+ RegexSet::new($res)
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_bytes.rs");
+include!("macros.rs");
+
+// A silly wrapper to make it possible to write and match raw bytes.
+struct R<'a>(&'a [u8]);
+impl<'a> R<'a> {
+ fn as_bytes(&self) -> &'a [u8] {
+ self.0
+ }
+}
+
+// See: https://github.com/rust-lang/regex/issues/321
+//
+// These tests are here because they do not have the same behavior in every
+// regex engine.
+mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
+mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
+mat!(
+ invalid_utf8_nfa3,
+ r".",
+ R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
+ Some((1, 3))
+);
+mat!(
+ invalid_utf8_nfa4,
+ r"${2}ä",
+ R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
+ None
+);
+
+mod api;
+mod bytes;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod set;
+mod shortest_match;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
diff --git a/third_party/rust/regex/tests/test_nfa.rs b/third_party/rust/regex/tests/test_nfa.rs
new file mode 100644
index 0000000000..e5a67d180a
--- /dev/null
+++ b/third_party/rust/regex/tests/test_nfa.rs
@@ -0,0 +1,50 @@
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new($re).nfa().build().map(|e| e.into_regex())
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new_many($re).nfa().build().map(|e| e.into_regex_set())
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_str.rs");
+include!("macros.rs");
+
+mod api;
+mod api_str;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod searcher;
+mod set;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
diff --git a/third_party/rust/regex/tests/test_nfa_bytes.rs b/third_party/rust/regex/tests/test_nfa_bytes.rs
new file mode 100644
index 0000000000..0a10e032a2
--- /dev/null
+++ b/third_party/rust/regex/tests/test_nfa_bytes.rs
@@ -0,0 +1,55 @@
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new($re)
+ .nfa()
+ .only_utf8(false)
+ .build()
+ .map(|e| e.into_byte_regex())
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new_many($re)
+ .nfa()
+ .only_utf8(false)
+ .build()
+ .map(|e| e.into_byte_regex_set())
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_bytes.rs");
+include!("macros.rs");
+
+mod api;
+mod bytes;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod set;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
diff --git a/third_party/rust/regex/tests/test_nfa_utf8bytes.rs b/third_party/rust/regex/tests/test_nfa_utf8bytes.rs
new file mode 100644
index 0000000000..36a572b5fc
--- /dev/null
+++ b/third_party/rust/regex/tests/test_nfa_utf8bytes.rs
@@ -0,0 +1,54 @@
+#![cfg_attr(feature = "pattern", feature(pattern))]
+
+macro_rules! regex_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new($re).nfa().bytes(true).build().map(|e| e.into_regex())
+ }};
+}
+
+macro_rules! regex {
+ ($re:expr) => {
+ regex_new!($re).unwrap()
+ };
+}
+
+macro_rules! regex_set_new {
+ ($re:expr) => {{
+ use regex::internal::ExecBuilder;
+ ExecBuilder::new_many($re)
+ .nfa()
+ .bytes(true)
+ .build()
+ .map(|e| e.into_regex_set())
+ }};
+}
+
+macro_rules! regex_set {
+ ($res:expr) => {
+ regex_set_new!($res).unwrap()
+ };
+}
+
+// Must come before other module definitions.
+include!("macros_str.rs");
+include!("macros.rs");
+
+mod api;
+mod api_str;
+mod crazy;
+mod flags;
+mod fowler;
+mod multiline;
+mod noparse;
+mod regression;
+mod replace;
+mod searcher;
+mod set;
+mod suffix_reverse;
+#[cfg(feature = "unicode")]
+mod unicode;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary;
+#[cfg(feature = "unicode-perl")]
+mod word_boundary_unicode;
diff --git a/third_party/rust/regex/tests/unicode.rs b/third_party/rust/regex/tests/unicode.rs
new file mode 100644
index 0000000000..9b32286247
--- /dev/null
+++ b/third_party/rust/regex/tests/unicode.rs
@@ -0,0 +1,251 @@
+mat!(uni_literal, r"☃", "☃", Some((0, 3)));
+mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3)));
+mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3)));
+mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3)));
+mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
+mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
+mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
+mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
+mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
+mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
+
+// Test the Unicode friendliness of Perl character classes.
+mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
+mat!(uni_perl_w_not, r"\w+", "⥡", None);
+mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
+mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
+mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
+mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
+mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
+mat!(uni_perl_s_not, r"\s+", "☃", None);
+mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));
+
+// And do the same for word boundaries.
+mat!(uni_boundary_none, r"\d\b", "6δ", None);
+mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
+mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
+mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
+
+// Test general categories.
+//
+// We should test more, but there's a lot. Write a script to generate more of
+// these tests.
+mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "Ａ", Some((0, 3)));
+mat!(
+ uni_class_gencat_close_punctuation,
+ r"\p{Close_Punctuation}",
+ "❯",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_connector_punctuation,
+ r"\p{Connector_Punctuation}",
+ "⁀",
+ Some((0, 3))
+);
+mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2)));
+mat!(
+ uni_class_gencat_currency_symbol,
+ r"\p{Currency_Symbol}",
+ "￡",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_dash_punctuation,
+ r"\p{Dash_Punctuation}",
+ "〰",
+ Some((0, 3))
+);
+mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "𑓙", Some((0, 4)));
+mat!(
+ uni_class_gencat_enclosing_mark,
+ r"\p{Enclosing_Mark}",
+ "\u{A672}",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_final_punctuation,
+ r"\p{Final_Punctuation}",
+ "⸡",
+ Some((0, 3))
+);
+mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
+// See: https://github.com/rust-lang/regex/issues/719
+mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
+mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
+mat!(
+ uni_class_gencat_initial_punctuation,
+ r"\p{Initial_Punctuation}",
+ "⸜",
+ Some((0, 3))
+);
+mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2)));
+mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3)));
+mat!(
+ uni_class_gencat_line_separator,
+ r"\p{Line_Separator}",
+ "\u{2028}",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_lowercase_letter,
+ r"\p{Lowercase_Letter}",
+ "ϛ",
+ Some((0, 2))
+);
+mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4)));
+mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3)));
+mat!(
+ uni_class_gencat_modifier_letter,
+ r"\p{Modifier_Letter}",
+ "𖭃",
+ Some((0, 4))
+);
+mat!(
+ uni_class_gencat_modifier_symbol,
+ r"\p{Modifier_Symbol}",
+ "🏿",
+ Some((0, 4))
+);
+mat!(
+ uni_class_gencat_nonspacing_mark,
+ r"\p{Nonspacing_Mark}",
+ "\u{1E94A}",
+ Some((0, 4))
+);
+mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3)));
+mat!(
+ uni_class_gencat_open_punctuation,
+ r"\p{Open_Punctuation}",
+ "⦅",
+ Some((0, 3))
+);
+mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3)));
+mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3)));
+mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3)));
+mat!(
+ uni_class_gencat_other_punctuation,
+ r"\p{Other_Punctuation}",
+ "𞥞",
+ Some((0, 4))
+);
+mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3)));
+mat!(
+ uni_class_gencat_paragraph_separator,
+ r"\p{Paragraph_Separator}",
+ "\u{2029}",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_private_use,
+ r"\p{Private_Use}",
+ "\u{10FFFD}",
+ Some((0, 4))
+);
+mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "𑁍", Some((0, 4)));
+mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3)));
+mat!(
+ uni_class_gencat_space_separator,
+ r"\p{Space_Separator}",
+ "\u{205F}",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_spacing_mark,
+ r"\p{Spacing_Mark}",
+ "\u{16F7E}",
+ Some((0, 4))
+);
+mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3)));
+mat!(
+ uni_class_gencat_titlecase_letter,
+ r"\p{Titlecase_Letter}",
+ "ῼ",
+ Some((0, 3))
+);
+mat!(
+ uni_class_gencat_unassigned,
+ r"\p{Unassigned}",
+ "\u{10FFFF}",
+ Some((0, 4))
+);
+mat!(
+ uni_class_gencat_uppercase_letter,
+ r"\p{Uppercase_Letter}",
+ "Ꝋ",
+ Some((0, 3))
+);
+
+// Test a smattering of properties.
+mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3)));
+mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4)));
+mat!(
+ uni_class_prop_picto1,
+ r"\p{extendedpictographic}",
+ "\u{1FA6E}",
+ Some((0, 4))
+);
+mat!(
+ uni_class_prop_picto2,
+ r"\p{extendedpictographic}",
+ "\u{1FFFD}",
+ Some((0, 4))
+);
+
+// grapheme_cluster_break
+mat!(
+ uni_class_gcb_prepend,
+ r"\p{grapheme_cluster_break=prepend}",
+ "\u{11D46}",
+ Some((0, 4))
+);
+mat!(
+ uni_class_gcb_ri1,
+ r"\p{gcb=regional_indicator}",
+ "\u{1F1E6}",
+ Some((0, 4))
+);
+mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
+mat!(
+ uni_class_gcb_ri3,
+ r"\p{gcb=regionalindicator}",
+ "\u{1F1FF}",
+ Some((0, 4))
+);
+mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
+mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));
+
+// word_break
+mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3)));
+mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3)));
+mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3)));
+mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3)));
+mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4)));
+
+// sentence_break
+mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2)));
+mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2)));
+mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3)));
+mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4)));
+mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3)));
+
+// Test 'Vithkuqi' support, which was added in Unicode 14.
+// See: https://github.com/rust-lang/regex/issues/877
+mat!(
+ uni_vithkuqi_literal_upper,
+ r"(?i)^\u{10570}$",
+ "\u{10570}",
+ Some((0, 4))
+);
+mat!(
+ uni_vithkuqi_literal_lower,
+ r"(?i)^\u{10570}$",
+ "\u{10597}",
+ Some((0, 4))
+);
+mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4)));
+mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4)));
diff --git a/third_party/rust/regex/tests/word_boundary.rs b/third_party/rust/regex/tests/word_boundary.rs
new file mode 100644
index 0000000000..7fe97a2974
--- /dev/null
+++ b/third_party/rust/regex/tests/word_boundary.rs
@@ -0,0 +1,89 @@
+// Many of these are cribbed from RE2's test suite.
+
+matiter!(wb1, r"\b", "");
+matiter!(wb2, r"\b", "a", (0, 0), (1, 1));
+matiter!(wb3, r"\b", "ab", (0, 0), (2, 2));
+matiter!(wb4, r"^\b", "ab", (0, 0));
+matiter!(wb5, r"\b$", "ab", (2, 2));
+matiter!(wb6, r"^\b$", "ab");
+matiter!(wb7, r"\bbar\b", "nobar bar foo bar", (6, 9), (14, 17));
+matiter!(wb8, r"a\b", "faoa x", (3, 4));
+matiter!(wb9, r"\bbar", "bar x", (0, 3));
+matiter!(wb10, r"\bbar", "foo\nbar x", (4, 7));
+matiter!(wb11, r"bar\b", "foobar", (3, 6));
+matiter!(wb12, r"bar\b", "foobar\nxxx", (3, 6));
+matiter!(wb13, r"(foo|bar|[A-Z])\b", "foo", (0, 3));
+matiter!(wb14, r"(foo|bar|[A-Z])\b", "foo\n", (0, 3));
+matiter!(wb15, r"\b(foo|bar|[A-Z])", "foo", (0, 3));
+matiter!(wb16, r"\b(foo|bar|[A-Z])\b", "X", (0, 1));
+matiter!(wb17, r"\b(foo|bar|[A-Z])\b", "XY");
+matiter!(wb18, r"\b(foo|bar|[A-Z])\b", "bar", (0, 3));
+matiter!(wb19, r"\b(foo|bar|[A-Z])\b", "foo", (0, 3));
+matiter!(wb20, r"\b(foo|bar|[A-Z])\b", "foo\n", (0, 3));
+matiter!(wb21, r"\b(foo|bar|[A-Z])\b", "ffoo bbar N x", (10, 11));
+matiter!(wb22, r"\b(fo|foo)\b", "fo", (0, 2));
+matiter!(wb23, r"\b(fo|foo)\b", "foo", (0, 3));
+matiter!(wb24, r"\b\b", "");
+matiter!(wb25, r"\b\b", "a", (0, 0), (1, 1));
+matiter!(wb26, r"\b$", "");
+matiter!(wb27, r"\b$", "x", (1, 1));
+matiter!(wb28, r"\b$", "y x", (3, 3));
+matiter!(wb29, r"\b.$", "x", (0, 1));
+matiter!(wb30, r"^\b(fo|foo)\b", "fo", (0, 2));
+matiter!(wb31, r"^\b(fo|foo)\b", "foo", (0, 3));
+matiter!(wb32, r"^\b$", "");
+matiter!(wb33, r"^\b$", "x");
+matiter!(wb34, r"^\b.$", "x", (0, 1));
+matiter!(wb35, r"^\b.\b$", "x", (0, 1));
+matiter!(wb36, r"^^^^^\b$$$$$", "");
+matiter!(wb37, r"^^^^^\b.$$$$$", "x", (0, 1));
+matiter!(wb38, r"^^^^^\b$$$$$", "x");
+matiter!(wb39, r"^^^^^\b\b\b.\b\b\b$$$$$", "x", (0, 1));
+matiter!(wb40, r"\b.+\b", "$$abc$$", (2, 5));
+matiter!(wb41, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+
+matiter!(nb1, r"\Bfoo\B", "n foo xfoox that", (7, 10));
+matiter!(nb2, r"a\B", "faoa x", (1, 2));
+matiter!(nb3, r"\Bbar", "bar x");
+matiter!(nb4, r"\Bbar", "foo\nbar x");
+matiter!(nb5, r"bar\B", "foobar");
+matiter!(nb6, r"bar\B", "foobar\nxxx");
+matiter!(nb7, r"(foo|bar|[A-Z])\B", "foox", (0, 3));
+matiter!(nb8, r"(foo|bar|[A-Z])\B", "foo\n");
+matiter!(nb9, r"\B", "", (0, 0));
+matiter!(nb10, r"\B", "x");
+matiter!(nb11, r"\B(foo|bar|[A-Z])", "foo");
+matiter!(nb12, r"\B(foo|bar|[A-Z])\B", "xXy", (1, 2));
+matiter!(nb13, r"\B(foo|bar|[A-Z])\B", "XY");
+matiter!(nb14, r"\B(foo|bar|[A-Z])\B", "XYZ", (1, 2));
+matiter!(nb15, r"\B(foo|bar|[A-Z])\B", "abara", (1, 4));
+matiter!(nb16, r"\B(foo|bar|[A-Z])\B", "xfoo_", (1, 4));
+matiter!(nb17, r"\B(foo|bar|[A-Z])\B", "xfoo\n");
+matiter!(nb18, r"\B(foo|bar|[A-Z])\B", "foo bar vNX", (9, 10));
+matiter!(nb19, r"\B(fo|foo)\B", "xfoo", (1, 3));
+matiter!(nb20, r"\B(foo|fo)\B", "xfooo", (1, 4));
+matiter!(nb21, r"\B\B", "", (0, 0));
+matiter!(nb22, r"\B\B", "x");
+matiter!(nb23, r"\B$", "", (0, 0));
+matiter!(nb24, r"\B$", "x");
+matiter!(nb25, r"\B$", "y x");
+matiter!(nb26, r"\B.$", "x");
+matiter!(nb27, r"^\B(fo|foo)\B", "fo");
+matiter!(nb28, r"^\B(fo|foo)\B", "foo");
+matiter!(nb29, r"^\B", "", (0, 0));
+matiter!(nb30, r"^\B", "x");
+matiter!(nb31, r"^\B\B", "", (0, 0));
+matiter!(nb32, r"^\B\B", "x");
+matiter!(nb33, r"^\B$", "", (0, 0));
+matiter!(nb34, r"^\B$", "x");
+matiter!(nb35, r"^\B.$", "x");
+matiter!(nb36, r"^\B.\B$", "x");
+matiter!(nb37, r"^^^^^\B$$$$$", "", (0, 0));
+matiter!(nb38, r"^^^^^\B.$$$$$", "x");
+matiter!(nb39, r"^^^^^\B$$$$$", "x");
+
+// These work for both Unicode and ASCII because all matches are reported as
+// byte offsets, and « and » do not correspond to word boundaries at either
+// the character or byte level.
+matiter!(unicode1, r"\bx\b", "«x", (2, 3));
+matiter!(unicode2, r"\bx\b", "x»", (0, 1));
diff --git a/third_party/rust/regex/tests/word_boundary_ascii.rs b/third_party/rust/regex/tests/word_boundary_ascii.rs
new file mode 100644
index 0000000000..5a3cf1166c
--- /dev/null
+++ b/third_party/rust/regex/tests/word_boundary_ascii.rs
@@ -0,0 +1,9 @@
+// ASCII word boundaries are completely oblivious to Unicode characters.
+// For Unicode word boundaries, the tests are precisely inverted.
+matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
+matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
+
+// We still get Unicode word boundaries by default in byte regexes.
+matiter!(unicode1, r"\bx\b", "áxβ");
+matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
diff --git a/third_party/rust/regex/tests/word_boundary_unicode.rs b/third_party/rust/regex/tests/word_boundary_unicode.rs
new file mode 100644
index 0000000000..c41355ffc4
--- /dev/null
+++ b/third_party/rust/regex/tests/word_boundary_unicode.rs
@@ -0,0 +1,6 @@
+// Unicode word boundaries know about Unicode characters.
+// For ASCII word boundaries, the tests are precisely inverted.
+matiter!(unicode1, r"\bx\b", "áxβ");
+matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
+
+matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));